from pyspark import SparkContext
from pyspark.sql import SQLContext
from functools import reduce
import pyspark.sql.functions as F
# Obtain (or reuse) the Spark context and a SQL entry point.
# NOTE(review): SQLContext is deprecated since Spark 2.0 — prefer
# SparkSession.builder.getOrCreate() when this script is next touched.
sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

# Sample (customerid, timestamp-string) rows; deliberately unordered to
# exercise downstream date logic per customer.
input_list = [
    (1, "2019-11-07 10:30:00"),
    (1, "2019-11-08 10:30:00"),
    (1, "2019-11-09 10:30:00"),
    (1, "2019-11-11 10:30:00"),
    (1, "2019-11-12 10:30:00"),
    (1, "2019-11-13 10:30:00"),
    (1, "2019-11-14 10:30:00"),
    (2, "2019-11-08 10:30:00"),
    (2, "2019-11-09 10:30:00"),
    (3, "2019-11-09 10:30:00"),
    (3, "2019-11-10 10:30:00"),
    (3, "2019-11-11 10:30:00"),
    (2, "2019-11-15 10:30:00"),
    (2, "2019-11-18 10:30:00"),
    (4, "2019-11-10 10:30:00"),
    (4, "2019-11-11 10:30:00"),
]

# Build the DataFrame and parse the string column into a proper timestamp.
sparkDF = sql.createDataFrame(input_list, ['customerid', 'date'])
sparkDF = sparkDF.withColumn(
    'date_timestamp',
    F.to_timestamp(F.col('date'), 'yyyy-MM-dd HH:mm:ss'),
)
# BUG FIX: the original fused this call onto the end of the withColumn
# assignment line, which is a Python syntax error. It must be its own statement.
sparkDF.show()