import os

import pandas as pd
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import col
# Demo script: build a small pandas DataFrame of string codes, convert it to
# a Spark DataFrame, and write it out as a GBK-encoded CSV with a header row.
# `spark`, `base_path` and `stg_dir` are not defined in this snippet; they are
# expected to be provided by the surrounding session/notebook.
save_path = os.path.join(base_path, stg_dir, "testCsvEncoding")

# The codes are written as Python strings so that leading zeros
# (e.g. "00034321") are part of the value.
d = [{"code": "00034321"}, {"code": "55964445226"}]
df = pd.DataFrame(d)
spark_df = spark.createDataFrame(df)
# NOTE(review): display() is presumably the notebook's table renderer
# (e.g. Databricks) -- it is not defined in this snippet.
spark_df.display()

# Explicitly cast the column to Spark's string type before writing.
field = "code"
spark_df = spark_df.withColumn(field, col(field).cast(StringType()))
spark_df.display()

# coalesce(1) collapses the data to a single partition before the write;
# mode("overwrite") replaces whatever already exists at save_path.
(
    spark_df.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .option("encoding", "gbk")
    .save(save_path)
)
# NOTE(review): CSV is plain text and carries no column types, so the cast
# above cannot be recorded in the file. If "00034321" shows up as 34321,
# that is presumably the application opening the file (e.g. a spreadsheet)
# re-inferring the column as numeric -- confirm by viewing the raw file in a
# text editor, and import the column as text in the viewer if so.
The example code is shown above. I am trying to convert a pandas DataFrame to a PySpark DataFrame and then save the data to a CSV file, but the string field always ends up being converted to a number field in the CSV file, as shown in the attachment. Can anyone help?