<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Pyspark read multiple Parquet type expansion failure in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7238#M3159</link>
    <description>&lt;P&gt;Hi @Erik Louie​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Help us build a vibrant and resourceful community by recognizing and highlighting insightful contributions. Mark the best answers and show your appreciation!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards&lt;/P&gt;</description>
    <pubDate>Thu, 30 Mar 2023 04:45:38 GMT</pubDate>
    <dc:creator>Anonymous</dc:creator>
    <dc:date>2023-03-30T04:45:38Z</dc:date>
    <item>
      <title>Pyspark read multiple Parquet type expansion failure</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7236#M3157</link>
      <description>&lt;P&gt;&lt;B&gt;Problem&lt;/B&gt;&lt;/P&gt;&lt;P&gt;Reading a directory of nearly equivalent parquet tables fails when column X has type float in some files and type double in others.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;Attempts at resolving&lt;/B&gt;&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Using streaming files&lt;/LI&gt;&lt;LI&gt;Removing delta caching, vectorization&lt;/LI&gt;&lt;LI&gt;Using .cache() explicitly&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;Notes&lt;/B&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://issues.apache.org/jira/projects/SPARK/issues/SPARK-40876" alt="https://issues.apache.org/jira/projects/SPARK/issues/SPARK-40876" target="_blank"&gt;This is a known problem&lt;/A&gt;, but I need a workaround.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;Example code&lt;/B&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;(spark.read.option("mergeSchema", False)
    .option("spark.databricks.io.cache.enabled", False)
    .parquet(
        f"s3://my-bucket/data/*"
    )
    .write.mode("append").saveAsTable("my_table"))&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 22 Mar 2023 21:03:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7236#M3157</guid>
      <dc:creator>Erik_L</dc:creator>
      <dc:date>2023-03-22T21:03:21Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark read multiple Parquet type expansion failure</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7237#M3158</link>
      <description>&lt;P&gt;After many, many hours of trying to resolve this, I figured out a hack that _solves_ the problem, but it's not optimal. I basically read the directory listing of files, read each file separately, merge them via unions, and then save the result.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;my_schema = StructType([
    StructField("ordered", StringType()),
    StructField("by", TimestampType()),
    StructField("schema", LongType()),
    StructField("provided", DoubleType()),
])
df = spark.createDataFrame(data=[], schema=my_schema)
&amp;nbsp;
# ...
        for table_file in table_files:
            df = df.union(
                spark.read.option("mergeSchema", False)
                .option("spark.databricks.io.cache.enabled", False)
                .parquet(
                    f"s3://my-bucket/data/{table_file}"
                )
                # Transformations
                .select('ordered', 'by', 'schema', 'provided')
            )&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 22 Mar 2023 21:34:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7237#M3158</guid>
      <dc:creator>Erik_L</dc:creator>
      <dc:date>2023-03-22T21:34:31Z</dc:date>
    </item>
    <item>
      <title>Re: Pyspark read multiple Parquet type expansion failure</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7238#M3159</link>
      <description>&lt;P&gt;Hi @Erik Louie​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Help us build a vibrant and resourceful community by recognizing and highlighting insightful contributions. Mark the best answers and show your appreciation!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards&lt;/P&gt;</description>
      <pubDate>Thu, 30 Mar 2023 04:45:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-read-multiple-parquet-type-expansion-failure/m-p/7238#M3159</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-03-30T04:45:38Z</dc:date>
    </item>
  </channel>
</rss>

