<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: pyspark SQL cannot resolve 'explode()' due to data type mismatch in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/pyspark-sql-cannot-resolve-explode-due-to-data-type-mismatch/m-p/20876#M14137</link>
    <description>&lt;P&gt;It's on line 10&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;sql_script = "select create_date, item['_id'], item['_VALUE'] from my_data lateral view explode(items.item) t as item"&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Sun, 29 May 2022 00:13:07 GMT</pubDate>
    <dc:creator>KevinXu</dc:creator>
    <dc:date>2022-05-29T00:13:07Z</dc:date>
    <item>
      <title>pyspark SQL cannot resolve 'explode()' due to data type mismatch</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-sql-cannot-resolve-explode-due-to-data-type-mismatch/m-p/20874#M14135</link>
      <description>&lt;P&gt;I am running a PySpark script and get the following error depending on which XML file I query:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;cannot resolve 'explode(...)' due to data type mismatch&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The PySpark code:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql import SparkSession
&amp;nbsp;
JOB_NAME = "Complex file to delimeted files transformer"
&amp;nbsp;
spark = SparkSession.builder.appName(JOB_NAME)\
    .config("spark.scheduler.mode", "FAIR")\
    .config('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.12.0')\
    .getOrCreate()
&amp;nbsp;
sql_script = "select create_date, item['_id'], item['_VALUE'] from my_data lateral view explode(items.item) t as item"
&amp;nbsp;
# works fine
read_options = {"rowTag": "my_data"}
df = spark.read\
    .format("xml")\
    .options(**read_options)\
    .load("./xml")
df.createOrReplaceTempView("my_data")
spark.sql(sql_script).show()
&amp;nbsp;
# Error
df2 = spark.read\
    .format("xml")\
    .options(**read_options)\
    .load("./xml/test2.xml")
df2.createOrReplaceTempView("my_data")
spark.sql(sql_script).show()&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The XML files are in the xml folder.&lt;/P&gt;&lt;P&gt;test1.xml:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;&amp;lt;my_data&amp;gt;&amp;lt;create_date&amp;gt;2021-05-01&amp;lt;/create_date&amp;gt;&amp;lt;items&amp;gt;&amp;lt;item id="1"&amp;gt;item 1&amp;lt;/item&amp;gt;&amp;lt;item id="2"&amp;gt;item 2&amp;lt;/item&amp;gt;&amp;lt;/items&amp;gt;
&amp;lt;/my_data&amp;gt;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;test2.xml:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;&amp;lt;my_data&amp;gt;&amp;lt;create_date&amp;gt;2021-06-01&amp;lt;/create_date&amp;gt;&amp;lt;items&amp;gt;&amp;lt;item id="3"&amp;gt;item 3&amp;lt;/item&amp;gt;&amp;lt;/items&amp;gt;
&amp;lt;/my_data&amp;gt;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Expected result: the same SQL statement should work every time and should not fail when a run happens to have only one&amp;nbsp;&amp;lt;item&amp;gt; in&amp;nbsp;&amp;lt;items&amp;gt;.&lt;/P&gt;</description>
      <pubDate>Wed, 11 May 2022 12:54:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-sql-cannot-resolve-explode-due-to-data-type-mismatch/m-p/20874#M14135</guid>
      <dc:creator>KevinXu</dc:creator>
      <dc:date>2022-05-11T12:54:07Z</dc:date>
    </item>
    <item>
      <title>Re: pyspark SQL cannot resolve 'explode()' due to data type mismatch</title>
      <link>https://community.databricks.com/t5/data-engineering/pyspark-sql-cannot-resolve-explode-due-to-data-type-mismatch/m-p/20876#M14137</link>
      <description>&lt;P&gt;It's on line 10&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;sql_script = "select create_date, item['_id'], item['_VALUE'] from my_data lateral view explode(items.item) t as item"&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 29 May 2022 00:13:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pyspark-sql-cannot-resolve-explode-due-to-data-type-mismatch/m-p/20876#M14137</guid>
      <dc:creator>KevinXu</dc:creator>
      <dc:date>2022-05-29T00:13:07Z</dc:date>
    </item>
  </channel>
</rss>

