<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Autoloader inferring struct as a string when reading json data in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110782#M43685</link>
    <description>&lt;P&gt;Hi Everyone,&lt;/P&gt;&lt;P&gt;Trying to read JSON files with autoloader is failing to infer the schema correctly, every nested or struct column is being inferred as a string.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.readStream.format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaLocation", CHECKPOINT_PATH)
 .option("multiLine", True)
 .load(f"/Volumes/{CATALOG}/{SCHEMA}/files/")&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;When I read the same files normally with Spark, it is actually able to infer the schema correctly.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.read.format("json").option("multiline", True).load(f"/Volumes/{CATALOG}/{SCHEMA}/files/")&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I've also deleted the checkpoint to see if that was causing the problem, but the result is still the same.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;Here are the schemas to compare:&lt;/P&gt;&lt;P&gt;Autoloader:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;root
 |-- changelog: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- issue_event_type_name: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user: string (nullable = true)
 |-- webhookEvent: string (nullable = true)
 |-- project: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- issue_id: string (nullable = true)
 |-- _rescued_data: string (nullable = true)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Spark Normal:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;root
 |-- changelog: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- field: string (nullable = true)
 |    |    |    |-- fieldId: string (nullable = true)
 |    |    |    |-- fieldtype: string (nullable = true)
 |    |    |    |-- from: string (nullable = true)
 |    |    |    |-- fromString: string (nullable = true)
 |    |    |    |-- tmpFromAccountId: string (nullable = true)
 |    |    |    |-- tmpToAccountId: string (nullable = true)
 |    |    |    |-- to: string (nullable = true)
 |    |    |    |-- toString: string (nullable = true)
 |-- issue: struct (nullable = true)
 |    |-- fields: struct (nullable = true)
 |    |    |-- assignee: string (nullable = true)
 |    |    |-- attachment: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- components: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- creator: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- accountType: string (nullable = true)
 |    |    |    |-- active: boolean (nullable = true)
 |    |    |    |-- avatarUrls: struct (nullable = true)
 |    |    |    |    |-- 16x16: string (nullable = true)
...&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 20 Feb 2025 17:27:18 GMT</pubDate>
    <dc:creator>robertomatus</dc:creator>
    <dc:date>2025-02-20T17:27:18Z</dc:date>
    <item>
      <title>Autoloader inferring struct as a string when reading json data</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110782#M43685</link>
      <description>&lt;P&gt;Hi Everyone,&lt;/P&gt;&lt;P&gt;Trying to read JSON files with autoloader is failing to infer the schema correctly, every nested or struct column is being inferred as a string.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.readStream.format("cloudFiles")
 .option("cloudFiles.format", "json")
 .option("cloudFiles.schemaLocation", CHECKPOINT_PATH)
 .option("multiLine", True)
 .load(f"/Volumes/{CATALOG}/{SCHEMA}/files/")&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;When I read the same files normally with Spark, it is actually able to infer the schema correctly.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;spark.read.format("json").option("multiline", True).load(f"/Volumes/{CATALOG}/{SCHEMA}/files/")&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I've also deleted the checkpoint to see if that was causing the problem, but the result is still the same.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;Here are the schemas to compare:&lt;/P&gt;&lt;P&gt;Autoloader:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;root
 |-- changelog: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- issue_event_type_name: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- user: string (nullable = true)
 |-- webhookEvent: string (nullable = true)
 |-- project: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- issue_id: string (nullable = true)
 |-- _rescued_data: string (nullable = true)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Spark Normal:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;root
 |-- changelog: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- items: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- field: string (nullable = true)
 |    |    |    |-- fieldId: string (nullable = true)
 |    |    |    |-- fieldtype: string (nullable = true)
 |    |    |    |-- from: string (nullable = true)
 |    |    |    |-- fromString: string (nullable = true)
 |    |    |    |-- tmpFromAccountId: string (nullable = true)
 |    |    |    |-- tmpToAccountId: string (nullable = true)
 |    |    |    |-- to: string (nullable = true)
 |    |    |    |-- toString: string (nullable = true)
 |-- issue: struct (nullable = true)
 |    |-- fields: struct (nullable = true)
 |    |    |-- assignee: string (nullable = true)
 |    |    |-- attachment: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- components: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- created: string (nullable = true)
 |    |    |-- creator: struct (nullable = true)
 |    |    |    |-- accountId: string (nullable = true)
 |    |    |    |-- accountType: string (nullable = true)
 |    |    |    |-- active: boolean (nullable = true)
 |    |    |    |-- avatarUrls: struct (nullable = true)
 |    |    |    |    |-- 16x16: string (nullable = true)
...&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Feb 2025 17:27:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110782#M43685</guid>
      <dc:creator>robertomatus</dc:creator>
      <dc:date>2025-02-20T17:27:18Z</dc:date>
    </item>
    <item>
      <title>Re: Autoloader inferring struct as a string when reading json data</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110811#M43697</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/126407"&gt;@robertomatus&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;As per my understanding,&amp;nbsp;it looks like Auto Loader isn't inferring nested structures correctly, likely because of how it handles schema inference differently from spark.read.json().&lt;/P&gt;&lt;P&gt;You can try explicitly defining the schema using .schema() to ensure it recognizes structs properly. Enabling column type inference with .option("cloudFiles.inferColumnTypes", "true") might help, and if your schema changes over time, so might setting the schema evolution mode with .option("cloudFiles.schemaEvolutionMode", "rescue").&lt;/P&gt;&lt;P&gt;Alternatively, pre-processing your JSON files with spark.read.json() before using Auto Loader could ensure the correct structure.&lt;/P&gt;&lt;P&gt;Hope this helps! Let me know if you need more details.&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Brahma&lt;/P&gt;</description>
      <pubDate>Fri, 21 Feb 2025 03:55:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110811#M43697</guid>
      <dc:creator>Brahmareddy</dc:creator>
      <dc:date>2025-02-21T03:55:23Z</dc:date>
    </item>
    <item>
      <title>Re: Autoloader inferring struct as a string when reading json data</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110823#M43704</link>
      <description>&lt;P&gt;Hi &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102548"&gt;@Brahmareddy&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Thank you for your answer.&amp;nbsp; I found some ways of getting the schema from spark.read.json() and then giving it to the autoloader, which works, but it would be better if we didn't have to find these types of workarounds.&lt;/P&gt;&lt;P&gt;If autoloader is basically just Spark streaming, why do they infer the schema differently?&lt;/P&gt;</description>
      <pubDate>Fri, 21 Feb 2025 09:01:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110823#M43704</guid>
      <dc:creator>robertomatus</dc:creator>
      <dc:date>2025-02-21T09:01:45Z</dc:date>
    </item>
    <item>
      <title>Re: Autoloader inferring struct as a string when reading json data</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110885#M43727</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/126407"&gt;@robertomatus&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;You're right—it would be much better if we didn’t have to rely on workarounds. The reason AutoLoader infers schema differently from spark.read.json() is that it's optimized for streaming large-scale data efficiently. Unlike spark.read.json(), which scans all files, AutoLoader samples data to infer schema faster and supports incremental schema evolution for handling new columns over time.&lt;/P&gt;&lt;P&gt;If you want a more reliable approach, consider defining the schema manually and passing it to AutoLoader instead of relying on inference. Another option is to use spark.read.json() on a small sample once, extract the schema, and then provide it to AutoLoader. You can also enable schema evolution using .option("cloudFiles.schemaEvolutionMode", "rescue") to handle unexpected changes dynamically.&lt;/P&gt;&lt;P&gt;While it would be great if AutoLoader handled this seamlessly, these steps can help make schema inference more predictable and reduce inconsistencies.&lt;/P&gt;&lt;P&gt;Hoping you have a good day.&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Brahma&lt;/P&gt;</description>
      <pubDate>Fri, 21 Feb 2025 15:57:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-infering-struct-as-a-string-when-reading-json-data/m-p/110885#M43727</guid>
      <dc:creator>Brahmareddy</dc:creator>
      <dc:date>2025-02-21T15:57:43Z</dc:date>
    </item>
  </channel>
</rss>

