<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Read Array of Arrays of Objects JSON file using Spark in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142293#M51912</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/201156"&gt;@Joost1024&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;So here's the issue. It seems that the JSON &lt;SPAN class=""&gt;DataFrameReader&lt;/SPAN&gt;&amp;nbsp;expects a JSON object. But in your case we're dealing with a JSON array at the root level - not a JSON object.&lt;/P&gt;&lt;P&gt;So for instance, if we just rewrote your file in the following way, Spark would be able to infer the schema without any issues:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;{
    "data": [
            [
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19848848.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19849120.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                },
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19849580.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:30:00+00:00",
                "last_updated": "2025-12-14T23:30:00+00:00"
                }
            ],
            [
                {
                "entity_id": "sensor.home_temperature",
                "state": "21.5",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "°C",
                    "device_class": "temperature",
                    "friendly_name": "Home Temperature"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.home_temperature",
                "state": "21.3",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "°C",
                    "device_class": "temperature",
                    "friendly_name": "Home Temperature"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                }
            ],
            [
                {
                "entity_id": "sensor.power_consumption",
                "state": "1250.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1180.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1320.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:30:00+00:00",
                "last_updated": "2025-12-14T23:30:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1295.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:45:00+00:00",
                "last_updated": "2025-12-14T23:45:00+00:00"
                }
            ]
            ]

}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_0-1766229529369.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22443i3F3B7AA389B71A02/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_0-1766229529369.png" alt="szymon_dybczak_0-1766229529369.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;OK, so for JSON arrays at the top level we can try a different approach. We can read the JSON file as text (the important thing here - we want to use the option&amp;nbsp;&lt;STRONG&gt;wholetext=True&amp;nbsp;&lt;/STRONG&gt;so the file is not split on newlines) and then use the from_json function to parse it correctly:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import lit, from_json, col, explode
from pyspark.sql.types import StructType, StructField, ArrayType, StringType


attributes_schema = StructType([
    StructField("state_class", StringType(), nullable=True),
    StructField("unit_of_measurement", StringType(), nullable=True),
    StructField("device_class", StringType(), nullable=True),
    StructField("friendly_name", StringType(), nullable=True)
])

sensor_reading_schema = StructType([
    StructField("entity_id", StringType(), nullable=True),
    StructField("state", StringType(), nullable=True),
    StructField("attributes", attributes_schema, nullable=True),
    StructField("last_changed", StringType(), nullable=True),
    StructField("last_updated", StringType(), nullable=True)
])


df_text = spark.read.text('/Volumes/logging_demo/default/logs/sample_data.json', wholetext=True)

array_schema = ArrayType(ArrayType(sensor_reading_schema))
df_parsed = df_text.select(from_json(col("value"), array_schema).alias("data"))


# df_flat = df_parsed.select(explode(col("data")).alias("inner_array")) \
#                    .select(explode(col("inner_array")).alias("sensor")) \
#                    .select("sensor.*")

display(df_parsed)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;And as you can see in the screenshot below - now we parsed our file correctly:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_1-1766229807019.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22444i167B608ADFC862B6/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_1-1766229807019.png" alt="szymon_dybczak_1-1766229807019.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Of course you can flatten it further - just uncomment the df_flat dataframe.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Sat, 20 Dec 2025 11:26:09 GMT</pubDate>
    <dc:creator>szymon_dybczak</dc:creator>
    <dc:date>2025-12-20T11:26:09Z</dc:date>
    <item>
      <title>Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142084#M51876</link>
      <description>&lt;P&gt;Hi Databricks Community!&amp;nbsp;&lt;/P&gt;&lt;P&gt;This is my first post in this forum, so I hope you can forgive me if it's not according to the forum best practices &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;After lots of searching, I decided to share the peculiar issue I'm running into in this community.&lt;/P&gt;&lt;P&gt;I try to load a JSON format that is exposed via the Home Assistant&amp;nbsp;&lt;SPAN&gt;&lt;FONT face="courier new,courier"&gt;/api/history/period/&lt;/FONT&gt; endpoint. The format consists of an array of arrays:&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;PRE&gt;[&lt;BR /&gt;&lt;SPAN&gt;   [&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;      {&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;         "entity_id": "sensor.solaredge_lifetime_energy",&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;         "state": "19848848.0",&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;         "attributes": {&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;            "state_class": "total",&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;            "unit_of_measurement": "Wh",&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;            "device_class": "energy",&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;            "friendly_name": "solaredge Lifetime energy"&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;         },&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;         "last_changed": "2025-12-14T23:00:00+00:00",&lt;BR /&gt;&lt;/SPAN&gt;&lt;SPAN&gt;         "last_updated": "2025-12-14T23:00:00+00:00"&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;      },&lt;BR /&gt;&lt;/SPAN&gt;      { ... }&lt;BR /&gt;&lt;SPAN&gt;   ],&lt;BR /&gt;&lt;/SPAN&gt;   [ ... ]&lt;BR /&gt;]&lt;/PRE&gt;&lt;P&gt;&lt;SPAN&gt;Each array contains measurements of a specific sensor. 
Every object has the same 5 properties / fields.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;The file(s) live in S3 and I try to read them as follows:&lt;/SPAN&gt;&lt;/P&gt;&lt;DIV&gt;&lt;PRE&gt;&lt;SPAN&gt;schema &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt; &lt;SPAN&gt;StructType&lt;/SPAN&gt;&lt;SPAN&gt;([&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"entity_id"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"state"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;True&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"attributes"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;MapType&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;()), &lt;/SPAN&gt;&lt;SPAN&gt;True&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"last_changed"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;TimestampType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"last_updated"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;TimestampType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;])&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR 
/&gt;&lt;SPAN&gt;spark.read.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;'json'&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"multiLine"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"true"&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;schema&lt;/SPAN&gt;&lt;SPAN&gt;(schema).&lt;/SPAN&gt;&lt;SPAN&gt;load&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;&amp;lt;S3 location&amp;gt;&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;limit&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;10&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;display&lt;/SPAN&gt;&lt;SPAN&gt;()&lt;/SPAN&gt;&lt;/PRE&gt;&lt;/DIV&gt;&lt;P&gt;&lt;SPAN&gt;For some strange reason the result is 1 row per file with all null values, while the correct columns are there.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;I tried / checked different things so far:&lt;/SPAN&gt;&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;SPAN&gt;Validated the JSON: The JSON file is valid and doesn't contain any malformed / missing values.&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN&gt;I 'flattened' the list of lists to a single top-level list using&amp;nbsp;&lt;/SPAN&gt;&lt;FONT face="courier new,courier"&gt;flattened = [element &lt;SPAN&gt;for &lt;/SPAN&gt;&lt;SPAN&gt;sub_list &lt;/SPAN&gt;&lt;SPAN&gt;in &lt;/SPAN&gt;&lt;SPAN&gt;data &lt;/SPAN&gt;&lt;SPAN&gt;for &lt;/SPAN&gt;&lt;SPAN&gt;element &lt;/SPAN&gt;&lt;SPAN&gt;in &lt;/SPAN&gt;&lt;/FONT&gt;&lt;SPAN&gt;&lt;FONT face="courier new,courier"&gt;sub_list].&lt;/FONT&gt; &lt;STRONG&gt;The resulting JSON is loaded just fine and is exactly what I want as the output.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN&gt;I tried removing the schema definition, but the result remains the same.&lt;/SPAN&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;Any suggestions? I don't think I should transform the source JSON in order to be able to load it using Spark.&lt;/P&gt;</description>
      <pubDate>Wed, 17 Dec 2025 12:09:53 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142084#M51876</guid>
      <dc:creator>Joost1024</dc:creator>
      <dc:date>2025-12-17T12:09:53Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142088#M51879</link>
      <description>&lt;P&gt;Greetings&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/201156"&gt;@Joost1024&lt;/a&gt;&amp;nbsp;, I did some digging.&amp;nbsp;&lt;/P&gt;
&lt;P class="p1"&gt;You’re running into a root type mismatch.&lt;/P&gt;
&lt;P class="p1"&gt;Your JSON’s top level is an array of arrays, but the schema you provided describes a single struct (one record). Spark can’t reconcile those two shapes, so it does what it always does in this situation: it gives you one row per file and fills the struct fields with nulls.&lt;/P&gt;
&lt;P class="p1"&gt;What’s happening&lt;/P&gt;
&lt;P class="p1"&gt;The file’s root is an array of arrays of structs — think &lt;SPAN class="s2"&gt;[[{...}, {...}], [...]]&lt;/SPAN&gt;. Your schema, however, describes only the inner object, not the outer container. When &lt;SPAN class="s2"&gt;multiLine&lt;/SPAN&gt; JSON is enabled, Spark treats each file as a single JSON value. Since the root type doesn’t match the schema, Spark can’t align the fields and you end up with nulls across the board.&lt;/P&gt;
&lt;P class="p1"&gt;The good news: you don’t need to change the source JSON. There are two clean ways to handle this directly in Spark.&lt;/P&gt;
&lt;P class="p1"&gt;Option A: Let Spark infer the nested arrays, then explode twice&lt;/P&gt;
&lt;P class="p1"&gt;Have Spark read the file as-is, infer the shape, and flatten it step by step:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql import functions as F

df0 = (spark.read.format("json")
       .option("multiLine", "true")
       .load("&amp;lt;S3 location&amp;gt;"))

# df0 has a single column `value`: array&amp;lt;array&amp;lt;struct&amp;lt;entity_id,...&amp;gt;&amp;gt;&amp;gt;
df = (df0
      .select(F.explode("value").alias("arr"))   # array&amp;lt;struct&amp;gt;
      .select(F.explode("arr").alias("row"))     # struct
      .select(
          "row.entity_id",
          F.col("row.state").alias("state"),
          "row.attributes",
          F.to_timestamp(
              "row.last_changed",
              "yyyy-MM-dd'T'HH:mm:ssXXX"
          ).alias("last_changed"),
          F.to_timestamp(
              "row.last_updated",
              "yyyy-MM-dd'T'HH:mm:ssXXX"
          ).alias("last_updated"),
      ))

display(df.limit(10))&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P class="p1"&gt;A couple of notes:&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;
&lt;P class="p1"&gt;The timestamp pattern &lt;SPAN class="s1"&gt;yyyy-MM-dd'T'HH:mm:ssXXX&lt;/SPAN&gt; correctly handles ISO-8601 offsets like &lt;SPAN class="s1"&gt;+00:00&lt;/SPAN&gt;.&lt;/P&gt;
&lt;/LI&gt;
&lt;LI&gt;
&lt;P class="p1"&gt;If &lt;SPAN class="s1"&gt;state&lt;/SPAN&gt; should be numeric, just cast it after the explode.&lt;/P&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;P class="p1"&gt;Option B: Define a schema that actually matches the root&lt;/P&gt;
&lt;P class="p1"&gt;Instead of fighting the JSON shape, describe it accurately: a single top-level field that is an array of arrays of structs.&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql import functions as F, types as T

inner = T.StructType([
    T.StructField("entity_id", T.StringType(), False),
    T.StructField("state", T.StringType(), True),
    T.StructField("attributes", T.MapType(T.StringType(), T.StringType()), True),
    T.StructField("last_changed", T.StringType(), False),
    T.StructField("last_updated", T.StringType(), False),
])

schema = T.StructType([
    T.StructField("value", T.ArrayType(T.ArrayType(inner)), True)
])

df0 = (spark.read.format("json")
       .option("multiLine", "true")
       # .option("primitivesAsString", "true")  # optional, see notes below
       .schema(schema)
       .load("&amp;lt;S3 location&amp;gt;"))

df = (df0
      .select(F.explode("value").alias("arr"))
      .select(F.explode("arr").alias("row"))
      .select(
          "row.entity_id",
          F.col("row.state").alias("state"),
          "row.attributes",
          F.to_timestamp(
              "row.last_changed",
              "yyyy-MM-dd'T'HH:mm:ssXXX"
          ).alias("last_changed"),
          F.to_timestamp(
              "row.last_updated",
              "yyyy-MM-dd'T'HH:mm:ssXXX"
          ).alias("last_updated"),
      ))

display(df.limit(10))&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P class="p1"&gt;Extra tips worth knowing&lt;/P&gt;
&lt;P class="p1"&gt;If &lt;SPAN class="s1"&gt;attributes&lt;/SPAN&gt; can contain numbers or booleans, you have a couple of safe options:&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;
&lt;P class="p1"&gt;Use &lt;SPAN class="s1"&gt;.option("primitivesAsString", "true")&lt;/SPAN&gt; so everything lands as strings and nothing silently becomes null.&lt;/P&gt;
&lt;/LI&gt;
&lt;LI&gt;
&lt;P class="p1"&gt;Or widen the map type and normalize downstream once the data is flattened.&lt;/P&gt;
&lt;/LI&gt;
&lt;/UL&gt;
&lt;P class="p1"&gt;Also worth calling out: your original approach worked once you flattened the JSON externally because you removed that extra array level. The double-explode here is doing the same thing, just inside Spark where it belongs.&lt;/P&gt;
&lt;P class="p1"&gt;&amp;nbsp;&lt;/P&gt;
&lt;P class="p1"&gt;Hope this helps, Louis.&lt;/P&gt;</description>
      <pubDate>Wed, 17 Dec 2025 13:18:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142088#M51879</guid>
      <dc:creator>Louis_Frolio</dc:creator>
      <dc:date>2025-12-17T13:18:24Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142090#M51881</link>
      <description>&lt;P&gt;Thank you so much for your extensive explanation&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/34815"&gt;@Louis_Frolio&lt;/a&gt;! Now it makes complete sense.&lt;/P&gt;</description>
      <pubDate>Wed, 17 Dec 2025 13:32:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142090#M51881</guid>
      <dc:creator>Joost1024</dc:creator>
      <dc:date>2025-12-17T13:32:57Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142091#M51882</link>
      <description>&lt;P&gt;I guess I was a bit over enthusiastic by accepting the answer.&lt;/P&gt;&lt;P&gt;When I run the following on the single object array of arrays (as shown in the original post) I get a single row with column "value" and value null.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;PRE&gt;&lt;SPAN&gt;from&lt;/SPAN&gt;&lt;SPAN&gt; pyspark.sql &lt;/SPAN&gt;&lt;SPAN&gt;import&lt;/SPAN&gt;&lt;SPAN&gt; functions &lt;/SPAN&gt;&lt;SPAN&gt;as&lt;/SPAN&gt;&lt;SPAN&gt; F, types &lt;/SPAN&gt;&lt;SPAN&gt;as&lt;/SPAN&gt;&lt;SPAN&gt; T&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;inner &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt; T.&lt;/SPAN&gt;&lt;SPAN&gt;StructType&lt;/SPAN&gt;&lt;SPAN&gt;([&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"entity_id"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"state"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;True&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"attributes"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;MapType&lt;/SPAN&gt;&lt;SPAN&gt;(T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;()), &lt;/SPAN&gt;&lt;SPAN&gt;True&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"last_changed"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), 
&lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"last_updated"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;StringType&lt;/SPAN&gt;&lt;SPAN&gt;(), &lt;/SPAN&gt;&lt;SPAN&gt;False&lt;/SPAN&gt;&lt;SPAN&gt;),&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;])&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;schema &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt; T.&lt;/SPAN&gt;&lt;SPAN&gt;StructType&lt;/SPAN&gt;&lt;SPAN&gt;([&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   T.&lt;/SPAN&gt;&lt;SPAN&gt;StructField&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"value"&lt;/SPAN&gt;&lt;SPAN&gt;, T.&lt;/SPAN&gt;&lt;SPAN&gt;ArrayType&lt;/SPAN&gt;&lt;SPAN&gt;(T.&lt;/SPAN&gt;&lt;SPAN&gt;ArrayType&lt;/SPAN&gt;&lt;SPAN&gt;(inner)), &lt;/SPAN&gt;&lt;SPAN&gt;True&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;])&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;SPAN&gt;df0 &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt; (spark.read.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"json"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"multiLine"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"true"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"primitivesAsString"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"true"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   .&lt;/SPAN&gt;&lt;SPAN&gt;schema&lt;/SPAN&gt;&lt;SPAN&gt;(schema)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;   .&lt;/SPAN&gt;&lt;SPAN&gt;load&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"&amp;lt;S3 path&amp;gt;/original-single-item.json"&lt;/SPAN&gt;&lt;SPAN&gt;))&lt;/SPAN&gt;&lt;BR /&gt;&lt;BR 
/&gt;&lt;SPAN&gt;display&lt;/SPAN&gt;&lt;SPAN&gt;(df0&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/PRE&gt;&lt;/DIV&gt;</description>
      <pubDate>Wed, 17 Dec 2025 13:59:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142091#M51882</guid>
      <dc:creator>Joost1024</dc:creator>
      <dc:date>2025-12-17T13:59:42Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142263#M51909</link>
      <description>&lt;P&gt;Any idea&amp;nbsp;&lt;A href="https://community.databricks.com/t5/user/viewprofilepage/user-id/34815" target="_blank"&gt;@Louis_Frolio&lt;/A&gt;? Is this DF supposed to be displayed as null?&lt;/P&gt;</description>
      <pubDate>Fri, 19 Dec 2025 14:11:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142263#M51909</guid>
      <dc:creator>Joost1024</dc:creator>
      <dc:date>2025-12-19T14:11:44Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142293#M51912</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/201156"&gt;@Joost1024&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;So here's the issue. It seems that the JSON &lt;SPAN class=""&gt;DataFrameReader&lt;/SPAN&gt;&amp;nbsp;expects a JSON object. But in your case we're dealing with a JSON array at the root level - not a JSON object.&lt;/P&gt;&lt;P&gt;So for instance, if we just rewrote your file in the following way, Spark would be able to infer the schema without any issues:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;{
    "data": [
            [
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19848848.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19849120.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                },
                {
                "entity_id": "sensor.solaredge_lifetime_energy",
                "state": "19849580.0",
                "attributes": {
                    "state_class": "total",
                    "unit_of_measurement": "Wh",
                    "device_class": "energy",
                    "friendly_name": "solaredge Lifetime energy"
                },
                "last_changed": "2025-12-14T23:30:00+00:00",
                "last_updated": "2025-12-14T23:30:00+00:00"
                }
            ],
            [
                {
                "entity_id": "sensor.home_temperature",
                "state": "21.5",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "°C",
                    "device_class": "temperature",
                    "friendly_name": "Home Temperature"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.home_temperature",
                "state": "21.3",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "°C",
                    "device_class": "temperature",
                    "friendly_name": "Home Temperature"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                }
            ],
            [
                {
                "entity_id": "sensor.power_consumption",
                "state": "1250.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:00:00+00:00",
                "last_updated": "2025-12-14T23:00:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1180.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:15:00+00:00",
                "last_updated": "2025-12-14T23:15:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1320.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:30:00+00:00",
                "last_updated": "2025-12-14T23:30:00+00:00"
                },
                {
                "entity_id": "sensor.power_consumption",
                "state": "1295.0",
                "attributes": {
                    "state_class": "measurement",
                    "unit_of_measurement": "W",
                    "device_class": "power",
                    "friendly_name": "Power Consumption"
                },
                "last_changed": "2025-12-14T23:45:00+00:00",
                "last_updated": "2025-12-14T23:45:00+00:00"
                }
            ]
            ]

}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_0-1766229529369.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22443i3F3B7AA389B71A02/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_0-1766229529369.png" alt="szymon_dybczak_0-1766229529369.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;OK, so for JSON arrays at the top level we can try a different approach. We can read the JSON file as text (the important thing here - we want to use the option&amp;nbsp;&lt;STRONG&gt;wholetext=True&amp;nbsp;&lt;/STRONG&gt;so the file is not split on newlines) and then use the from_json function to parse it correctly:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import lit, from_json, col, explode
from pyspark.sql.types import (
    ArrayType,
    StringType,
    StructField,
    StructType,
)


# Nested "attributes" object of a sensor reading: four nullable string fields.
attributes_schema = StructType([
    StructField(field_name, StringType(), nullable=True)
    for field_name in (
        "state_class",
        "unit_of_measurement",
        "device_class",
        "friendly_name",
    )
])

# One sensor reading object; every field is nullable and all values except
# the nested "attributes" struct are kept as plain strings.
sensor_reading_schema = StructType([
    StructField(field_name, field_type, nullable=True)
    for field_name, field_type in (
        ("entity_id", StringType()),
        ("state", StringType()),
        ("attributes", attributes_schema),
        ("last_changed", StringType()),
        ("last_updated", StringType()),
    )
])

# The file is an array of arrays of sensor readings.
array_schema = ArrayType(ArrayType(sensor_reading_schema))


# wholetext=True loads the file as a single "value" row instead of one row per line.
df_text = spark.read.text(
    '/Volumes/logging_demo/default/logs/sample_data.json',
    wholetext=True,
)

# Parse the raw text with the explicit schema into a single "data" column.
df_parsed = df_text.select(
    from_json(col("value"), array_schema).alias("data")
)


# Optional: uncomment to explode both array levels into one row per reading.
# df_flat = df_parsed.select(explode(col("data")).alias("inner_array")) \
#                    .select(explode(col("inner_array")).alias("sensor")) \
#                    .select("sensor.*")

display(df_parsed)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;And as you can see in the screenshot below, the file is now parsed correctly:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_1-1766229807019.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/22444i167B608ADFC862B6/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_1-1766229807019.png" alt="szymon_dybczak_1-1766229807019.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Of course, you can flatten it further - just uncomment the df_flat dataframe.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sat, 20 Dec 2025 11:26:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142293#M51912</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-12-20T11:26:09Z</dc:date>
    </item>
    <item>
      <title>Re: Read Array of Arrays of Objects JSON file using Spark</title>
      <link>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142367#M51932</link>
      <description>&lt;P&gt;Nice job&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/110502"&gt;@szymon_dybczak&lt;/a&gt;&amp;nbsp;.&amp;nbsp; Thanks for helping to make Databricks Community stronger!&lt;/P&gt;
&lt;P&gt;Cheers, Louis.&lt;/P&gt;</description>
      <pubDate>Mon, 22 Dec 2025 16:00:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/read-array-of-arrays-of-objects-json-file-using-spark/m-p/142367#M51932</guid>
      <dc:creator>Louis_Frolio</dc:creator>
      <dc:date>2025-12-22T16:00:56Z</dc:date>
    </item>
  </channel>
</rss>

