<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic DLT Streaming Schema and Select in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86867#M37350</link>
    <description>&lt;P&gt;I am reading JSON files written to ADLS from Kafka using DLT and spark.readStream to create a streaming table for my raw ingest data. My schema is two arrays at the top level:&lt;/P&gt;&lt;P&gt;NewRecord array, OldRecord array.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;I pass the schema and I run a select on NewRecord.* so I get only the fields in the NewRecord array. The problem is that my streaming table contains null NewRecord and OldRecord columns, followed by all the fields in NewRecord.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;How can I have only the NewRecord fields in my table?&lt;/P&gt;</description>
    <pubDate>Fri, 30 Aug 2024 13:50:08 GMT</pubDate>
    <dc:creator>ggsmith</dc:creator>
    <dc:date>2024-08-30T13:50:08Z</dc:date>
    <item>
      <title>DLT Streaming Schema and Select</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86867#M37350</link>
      <description>&lt;P&gt;I am reading JSON files written to ADLS from Kafka using DLT and spark.readStream to create a streaming table for my raw ingest data. My schema is two arrays at the top level:&lt;/P&gt;&lt;P&gt;NewRecord array, OldRecord array.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;I pass the schema and I run a select on NewRecord.* so I get only the fields in the NewRecord array. The problem is that my streaming table contains null NewRecord and OldRecord columns, followed by all the fields in NewRecord.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;How can I have only the NewRecord fields in my table?&lt;/P&gt;</description>
      <pubDate>Fri, 30 Aug 2024 13:50:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86867#M37350</guid>
      <dc:creator>ggsmith</dc:creator>
      <dc:date>2024-08-30T13:50:08Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Streaming Schema and Select</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86874#M37352</link>
      <description>&lt;P&gt;Edit: Adding code for clarity.&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Top-level struct [OldRecord, NewRecord]
schema = StructType([
    StructField("NewRecord", StructType([...
          ]),
          "OldRecord", StructType([....
          ])
    )

# streaming query
@dlt.table(
    name="newrecord_raw",
    table_properties={"quality": "bronze"},
    temporary=False,
)
def create_table():
    query = (
        spark.readStream.format("cloudFiles")
        .schema(schema)
        .option("cloudFiles.format", "json")
        .option("checkpointLocation", "/Volumes/dev/streaming/")
        .load(sink_dir)
        .select("NewRecord.*")
        .withColumn("load_dt", to_timestamp(current_timestamp()))
    )
    return query&lt;/LI-CODE&gt;</description>
      <pubDate>Fri, 30 Aug 2024 14:44:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86874#M37352</guid>
      <dc:creator>ggsmith</dc:creator>
      <dc:date>2024-08-30T14:44:19Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Streaming Schema and Select</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86982#M37361</link>
      <description>&lt;P&gt;I did a full refresh from the delta tables pipeline and that fixed it. I guess it was remembering the first run where I just had the top level arrays as two columns in the table.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 30 Aug 2024 21:54:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-streaming-schema-and-select/m-p/86982#M37361</guid>
      <dc:creator>ggsmith</dc:creator>
      <dc:date>2024-08-30T21:54:52Z</dc:date>
    </item>
  </channel>
</rss>

