<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Incremental load from two tables in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106135#M42400</link>
    <description>&lt;P&gt;Hi,&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am looking to build an ETL process for an incremental load silver table.&lt;/P&gt;&lt;P&gt;This silver table, let's say "contracts_silver", is built by joining two bronze tables, "contracts_raw" and "customer".&lt;/P&gt;&lt;P&gt;contracts_silver&lt;/P&gt;&lt;TABLE border="1" width="546px"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;CONTRACT_ID&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;STATUS&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;CUSTOMER_NAME&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;Peter Smith&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;John Smith&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;contracts_raw&lt;/P&gt;&lt;TABLE border="1" width="100%"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;ID&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;STATUS&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;DATE&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;CUSTOMER_ID&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;1&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;TD width="16.666666666666668%" 
height="30px"&gt;2&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;customer_raw&lt;/P&gt;&lt;TABLE border="1" width="482px"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;ID&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;NAME&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;DOB&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;Peter Smith&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;John Smith&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;The "contracts_raw" table will grow faster than "customer_raw".&lt;/P&gt;&lt;P&gt;Updates are supposed to be in batch. In any update, I can have inserts, updates or deletes on both raw tables.&lt;/P&gt;&lt;P&gt;Considering that at any given time I can have updates happening on both raw tables, or only one of then, is databricks capable auomatically detecting the need to update the "contracts_silver" table if:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Update 1: in contracts_raw, ID 1 is changed to STATUS cancelled&lt;/LI&gt;&lt;LI&gt;Update 2: in customer_raw, ID2, name is changed to John J. Smith&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;The goal is to always reprocess, but only rows that will eventually change in the silver table.&lt;/P&gt;</description>
    <pubDate>Fri, 17 Jan 2025 19:59:38 GMT</pubDate>
    <dc:creator>garciargs</dc:creator>
    <dc:date>2025-01-17T19:59:38Z</dc:date>
    <item>
      <title>Incremental load from two tables</title>
      <link>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106135#M42400</link>
      <description>&lt;P&gt;Hi,&amp;nbsp;&lt;/P&gt;&lt;P&gt;I am looking to build an ETL process for an incremental load silver table.&lt;/P&gt;&lt;P&gt;This silver table, let's say "contracts_silver", is built by joining two bronze tables, "contracts_raw" and "customer".&lt;/P&gt;&lt;P&gt;contracts_silver&lt;/P&gt;&lt;TABLE border="1" width="546px"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;CONTRACT_ID&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;STATUS&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;CUSTOMER_NAME&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;Peter Smith&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="146.891px" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="109.828px" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="288.281px" height="30px"&gt;John Smith&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;contracts_raw&lt;/P&gt;&lt;TABLE border="1" width="100%"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;ID&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;STATUS&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;DATE&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;CUSTOMER_ID&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;1&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="33.333333333333336%" height="30px"&gt;SIGNED&lt;/TD&gt;&lt;TD width="16.666666666666668%" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;TD width="16.666666666666668%" 
height="30px"&gt;2&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;customer_raw&lt;/P&gt;&lt;TABLE border="1" width="482px"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;ID&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;NAME&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;DOB&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;1&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;Peter Smith&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="184.531px" height="30px"&gt;2&lt;/TD&gt;&lt;TD width="184.641px" height="30px"&gt;John Smith&lt;/TD&gt;&lt;TD width="111.828px" height="30px"&gt;2025-01-15&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;The "contracts_raw" table will grow faster than "customer_raw".&lt;/P&gt;&lt;P&gt;Updates are supposed to be in batch. In any update, I can have inserts, updates or deletes on both raw tables.&lt;/P&gt;&lt;P&gt;Considering that at any given time I can have updates happening on both raw tables, or only one of then, is databricks capable auomatically detecting the need to update the "contracts_silver" table if:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Update 1: in contracts_raw, ID 1 is changed to STATUS cancelled&lt;/LI&gt;&lt;LI&gt;Update 2: in customer_raw, ID2, name is changed to John J. Smith&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;The goal is to always reprocess, but only rows that will eventually change in the silver table.&lt;/P&gt;</description>
      <pubDate>Fri, 17 Jan 2025 19:59:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106135#M42400</guid>
      <dc:creator>garciargs</dc:creator>
      <dc:date>2025-01-17T19:59:38Z</dc:date>
    </item>
    <item>
      <title>Re: Incremental load from two tables</title>
      <link>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106189#M42419</link>
      <description>&lt;P&gt;Hi &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/130616"&gt;@garciargs&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;Yes, in Databricks you can do it using &lt;STRONG&gt;DLT (Delta Live Table)&lt;/STRONG&gt; and &lt;STRONG&gt;Spark Structured Streaming&lt;/STRONG&gt;, where you have to enable CDF (Change Data Feed) on both&amp;nbsp;&lt;STRONG&gt;&lt;EM&gt;contracts_raw&lt;/EM&gt;&lt;/STRONG&gt;&amp;nbsp;and&amp;nbsp;&lt;EM&gt;&lt;STRONG&gt;customer_raw&lt;/STRONG&gt;&lt;/EM&gt; which would track all DML changes over raw tables.&lt;/P&gt;&lt;LI-CODE lang="sql"&gt;-- New Delta table with CDF enabled
CREATE TABLE myDeltaTable (
    id INT,
    name STRING,
    age INT
)
TBLPROPERTIES (delta.enableChangeDataFeed = true);

-- Enable CDF on existing table
ALTER TABLE myDeltaTable
SET TBLPROPERTIES (delta.enableChangeDataFeed = true);&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;In a DLT notebook, you can read from both tables during data operations such as append, update, and delete, and then update your silver table accordingly. The following code is a rough example of how you can achieve this.&lt;/SPAN&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import dlt
from pyspark.sql.functions import col

# Enable CDF on all new tables by default
spark.sql("SET spark.databricks.delta.properties.defaults.enableChangeDataFeed = true")

@dlt.table(quality='bronze')
def customer_raw():
    return (spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "csv")  # Change to your file format
            .load("s3a://&amp;lt;BUCKET_NAME&amp;gt;/&amp;lt;FILE_PATH&amp;gt;/customer"))  # Change to your cloud storage path

@dlt.table(quality='bronze')
def contracts_raw():
    return (spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "csv")  # Change to your file format
            .load("s3a://&amp;lt;BUCKET_NAME&amp;gt;/&amp;lt;FILE_PATH&amp;gt;/contracts"))  # Change to your cloud storage path

@dlt.table(quality='silver')
def contracts_silver():
    customer_df = (spark.readStream
                   .option("readChangeFeed", "true")
                   .table("customer_raw"))

    contracts_df = (spark.readStream
                    .option("readChangeFeed", "true")
                    .table("contracts_raw"))

    joined_df = customer_df.join(contracts_df, customer_df["customer_id"] == contracts_df["customer_id"], "inner")
    # Note: You can perform a merge statement for each batch of data
    return joined_df.select(customer_df["*"], contracts_df["contract_details"])
&lt;/LI-CODE&gt;&lt;P&gt;Refer following link to how apply changes works in DLT&amp;nbsp;&lt;A href="https://docs.databricks.com/en/delta-live-tables/cdc.html" target="_blank"&gt;The APPLY CHANGES APIs: Simplify change data capture with Delta Live Tables | Databricks on AWS&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;BR /&gt;Hari Prasad&lt;/P&gt;</description>
      <pubDate>Sat, 18 Jan 2025 14:55:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106189#M42419</guid>
      <dc:creator>hari-prasad</dc:creator>
      <dc:date>2025-01-18T14:55:24Z</dc:date>
    </item>
    <item>
      <title>Re: Incremental load from two tables</title>
      <link>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106299#M42440</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/98469"&gt;@hari-prasad&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Thank you! Will give it a try.&lt;/P&gt;&lt;P&gt;Regards!&lt;/P&gt;</description>
      <pubDate>Mon, 20 Jan 2025 11:51:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/incremental-load-from-two-tables/m-p/106299#M42440</guid>
      <dc:creator>garciargs</dc:creator>
      <dc:date>2025-01-20T11:51:09Z</dc:date>
    </item>
  </channel>
</rss>

