<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Delta Live Table SCD2 performance issue in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112544#M44247</link>
    <description>&lt;P&gt;The DLT automatically applies OPTIMIZE and VACUUM to the data, so I believe that's the case.&lt;/P&gt;</description>
    <pubDate>Fri, 14 Mar 2025 07:42:46 GMT</pubDate>
    <dc:creator>scorpusfx1</dc:creator>
    <dc:date>2025-03-14T07:42:46Z</dc:date>
    <item>
      <title>Delta Live Table SCD2 performance issue</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112451#M44214</link>
      <description>&lt;P&gt;&lt;SPAN&gt;Hi Community,&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;I am working on ingestion pipelines that take data from Parquet files (200 MB per day) and integrate them into my Lakehouse. This data is used to create an SCD Type 2 using apply_changes, with the row ID as the key and the file date as the sequence.&lt;/P&gt;&lt;P&gt;For the past two weeks, we have observed a significant increase in processing time for this SCD2 step (from 15 minutes to 45 minutes), and I have been unable to optimize it.&lt;/P&gt;&lt;P&gt;Do you have any suggestions for optimizing the SCD2 processing?&lt;/P&gt;&lt;P&gt;More details: I receive a 200 MB Parquet file daily, ingest it, and process it through the SCD2 step to detect historical changes.&lt;/P&gt;</description>
      <pubDate>Thu, 13 Mar 2025 09:01:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112451#M44214</guid>
      <dc:creator>scorpusfx1</dc:creator>
      <dc:date>2025-03-13T09:01:45Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Table SCD2 performance issue</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112484#M44226</link>
      <description>&lt;P&gt;Are your parquet files clubbed together as a single file? If not, then try to club them into a single file and then read.&lt;/P&gt;</description>
      <pubDate>Thu, 13 Mar 2025 15:04:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112484#M44226</guid>
      <dc:creator>BricksGuy</dc:creator>
      <dc:date>2025-03-13T15:04:26Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Table SCD2 performance issue</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112538#M44246</link>
      <description>&lt;P&gt;hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/153237"&gt;@scorpusfx1&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;What kind of source data do you have? Are these parquet files daily full snapshots of source tables? If so, you should use&amp;nbsp;&lt;STRONG&gt;apply_changes_from_snapshot,&amp;nbsp;&lt;/STRONG&gt;which is exactly built for this use case.&amp;nbsp;&lt;A href="https://docs.databricks.com/aws/en/dlt/python-ref#change-data-capture-from-database-snapshots-with-python-in-dlt" target="_blank"&gt;https://docs.databricks.com/aws/en/dlt/python-ref#change-data-capture-from-database-snapshots-with-python-in-dlt&lt;/A&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 14 Mar 2025 05:40:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112538#M44246</guid>
      <dc:creator>Stefan-Koch</dc:creator>
      <dc:date>2025-03-14T05:40:09Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Table SCD2 performance issue</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112544#M44247</link>
      <description>&lt;P&gt;The DLT automatically applies OPTIMIZE and VACUUM to the data, so I believe that's the case.&lt;/P&gt;</description>
      <pubDate>Fri, 14 Mar 2025 07:42:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112544#M44247</guid>
      <dc:creator>scorpusfx1</dc:creator>
      <dc:date>2025-03-14T07:42:46Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Table SCD2 performance issue</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112545#M44248</link>
      <description>&lt;P&gt;Thank you for the response. Indeed, this function works well for my case &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;However, I am currently using a Data Factory to extract the data into files and process them like the apply_change_snapshot function does.&lt;/P&gt;&lt;P&gt;Today, my pipeline is no longer working, and it's generating the following exception:&amp;nbsp;&lt;SPAN&gt;terminated with exception: [DELTA_MERGE_MATERIALIZE_SOURCE_FAILED_REPEATEDLY] Keeping the source of the MERGE statement materialized has failed repeatedly. SQLSTATE: XXKST.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Any idea please &lt;span class="lia-unicode-emoji" title=":disappointed_face:"&gt;😞&lt;/span&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 14 Mar 2025 07:52:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-table-scd2-performance-issue/m-p/112545#M44248</guid>
      <dc:creator>scorpusfx1</dc:creator>
      <dc:date>2025-03-14T07:52:14Z</dc:date>
    </item>
  </channel>
</rss>

