<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Identify updated rows during incremental refresh in DLT Materialized Views in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/identify-updated-rows-during-incremental-refresh-in-dlt/m-p/130657#M48863</link>
    <description>&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;P&gt;&lt;SPAN&gt;Hello, every time that I run a delta live table materialized view in serverless , I get a log of "COMPLETE RECOMPUTE". I realised I was using current_timestamp as a column in my MV to identify rows which got updated in the last refresh. But that makes the MV refresh non-deterministic leading to a&amp;nbsp;log of "COMPLETE RECOMPUTE"&amp;nbsp;every time.&lt;BR /&gt;&lt;BR /&gt;How do I perform incremental refresh and also be able to identify which rows changed as part of the refresh?&lt;/SPAN&gt;&lt;/P&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
    <pubDate>Wed, 03 Sep 2025 13:01:51 GMT</pubDate>
    <dc:creator>ManoramTaparia</dc:creator>
    <dc:date>2025-09-03T13:01:51Z</dc:date>
    <item>
      <title>Identify updated rows during incremental refresh in DLT Materialized Views</title>
      <link>https://community.databricks.com/t5/data-engineering/identify-updated-rows-during-incremental-refresh-in-dlt/m-p/130657#M48863</link>
      <description>&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;P&gt;&lt;SPAN&gt;Hello, every time that I run a delta live table materialized view in serverless , I get a log of "COMPLETE RECOMPUTE". I realised I was using current_timestamp as a column in my MV to identify rows which got updated in the last refresh. But that makes the MV refresh non-deterministic leading to a&amp;nbsp;log of "COMPLETE RECOMPUTE"&amp;nbsp;every time.&lt;BR /&gt;&lt;BR /&gt;How do I perform incremental refresh and also be able to identify which rows changed as part of the refresh?&lt;/SPAN&gt;&lt;/P&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Wed, 03 Sep 2025 13:01:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/identify-updated-rows-during-incremental-refresh-in-dlt/m-p/130657#M48863</guid>
      <dc:creator>ManoramTaparia</dc:creator>
      <dc:date>2025-09-03T13:01:51Z</dc:date>
    </item>
    <item>
      <title>Re: Identify updated rows during incremental refresh in DLT Materialized Views</title>
      <link>https://community.databricks.com/t5/data-engineering/identify-updated-rows-during-incremental-refresh-in-dlt/m-p/130694#M48877</link>
      <description>&lt;P class=""&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/182330"&gt;@ManoramTaparia&lt;/a&gt; The issue is that current_timestamp() makes your MV non-deterministic, forcing complete recomputes. Here's how to fix it:&lt;/P&gt;&lt;H2&gt;Solution: Use the Source Table's Change Tracking&lt;/H2&gt;&lt;H3&gt;Option 1: Leverage Source Table's Timestamp Column&lt;BR /&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(&lt;BR /&gt;name="my_materialized_view"&lt;BR /&gt;)&lt;BR /&gt;def my_mv():&lt;BR /&gt;return (&lt;BR /&gt;spark.readStream&lt;BR /&gt;.option("readChangeFeed", "true")&lt;BR /&gt;.table("source_table")&lt;BR /&gt;.select(&lt;BR /&gt;"*",&lt;BR /&gt;col("_commit_timestamp"). alias("last_updated") # From CDF&lt;BR /&gt;&lt;BR /&gt;Option 2: Track Changes with Watermarking&lt;BR /&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(&lt;BR /&gt;name="incremental_mv"&lt;BR /&gt;)&lt;BR /&gt;def incremental_mv():&lt;BR /&gt;df = dlt.read("source_table")&lt;BR /&gt;&lt;BR /&gt;# Add deterministic timestamp from source data&lt;BR /&gt;return df.select(&lt;BR /&gt;"*",&lt;BR /&gt;col("modified_date"). alias("refresh_timestamp") # Use source column&lt;BR /&gt;).filter(&lt;BR /&gt;col("modified_date") &amp;gt; spark.conf.get("last_refresh_time", "1900-01-01")&lt;BR /&gt;&lt;BR /&gt;Option 3: Use DLT Change Data Feed&lt;/H3&gt;&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(&lt;BR /&gt;name="tracked_mv",&lt;BR /&gt;table_properties={&lt;BR /&gt;"delta.enableChangeDataFeed": "true"&lt;BR /&gt;}&lt;BR /&gt;)&lt;BR /&gt;def tracked_mv():&lt;BR /&gt;return dlt.read_stream("source_table")&lt;/P&gt;&lt;P&gt;# Query changes separately&lt;BR /&gt;changes_df = spark.read.format("delta") \&lt;BR /&gt;.option("readChangeFeed", "true") \&lt;BR /&gt;.option("startingVersion", last_version) \&lt;BR /&gt;.table("tracked_mv")&lt;/P&gt;&lt;H2&gt;Key Points&lt;/H2&gt;&lt;OL class=""&gt;&lt;LI&gt;&lt;STRONG&gt;Never use current_timestamp()&lt;/STRONG&gt; in MV definitions&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Use source table timestamps&lt;/STRONG&gt; or change feeds for tracking&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Enable CDF&lt;/STRONG&gt; on source tables for proper change tracking&lt;/LI&gt;&lt;/OL&gt;&lt;P class=""&gt;The CDF approach (Option 3) gives you both incremental refresh AND change identification without forcing recomputes.&lt;/P&gt;&lt;P class=""&gt;Which source table timestamp columns are available?&lt;/P&gt;&lt;H3&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/H3&gt;</description>
      <pubDate>Wed, 03 Sep 2025 14:58:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/identify-updated-rows-during-incremental-refresh-in-dlt/m-p/130694#M48877</guid>
      <dc:creator>ck7007</dc:creator>
      <dc:date>2025-09-03T14:58:41Z</dc:date>
    </item>
  </channel>
</rss>

