<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: DLT Serverless incremental refresh of materialized view in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95913#M39188</link>
    <description>&lt;P&gt;I split up materialized view in 3 separate ones:&lt;BR /&gt;&lt;BR /&gt;step1:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step1", table_properties={"delta.enableRowTracking": "true"})
def step1():
    isolate_names = dlt.read("source").select("Name").groupBy("Name").count()
    return isolate_names&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;step2:&lt;BR /&gt;uses step1 and does some extra processing steps (notable functions: split, explode, slice, ... -&amp;gt; select expressions mostly)&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step2", table_properties={"delta.enableRowTracking": "true"})
def step2():
    asa_telemetry = dlt.read("step1").select("Name")
....&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;step3:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step3", table_properties={"delta.enableRowTracking": "true"})
def step3():
    step2 = dlt.read("step2")

    meta = spark.table("MetaData").alias("meta")

    add_id = (
        step2.alias("step2")
        .join(
            meta,
            on=[
                f.col("step2.colA") == f.col("meta.col1"),
                f.col("step2.colB") == f.col("meta.col2"),
            ],
            how="left",
        )
        .select("step2.*", "meta.Id")
    )

    return add_id&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;For step 1 I get this message:&lt;BR /&gt;&lt;SPAN&gt;Flow 'step1' has been planned in DLT to be executed as GROUP_AGGREGATE.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;For step2 I get this message:&lt;BR /&gt;&lt;SPAN&gt;Flow 'step2' has been planned in DLT to be executed as COMPLETE_RECOMPUTE. Another option is available:GROUP_AGGREGATE. COMPLETE_RECOMPUTE was chosen in the current run for its optimal performance.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;So step1 and 2 don't have incrementalisation issues&lt;BR /&gt;&lt;BR /&gt;Step 3 has issues:&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;{
  "planning_information": {
    "technique_information": [
      {
        "incrementalization_issues": [
          {
            "issue_type": "CDF_UNAVAILABLE",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_ROW_BASED",
        "incrementalization_issues": [
          {
            "issue_type": "ROW_TRACKING_NOT_ENABLED",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          },
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Join",
            "join_type": "LEFT_OUTER"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_COMPLETE_RECOMPUTE",
        "is_chosen": true,
        "is_applicable": true,
        "cost": 78952
      }
    ],
    "source_table_information": [
      {
        "table_name": "step2",
        "table_id": "bd21e05c-8011-485a-8f0f-8dc82656d31e",
        "full_size": 3791,
        "is_size_after_pruning": true,
        "is_row_id_enabled": true,
        "is_cdf_enabled": true,
        "is_deletion_vector_enabled": false
      },
      {
        "table_name": "MetaData",
        "full_size": 1747,
        "is_size_after_pruning": true,
        "is_row_id_enabled": false,
        "is_cdf_enabled": false,
        "is_deletion_vector_enabled": true
      }
    ],
    "target_table_information": {
      "table_name": "step3",
      "full_size": 3943,
      "is_row_id_enabled": true,
      "is_cdf_enabled": true,
      "is_deletion_vector_enabled": false
    }
  }
}&lt;/LI-CODE&gt;&lt;P&gt;So the problem really seems to be the left join?&lt;/P&gt;</description>
    <pubDate>Thu, 24 Oct 2024 09:04:59 GMT</pubDate>
    <dc:creator>L1000</dc:creator>
    <dc:date>2024-10-24T09:04:59Z</dc:date>
    <item>
      <title>DLT Serverless incremental refresh of materialized view</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95715#M39158</link>
      <description>&lt;P&gt;I have a materialized view that always does a "COMPLETE_RECOMPUTE", but I can't figure out why.&lt;BR /&gt;I found how I can get the logs:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;SELECT * FROM event_log(pipeline_id)
WHERE event_type = 'planning_information'
ORDER BY timestamp desc;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;And for my table I got this info:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;{
  "planning_information": {
    "technique_information": [
      {
        "maintenance_type": "MAINTENANCE_TYPE_GENERIC_AGGREGATE",
        "incrementalization_issues": [
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Aggregate",
            "plan_not_incrementalizable_sub_type": "AGGREGATE_NOT_TOP_NODE"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_ROW_BASED",
        "incrementalization_issues": [
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Join",
            "join_type": "LEFT_OUTER"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_COMPLETE_RECOMPUTE",
        "is_chosen": true,
        "is_applicable": true,
        "cost": 34936819266
      }
    ]
  }
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;It seems like there is an issue with the left outer join?&lt;BR /&gt;But normally that is a supported operation:&lt;/P&gt;&lt;P&gt;&lt;A href="https://learn.microsoft.com/en-us/azure/databricks/optimizations/incremental-refresh#enzyme-support" target="_blank" rel="noopener"&gt;https://learn.microsoft.com/en-us/azure/databricks/optimizations/incremental-refresh#enzyme-support&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 23 Oct 2024 12:57:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95715#M39158</guid>
      <dc:creator>L1000</dc:creator>
      <dc:date>2024-10-23T12:57:03Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Serverless incremental refresh of materialized view</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95784#M39173</link>
      <description>&lt;P&gt;From the plan it says that aggregation is happening before the join. Can you ensure the join happens first and then the aggregation?&amp;nbsp; Are you able to share the query?&lt;/P&gt;</description>
      <pubDate>Wed, 23 Oct 2024 22:39:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95784#M39173</guid>
      <dc:creator>randomquartile</dc:creator>
      <dc:date>2024-10-23T22:39:23Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Serverless incremental refresh of materialized view</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95827#M39180</link>
      <description>&lt;P&gt;There is some stuff that happens in separate custom functions, but all the functions consist of supported operations.&lt;BR /&gt;&lt;BR /&gt;The aggregation is .groupBy("Name").count() before anything happens as a way to distinct the data. (because 'distinct' is not supported but 'group by' is).&lt;BR /&gt;Afterwards it's a bit of processing and then finally a join on some fields extracted from the 'Name'.&lt;BR /&gt;&lt;BR /&gt;So Materialized Views don't support joins as a final step?&lt;/P&gt;</description>
      <pubDate>Thu, 24 Oct 2024 06:46:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95827#M39180</guid>
      <dc:creator>L1000</dc:creator>
      <dc:date>2024-10-24T06:46:29Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Serverless incremental refresh of materialized view</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95876#M39186</link>
      <description>&lt;P&gt;I have split up the materialized view in 3 separate ones:&lt;BR /&gt;&lt;BR /&gt;MV1: deduplicate by grouping:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;.table(name="step1", table_properties={"delta.enableRowTracking": "true"})
def step1():
    isolate_names = dlt.read("source_data").select("Name").groupBy("Name").count()
    return isolate_names&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;MV2:&lt;BR /&gt;step2: Use step1 and process data (notable functions: split, slice, sha2, explode -&amp;gt; select expressions)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;.table(name="step2", table_properties={"delta.enableRowTracking": "true"})
def step2():
    df = dlt.read("step1").select("Name")
...&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;MV3:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;.table(name="step3", table_properties={"delta.enableRowTracking": "true"})
def step3():
    step2 = dlt.read("step2")

    meta = spark.table("MetaData").alias("meta")

    add_meta = (
        step2.alias("step2")
        .join(
            meta,
            on=[
                f.col("step2.colA") == f.col("meta.col1"),
                f.col("step2.colB") == f.col("meta.col2"),
            ],
            how="left",
        )
        .select("step2.*", "meta.Id")
    )

    return add_meta&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;for step 3 I get now these incrementalization issues:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;{
  "planning_information": {
    "technique_information": [
      {
        "incrementalization_issues": [
          {
            "issue_type": "CDF_UNAVAILABLE",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_ROW_BASED",
        "incrementalization_issues": [
          {
            "issue_type": "ROW_TRACKING_NOT_ENABLED",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          },
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Join",
            "join_type": "LEFT_OUTER"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_COMPLETE_RECOMPUTE",
        "is_chosen": true,
        "is_applicable": true,
        "cost": 78952
      }
    ],
    "source_table_information": [
      {
        "table_name": "step2",
        "full_size": 3791,
        "is_size_after_pruning": true,
        "is_row_id_enabled": true,
        "is_cdf_enabled": true,
        "is_deletion_vector_enabled": false
      },
      {
        "table_name": "meta",
        "full_size": 1747,
        "is_size_after_pruning": true,
        "is_row_id_enabled": false,
        "is_cdf_enabled": false,
        "is_deletion_vector_enabled": true
      }
    ],
    "target_table_information": {
      "table_name": "step3",
      "full_size": 3943,
      "is_row_id_enabled": true,
      "is_cdf_enabled": true,
      "is_deletion_vector_enabled": false
    }
  }
}&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;for step1 and step2 I got these messages:&lt;BR /&gt;Step1&lt;SPAN&gt;&amp;nbsp;has been planned in DLT to be executed as GROUP_AGGREGATE.&lt;/SPAN&gt; (-&amp;gt; incremental?)&lt;BR /&gt;Step2&amp;nbsp;&lt;SPAN&gt;has been planned in DLT to be executed as COMPLETE_RECOMPUTE. Another option is available:GROUP_AGGREGATE. COMPLETE_RECOMPUTE was chosen in the current run for its optimal performance.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 24 Oct 2024 08:00:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95876#M39186</guid>
      <dc:creator>L1000</dc:creator>
      <dc:date>2024-10-24T08:00:34Z</dc:date>
    </item>
    <item>
      <title>Re: DLT Serverless incremental refresh of materialized view</title>
      <link>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95913#M39188</link>
      <description>&lt;P&gt;I split up materialized view in 3 separate ones:&lt;BR /&gt;&lt;BR /&gt;step1:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step1", table_properties={"delta.enableRowTracking": "true"})
def step1():
    isolate_names = dlt.read("source").select("Name").groupBy("Name").count()
    return isolate_names&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;step2:&lt;BR /&gt;uses step1 and does some extra processing steps (notable functions: split, explode, slice, ... -&amp;gt; select expressions mostly)&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step2", table_properties={"delta.enableRowTracking": "true"})
def step2():
    asa_telemetry = dlt.read("step1").select("Name")
....&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;step3:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97035"&gt;@Dlt&lt;/a&gt;.table(name="step3", table_properties={"delta.enableRowTracking": "true"})
def step3():
    step2 = dlt.read("step2")

    meta = spark.table("MetaData").alias("meta")

    add_id = (
        step2.alias("step2")
        .join(
            meta,
            on=[
                f.col("step2.colA") == f.col("meta.col1"),
                f.col("step2.colB") == f.col("meta.col2"),
            ],
            how="left",
        )
        .select("step2.*", "meta.Id")
    )

    return add_id&lt;/LI-CODE&gt;&lt;P&gt;&lt;BR /&gt;&lt;BR /&gt;For step 1 I get this message:&lt;BR /&gt;&lt;SPAN&gt;Flow 'step1' has been planned in DLT to be executed as GROUP_AGGREGATE.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;For step2 I get this message:&lt;BR /&gt;&lt;SPAN&gt;Flow 'step2' has been planned in DLT to be executed as COMPLETE_RECOMPUTE. Another option is available:GROUP_AGGREGATE. COMPLETE_RECOMPUTE was chosen in the current run for its optimal performance.&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;So step1 and 2 don't have incrementalisation issues&lt;BR /&gt;&lt;BR /&gt;Step 3 has issues:&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;{
  "planning_information": {
    "technique_information": [
      {
        "incrementalization_issues": [
          {
            "issue_type": "CDF_UNAVAILABLE",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_ROW_BASED",
        "incrementalization_issues": [
          {
            "issue_type": "ROW_TRACKING_NOT_ENABLED",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2"
            }
          },
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Join",
            "join_type": "LEFT_OUTER"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_COMPLETE_RECOMPUTE",
        "is_chosen": true,
        "is_applicable": true,
        "cost": 78952
      }
    ],
    "source_table_information": [
      {
        "table_name": "step2",
        "table_id": "bd21e05c-8011-485a-8f0f-8dc82656d31e",
        "full_size": 3791,
        "is_size_after_pruning": true,
        "is_row_id_enabled": true,
        "is_cdf_enabled": true,
        "is_deletion_vector_enabled": false
      },
      {
        "table_name": "MetaData",
        "full_size": 1747,
        "is_size_after_pruning": true,
        "is_row_id_enabled": false,
        "is_cdf_enabled": false,
        "is_deletion_vector_enabled": true
      }
    ],
    "target_table_information": {
      "table_name": "step3",
      "full_size": 3943,
      "is_row_id_enabled": true,
      "is_cdf_enabled": true,
      "is_deletion_vector_enabled": false
    }
  }
}&lt;/LI-CODE&gt;&lt;P&gt;So the problem really seems to be the left join?&lt;/P&gt;</description>
      <pubDate>Thu, 24 Oct 2024 09:04:59 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/dlt-serverless-incremental-refresh-of-materialized-view/m-p/95913#M39188</guid>
      <dc:creator>L1000</dc:creator>
      <dc:date>2024-10-24T09:04:59Z</dc:date>
    </item>
  </channel>
</rss>

