Re: DLT Serverless incremental refresh of material...

L1000 · ‎10-24-2024

I have split the materialized view up into three separate ones:

MV1: deduplicate by grouping:

@dlt.table(name="step1", table_properties={"delta.enableRowTracking": "true"})
def step1():
    """Deduplicate the names in ``source_data`` by grouping on ``Name``.

    Returns:
        A DataFrame with one row per distinct ``Name`` plus the ``count``
        column produced by the group-by aggregation.
    """
    # Only the Name column is needed, so project before grouping.
    names = dlt.read("source_data").select("Name")
    return names.groupBy("Name").count()

MV2:
step2: Use step1 and process data (notable functions: split, slice, sha2, explode -> select expressions)

@dlt.table(name="step2", table_properties={"delta.enableRowTracking": "true"})
def step2():
    """Process the ``Name`` column of step1 (remaining steps are elided in the post)."""
    # dlt.read expects the dataset name as a string, as in step1 and step3;
    # the bare identifier step1 is not a dataset name.
    df = dlt.read("step1").select("Name")
...

MV3:

@dlt.table(name="step3", table_properties={"delta.enableRowTracking": "true"})
def step3():
    """Attach the matching ``Id`` from the ``MetaData`` table to every step2 row.

    Returns:
        A DataFrame with all step2 columns plus ``meta.Id``. The join is a
        left join, so step2 rows without a match in ``MetaData`` are kept.
    """
    # Named step2_df so the local does not shadow the step2 function.
    step2_df = dlt.read("step2").alias("step2")
    meta = spark.table("MetaData").alias("meta")

    # Both key pairs must match for a MetaData row to be attached.
    join_condition = [
        f.col("step2.colA") == f.col("meta.col1"),
        f.col("step2.colB") == f.col("meta.col2"),
    ]

    return step2_df.join(meta, on=join_condition, how="left").select(
        "step2.*", "meta.Id"
    )

For step 3 I now get these incrementalization issues:

{
  "planning_information": {
    "technique_information": [
      {
        "incrementalization_issues": [
          {
            "issue_type": "CDF_UNAVAILABLE",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2",
            }
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_ROW_BASED",
        "incrementalization_issues": [
          {
            "issue_type": "ROW_TRACKING_NOT_ENABLED",
            "prevent_incrementalization": true,
            "table_information": {
              "table_name": "step2",
            }
          },
          {
            "issue_type": "PLAN_NOT_INCREMENTALIZABLE",
            "prevent_incrementalization": true,
            "operator_name": "Join",
            "join_type": "LEFT_OUTER"
          }
        ]
      },
      {
        "maintenance_type": "MAINTENANCE_TYPE_COMPLETE_RECOMPUTE",
        "is_chosen": true,
        "is_applicable": true,
        "cost": 78952
      }
    ],
    "source_table_information": [
      {
        "table_name": "step2",
        "full_size": 3791,
        "is_size_after_pruning": true,
        "is_row_id_enabled": true,
        "is_cdf_enabled": true,
        "is_deletion_vector_enabled": false
      },
      {
        "table_name": "meta",
        "full_size": 1747,
        "is_size_after_pruning": true,
        "is_row_id_enabled": false,
        "is_cdf_enabled": false,
        "is_deletion_vector_enabled": true
      }
    ],
    "target_table_information": {
      "table_name": "step3",
      "full_size": 3943,
      "is_row_id_enabled": true,
      "is_cdf_enabled": true,
      "is_deletion_vector_enabled": false
    }
  }
}

For step1 and step2 I got these messages:
Step1 has been planned in DLT to be executed as GROUP_AGGREGATE. (-> incremental?)
Step2 has been planned in DLT to be executed as COMPLETE_RECOMPUTE. Another option is available:GROUP_AGGREGATE. COMPLETE_RECOMPUTE was chosen in the current run for its optimal performance.