With this optimized approach, I would suggest creating a view for the Clean layer instead of a physical table:
1. Bronze Table: Raw CDC data (full storage)
2. Clean View: No physical storage - computed on-demand
3. Silver Table: Final processed data with SCD2 history
Result: only two physical copies instead of three, roughly a one-third storage reduction (for example, 100 GB of raw data stored as ~200 GB instead of ~300 GB)!
Sample code:
import dlt
from pyspark.sql.functions import col
# Bronze: Raw streaming data
@dlt.table
def customer_cdc_bronze():
    # Incrementally ingest raw CDC files with Auto Loader
    return spark.readStream.format("cloudFiles") \
        .option("cloudFiles.format", "json") \
        .load("/path/to/cdc/files/")
# Clean: view only, no physical storage
@dlt.view
def customer_cdc_clean():
    # Drop rows without a key and keep only the columns needed downstream
    return dlt.read_stream("customer_cdc_bronze") \
        .filter(col("customer_id").isNotNull()) \
        .select(
            col("customer_id"),
            col("customer_name").alias("name"),
            col("sequence_number")
        )
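# As an alternative to the .filter() above, DLT expectations can drop
# invalid rows while also recording data-quality metrics in the pipeline
# UI, e.g. (rule name here is just illustrative):
#   @dlt.expect_or_drop("valid_customer_id", "customer_id IS NOT NULL")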
# Silver: final processed table with SCD2 history
dlt.create_streaming_table("customer_silver")
dlt.apply_changes(
    target="customer_silver",
    source="customer_cdc_clean",  # reads from the view, not a stored table
    keys=["customer_id"],
    sequence_by=col("sequence_number"),
    stored_as_scd_type="2"
)
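Once the pipeline runs, apply_changes with SCD type 2 adds __START_AT and __END_AT columns to customer_silver; the current version of each key has a null __END_AT. A minimal query sketch for pulling current records (run outside the pipeline, assuming your session's default catalog/schema points at the pipeline target):

# Current (open) SCD2 records only; historical versions have __END_AT set
current_customers = spark.read.table("customer_silver") \
    .filter(col("__END_AT").isNull())
current_customers.show()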