Creating a view first and then a table, as you suggested, still produces the same result: the data in the table is overwritten (rather than appended) with each run of the pipeline. Here's a simple code example that I used:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import datetime
import dlt

# Initialize Spark session
spark = SparkSession.builder.appName("Data Ingestion").getOrCreate()
# Function to generate sample data
def generate_data():
    data = [
        (1, "A"),
        (2, "B"),
        (3, "C")
    ]
    df = spark.createDataFrame(data, ["id", "value"])
    df = df.withColumn("timestamp", lit(datetime.datetime.now()))
    return df
# Define DLT view and table
@dlt.view(
    name="example_view"
)
def create_example_view():
    return generate_data()
# Define the Delta Live Table
@dlt.table(
    name="example_table"
)
def create_example_table():
    # Read the DLT view defined above and pass its rows through to the table
    df = dlt.read("example_view")
    return df
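
For comparison, my understanding is that a table defined with @dlt.table over a batch DataFrame is a materialized table that DLT recomputes in full on every update, which would explain the overwrite behavior. Appending, as far as I can tell, requires the table to be fed from a streaming source instead. Here's a rough sketch of what I mean; the "rate" source is just a placeholder I chose for illustration, not part of my real pipeline:

# Sketch: a streaming table, which DLT appends to on each update
@dlt.table(
    name="example_streaming_table"
)
def create_example_streaming_table():
    # A streaming read makes this a streaming table, so new rows are appended
    # rather than the whole table being recomputed. The "rate" source simply
    # emits incrementing values and stands in for a real incremental source.
    return (
        spark.readStream.format("rate")
        .option("rowsPerSecond", 1)
        .load()
    )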