from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
from pyspark.sql import DataFrame, functions as F
def _address(street_name, street_number, zip_code):
    # Build the nested address payload matching the schema's struct fields.
    return {
        "street_name": street_name,
        "street_number": street_number,
        "zip_code": zip_code,
    }


# Sample purchase rows: (purchase_date, customer_id, amount, category, city,
# address). Deliberately includes edge cases — an exact duplicate row, None in
# category/city/amount, and "empty" address structs.
test_data_1 = [
    ('2023-01-10', 123, 50.0, 'bikes', 'LA', _address("Left Street", 1, "8070")),
    ('2023-01-31', 123, 150.0, None, 'LA', _address("North Street", 1, "1234")),
    ('2023-01-10', 321, 500.0, 'pans', 'NY', _address("Dark Street", 2, "1234")),
    ('2023-01-10', 321, 500.0, 'pans', 'NY', _address("Dark Street", 2, "1234")),
    ('2023-01-10', 123, 5000.0, 'cars', 'LA', _address("", None, "")),
    ('2023-02-28', 213, 300.0, 'spoons', None, _address("", None, "")),
    ('2023-03-10', 321, 50000.0, 'cars', 'NY', _address("", None, "")),
    ('2023-03-31', 213, None, 'cars', 'SF', _address("", None, "")),
    ('2023-04-30', 432, None, 'plates', 'SF', _address("", None, "")),
]
# Nested schema for the address struct column.
_address_schema = StructType([
    StructField("street_name", StringType(), True),
    StructField("street_number", IntegerType(), True),
    StructField("zip_code", StringType(), True),
])

# Explicit schema for test_data_1. All columns are nullable; purchase_date is
# kept as a plain string rather than DateType, matching the string literals in
# the test rows.
test_data_schema_1 = StructType([
    StructField("purchase_date", StringType(), True),
    StructField("customer_id", IntegerType(), True),
    StructField("amount", FloatType(), True),
    StructField("category", StringType(), True),
    StructField("city", StringType(), True),
    StructField("address", _address_schema, True),
])
# Materialize the fixture rows into a DataFrame using the explicit schema
# (dict values in the rows populate the nested address struct).
# NOTE(review): relies on a pre-existing SparkSession bound to `spark` —
# e.g. a notebook/spark-shell environment; it is not created in this file.
df = spark.createDataFrame(test_data_1, test_data_schema_1)