Hi everyone,
I am currently trying to enforce the following schema:
def _quantity():
    """Build the leaf struct used by every measurement: a double `value` plus a string `unit`."""
    return StructType([
        StructField("value", DoubleType(), True),
        StructField("unit", StringType(), True),
    ])


def _measurement_group():
    """Build a struct with one `_quantity()` field each for active, apparent and reactive."""
    return StructType([
        StructField(kind, _quantity(), True)
        for kind in ("active", "apparent", "reactive")
    ])


# Bound to the name that the streaming reader hands to .schema(...).
# Every field is nullable. The original spelled this either as an explicit
# True or as an omitted argument; StructField's `nullable` defaults to True,
# so writing it out everywhere changes nothing but removes the ambiguity.
json_schema_silver = StructType([
    StructField("site", StringType(), True),
    StructField("meter", StringType(), True),
    # Declared as a string, exactly as before; no timestamp parsing happens here.
    StructField("device_time", StringType(), True),
    StructField("data", StructType([
        StructField("energy", StructType([
            StructField("cumulative", _measurement_group(), True),
        ]), True),
        StructField("power", StructType([
            StructField("instantaneous", _measurement_group(), True),
            StructField("average", _measurement_group(), True),
        ]), True),
    ]), True),
])
I want to do it using Auto Loader:
# Auto Loader stream over the JSON files, read with the explicit schema
# `json_schema_silver` instead of schema inference.
#
# Note: supplying a schema only fixes column names and types. Nested structs
# stay nested (everything under `data` remains a single struct column); to get
# one column per leaf, select by dotted path afterwards, e.g.
#   dfSilver.select("site", "data.energy.cumulative.active.value")
dfSilver = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    # The sample input is a JSON array pretty-printed over many lines. The JSON
    # reader defaults to one record per physical line, so such files need
    # multiLine enabled to be parsed as whole documents.
    # NOTE(review): drop this option if the real files are newline-delimited JSON.
    .option("multiLine", "true")
    .schema(json_schema_silver)
    .load("myVolumePathForJsonFilesFromS3")
)
but for some reason the schema does not appear to be enforced.
Here's one example of the JSON files I am ingesting:
[
{
"site": "SiteA",
"meter": "M1",
"device_time": "2024-08-25T00:00:01.276Z",
"data": {
"energy": {
"cumulative": {
"active": {
"value": 60000.000000000000,
"unit": "kWh"
},
"apparent": {
"value": 61000.000000000000,
"unit": "kVAh"
},
"reactive": {
"value": 420.000000000000,
"unit": "kVArH"
}
}
},
"power": {
"instantaneous": {
"active": {
"value": 9100000.000000000000,
"unit": "watt"
},
"apparent": {
"value": 9200000.000000000000,
"unit": "VA"
},
"reactive": {
"value": 64000.000000000000,
"unit": "var"
}
},
"average": {
"active": {
"value": 9100000.000000000000,
"unit": "watt"
},
"apparent": {
"value": 9200000.000000000000,
"unit": "VA"
},
"reactive": {
"value": 64000.000000000000,
"unit": "var"
}
}
}
}
},
{
"site": "SiteB",
"meter": "M2",
"device_time": "2024-08-25T00:30:31.306Z",
"data": {
"energy": {
"cumulative": {
"active": {
"value": 61000.000000000000,
"unit": "kWh"
},
"apparent": {
"value": 62000.000000000000,
"unit": "kVAh"
},
"reactive": {
"value": 430.000000000000,
"unit": "kVArH"
}
}
},
"power": {
"instantaneous": {
"active": {
"value": 10200000.000000000000,
"unit": "watt"
},
"apparent": {
"value": 10300000.000000000000,
"unit": "VA"
},
"reactive": {
"value": 65000.000000000000,
"unit": "var"
}
},
"average": {
"active": {
"value": 10200000.000000000000,
"unit": "watt"
},
"apparent": {
"value": 10300000.000000000000,
By enforcing the schema, I understand it should show each field of the nested objects as its own column in a tabular format (as it does with site, for example) rather than nested inside data. Am I wrong to expect this?