Schema is not enforced when using Auto Loader

sakuraDev
New Contributor II

Hi everyone,

I am currently trying to enforce the following schema:

 

 StructType([
    StructField("site", StringType(), True),
    StructField("meter", StringType(), True),
    StructField("device_time", StringType(), True),
    StructField("data", StructType([
        StructField("energy", StructType([
            StructField("cumulative", StructType([
                StructField("active", StructType([
                    StructField("value", DoubleType(), True),
                    StructField("unit", StringType(), True)
                ]), True),
                StructField("apparent", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), ),
                StructField("reactive", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), )
            ]), True)
        ]), True),
        StructField("power", StructType([
            StructField("instantaneous", StructType([
                StructField("active", StructType([
                    StructField("value", DoubleType(), True),
                    StructField("unit", StringType(), True)
                ]), True),
                StructField("apparent", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), ),
                StructField("reactive", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), )
            ]), True),
            StructField("average", StructType([
                StructField("active", StructType([
                    StructField("value", DoubleType(), True),
                    StructField("unit", StringType(), True)
                ]), True),
                StructField("apparent", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), ),
                StructField("reactive", StructType([
                    StructField("value", DoubleType(), ),
                    StructField("unit", StringType(), )
                ]), )
            ]), True)
        ]), True)
    ]), True)
])

 

I want to do it using Auto Loader:

 

# Streaming read through the "cloudFiles" source with JSON as the file format.
# The schema is supplied explicitly via .schema(...) rather than left to the reader.
# NOTE(review): the sample files in this post are pretty-printed JSON arrays; Spark's
# JSON reader may need the "multiLine" option for records spanning lines — confirm.
dfSilver = (
    spark.readStream.format("cloudFiles")
    .schema(json_schema_silver)
    .option("cloudFiles.format", "json")
    .load("myVolumePathForJsonFilesFromS3")
)

 

but for some reason the schema is not enforced.

sakuraDev_0-1725389159389.png

Here's one example of what I am ingesting as JSON files:

 

[
    {
        "site": "SiteA",
        "meter": "M1",
        "device_time": "2024-08-25T00:00:01.276Z",
        "data": {
            "energy": {
                "cumulative": {
                    "active": {
                        "value": 60000.000000000000,
                        "unit": "kWh"
                    },
                    "apparent": {
                        "value": 61000.000000000000,
                        "unit": "kVAh"
                    },
                    "reactive": {
                        "value": 420.000000000000,
                        "unit": "kVArH"
                    }
                }
            },
            "power": {
                "instantaneous": {
                    "active": {
                        "value": 9100000.000000000000,
                        "unit": "watt"
                    },
                    "apparent": {
                        "value": 9200000.000000000000,
                        "unit": "VA"
                    },
                    "reactive": {
                        "value": 64000.000000000000,
                        "unit": "var"
                    }
                },
                "average": {
                    "active": {
                        "value": 9100000.000000000000,
                        "unit": "watt"
                    },
                    "apparent": {
                        "value": 9200000.000000000000,
                        "unit": "VA"
                    },
                    "reactive": {
                        "value": 64000.000000000000,
                        "unit": "var"
                    }
                }
            }
        }
    },
    {
        "site": "SiteB",
        "meter": "M2",
        "device_time": "2024-08-25T00:30:31.306Z",
        "data": {
            "energy": {
                "cumulative": {
                    "active": {
                        "value": 61000.000000000000,
                        "unit": "kWh"
                    },
                    "apparent": {
                        "value": 62000.000000000000,
                        "unit": "kVAh"
                    },
                    "reactive": {
                        "value": 430.000000000000,
                        "unit": "kVArH"
                    }
                }
            },
            "power": {
                "instantaneous": {
                    "active": {
                        "value": 10200000.000000000000,
                        "unit": "watt"
                    },
                    "apparent": {
                        "value": 10300000.000000000000,
                        "unit": "VA"
                    },
                    "reactive": {
                        "value": 65000.000000000000,
                        "unit": "var"
                    }
                },
                "average": {
                    "active": {
                        "value": 10200000.000000000000,
                        "unit": "watt"
                    },
                    "apparent": {
                        "value": 10300000.000000000000,

By enforcing the schema, I understand it should show each field of the nested objects as its own column in a tabular format (as it does with site, for example), not nested inside data. Am I wrong to expect this?