Hi there, I'd like to follow up on a prior post:
Basically I'm wondering what's the best way to set *both* dataPoint and values to string via schemaHints?
For example, although I've heard that this feature isn't supported currently, it would be great if I could dynamically set schemaHints as follows:
- schemaHints => 'elementData.element.data.*.dataPoint string, elementData.element.data.*.values string'
The reason is that `NESTED_DATA_NAME_X` is a dynamic key I cannot reliably enumerate (there may be a few, or a hundred, and they may change over time), while the nested `dataPoint` and `values` fields are consistent across all of them.
Below is a simpler code snippet than the one in my last post (it provides `json_text` inline, but in practice I would be applying schemaHints via read_files or Auto Loader):
# Sample payload illustrating the schema problem: under
# elementData[*].data, the keys (NESTED_DATA_NAME_1, NESTED_DATA_NAME_2, ...)
# are dynamic, but each value consistently contains a `dataPoint` object and
# a `values` array — the two fields the schemaHints should target.
json_text = """
{
"packageHeader": {
"transactionId": "5c3661ac-abdd-480c-88b6-c3c128bce7bd",
"rootName": "rootName",
"endpointName": "report",
"elementType": "Generator",
"elementName": "rootName",
"intervalSize": 5,
"environment": "production",
"startDate": "2025-07-10T04:00:00",
"endDate": "2025-07-10T04:10:00"
},
"elementData": [
{
"elementName": "elementName1",
"elementIdentifier": "ae5f1a94-33b4-4001-b926-499dc0425bf1",
"elementDefinitionIdentifier": 2,
"metaData": {},
"data": {}
},
{
"elementName": "elementName2",
"elementIdentifier": "c647ffb1-b8ba-4b34-8b45-8590fed273ef",
"elementDefinitionIdentifier": 2,
"metaData": {},
"data": {
"NESTED_DATA_NAME_1": {
"dataPoint": {
"name": "NESTED_DATA_NAME_1",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "Decimal",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": 0.975200119018556
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": 0.21553290049235
}
]
},
"NESTED_DATA_NAME_2": {
"dataPoint": {
"name": "NESTED_DATA_NAME_2",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "String",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": "ON"
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": "ON"
}
]
},
"NESTED_DATA_NAME_3": {
"dataPoint": {
"name": "NESTED_DATA_NAME_3",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "Decimal",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": 0
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": 0
}
]
},
"NESTED_DATA_NAME_5": {
"dataPoint": {
"name": "NESTED_DATA_NAME_5",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "Decimal",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": 0
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": 0
}
]
},
"NESTED_DATA_NAME_6": {
"dataPoint": {
"name": "NESTED_DATA_NAME_6",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "String",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": "Off"
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": "Off"
}
]
},
"NESTED_DATA_NAME_7": {
"dataPoint": {
"name": "NESTED_DATA_NAME_7",
"friendlyName": "friendlyName",
"keyName": "keyName",
"dataType": "Decimal",
"sequence": null
},
"values": [
{
"intervalLocal": "2025-07-10T04:05:00-05:00",
"value": 0.00011502826237119734
},
{
"intervalLocal": "2025-07-10T04:10:00-05:00",
"value": 0.00011502826237119734
}
]
}
}
}
]
}
"""
# Parallelize the whole document as a SINGLE RDD element: spark.read.json
# parses each RDD element as one complete JSON document. The previous
# json_text.split(', ') shredded the document at every ", " occurrence,
# yielding invalid JSON fragments that spark.read.json would dump into
# _corrupt_record instead of inferring the intended schema.
rdd = spark.sparkContext.parallelize([json_text])
df = spark.read.json(rdd)
df.printSchema()