<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: how to transform json-stat 2 filte to SparkDataFrame ? how to keep order on MapType structure ? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-transform-json-stat-2-filte-to-sparkdataframe-how-to-keep/m-p/69167#M33833</link>
    <description>&lt;P&gt;MapType does not maintain order (json itself too).&lt;BR /&gt;Can you apply the ordering yourself afterwards?&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 16 May 2024 13:45:18 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2024-05-16T13:45:18Z</dc:date>
    <item>
      <title>how to transform json-stat 2 filte to SparkDataFrame ? how to keep order on MapType structure ?</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-transform-json-stat-2-filte-to-sparkdataframe-how-to-keep/m-p/69078#M33799</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P data-unlink="true"&gt;I am using different json files of type &lt;A href="https://json-stat.org/" target="_self"&gt;json-stat2.&lt;/A&gt;&amp;nbsp;&amp;nbsp;These kind of json file is quite common used in&amp;nbsp;national statistisc bureau. Its multi dimensional with multy arrays.&lt;SPAN&gt;&amp;nbsp;Using python environment kan we use pyjstat package to easily&amp;nbsp; transform json to a dataFrame. I tried&amp;nbsp;pyjstat on databricks and it didnt work as expected since after a while cluster kept failing (most probably due to performance bottlenecks).&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P data-unlink="true"&gt;&lt;SPAN&gt;So i tried to create a script that do it self.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;H3&gt;Here is my code:&lt;/H3&gt;&lt;LI-CODE lang="python"&gt;file_path = "/mnt/makro/bronze/json_ssb/13472_20240514.json"
file_path = "/mnt/makro/bronze/json_ssb/13472_20240514.json"

# Define the schema to match the JSON structure
schema = StructType([
    StructField("version", StringType(), True),
    StructField("class", StringType(), True),
    StructField("label", StringType(), True),
    StructField("source", StringType(), True),
    StructField("updated", StringType(), True),
    StructField("note", ArrayType(StringType()), True),
    StructField("role", StructType([
        StructField("time", ArrayType(StringType()), True),
        StructField("geo", ArrayType(StringType()), True),
        StructField("metric", ArrayType(StringType()), True),
    ]), True),
    StructField("id", ArrayType(StringType()), True),
    StructField("size", ArrayType(IntegerType()), True),
    StructField("dimension", StructType([
        StructField("Region", StructType([
            StructField("label", StringType(), True),
            StructField("note", ArrayType(StringType()), True),
            StructField("category", StructType([
                StructField("index", MapType(StringType(), IntegerType()), True),
                StructField("label", MapType(StringType(), StringType()), True),
                StructField("note", MapType(StringType(), ArrayType(StringType())), True),
            ]), True),
            StructField("extension", StructType([
                StructField("elimination", StringType(), True),
                StructField("show", StringType(), True),
            ]), True),
            StructField("link", StructType([
                StructField("describedby", ArrayType(StructType([
                    StructField("extension", StructType([
                        StructField("Region", StringType(), True)
                    ]), True)
                ])), True),
            ]), True),
        ]), True),
        StructField("Sektor", StructType([
            StructField("label", StringType(), True),
            StructField("category", StructType([
                StructField("index", MapType(StringType(), IntegerType()), True),
                StructField("label", MapType(StringType(), StringType()), True)
            ]), True),
            StructField("extension", StructType([
                StructField("elimination", StringType(), True),
                StructField("eliminationValueCode", StringType(), True),
                StructField("show", StringType(), True),
            ]), True),
            StructField("link", StructType([
                StructField("describedby", ArrayType(StructType([
                    StructField("extension", StructType([
                        StructField("Sektor", StringType(), True)
                    ]), True)
                ])), True),
            ]), True),
        ]), True),
        StructField("ContentsCode", StructType([
            StructField("label", StringType(), True),
            StructField("category", StructType([
                StructField("index", MapType(StringType(), IntegerType()), True),
                StructField("label", MapType(StringType(), StringType()), True),
                StructField("unit", MapType(StringType(), StructType([
                    StructField("base", StringType(), True),
                    StructField("decimals", IntegerType(), True)
                ])), True)
            ]), True),
            StructField("extension", StructType([
                StructField("elimination", StringType(), True),
                StructField("refperiod", MapType(StringType(), StringType()), True),
                StructField("show", StringType(), True),
            ]), True),
            StructField("link", StructType([
                StructField("describedby", ArrayType(StructType([
                    StructField("extension", StructType([
                        StructField("ContentsCode", StringType(), True)
                    ]), True)
                ])), True),
            ]), True),
        ]), True),
        StructField("Tid", StructType([
            StructField("label", StringType(), True),
            StructField("category", StructType([
                StructField("index", MapType(StringType(), IntegerType()), True),
                StructField("label", MapType(StringType(), StringType()), True)
            ]), True),
            StructField("extension", StructType([
                StructField("elimination", StringType(), True),
                StructField("show", StringType(), True),
            ]), True),
        ]), True),
    ]), True),
    StructField("extension", StructType([
        StructField("px", StructType([
            StructField("infofile", StringType(), True),
            StructField("tableid", StringType(), True),
            StructField("decimals", IntegerType(), True),
            StructField("official-statistics", StringType(), True),
            StructField("aggregallowed", StringType(), True),
            StructField("language", StringType(), True),
            StructField("matrix", StringType(), True),
            StructField("subject-code", StringType(), True)
        ]), True),
        StructField("contact", ArrayType(StructType([
            StructField("name", StringType(), True),
            StructField("phone", StringType(), True),
            StructField("mail", StringType(), True),
            StructField("raw", StringType(), True)
        ])), True)
    ]), True),
    StructField("value", ArrayType(IntegerType()), True)
])

#read json
df = spark.read.option("multiline", "true").schema(schema).json(file_path)

dimensions = df.select("dimension.*").collect()[0].asDict()
updated= df.select("updated")
source = df.select("source")

def create_dimension_df(dim_name, dim_data):
    categories = dim_data['category']['label'].asDict()
    data = [(idx, k, v) for idx, (k, v) in enumerate(categories.items())]
    # data = [(k, v) for k, v in categories.items()]
    # return spark.createDataFrame(data, schema=['code', dim_name])
    return spark.createDataFrame(data, schema=['index', 'code', dim_name])

region_df = create_dimension_df("Region", dimensions["Region"])
sektor_df = create_dimension_df("sektor", dimensions["Sektor"])
contents_code_df = create_dimension_df("statistikkvariabel", dimensions["ContentsCode"])
time_df = create_dimension_df("år", dimensions["Tid"])
values = df.select("value").collect()[0].value
sizes = df.select("size").collect()[0].size
value_df = spark.createDataFrame([(i, values[i]) for i in range(len(values))], schema=['index', 'value'])

import itertools
indices = list(itertools.product(range(sizes[0]), range(sizes[1]), range(sizes[2]), range(sizes[3])))
index_df = spark.createDataFrame(indices, schema=["Region_index", "sektor_index", "statistikkvariabel_index", "år_index"])
index_df = index_df.withColumn("index", col("Region_index") * sizes[1] * sizes[2] * sizes[3] + col("sektor_index") * sizes[2] * sizes[3] + col("statistikkvariabel_index") * sizes[3] + col("år_index"))
index_df = index_df.join(value_df, on="index").drop("index")

region_df = region_df.withColumnRenamed('index', 'Region_index')
sektor_df = sektor_df.withColumnRenamed('index', 'sektor_index')
contents_code_df = contents_code_df.withColumnRenamed('index', 'statistikkvariabel_index')
time_df = time_df.withColumnRenamed('index', 'år_index')

# Join with dimension DataFrames
final_df = index_df \
    .join(region_df, on="Region_index") \
    .join(sektor_df, on="sektor_index") \
    .join(contents_code_df, on="statistikkvariabel_index") \
    .join(time_df, on="år_index") \
    .select("Region", "sektor", "statistikkvariabel", "år", "value")
&lt;/LI-CODE&gt;&lt;P&gt;this code works but gives wrong data, since original order of MapType structure is not preserved. In this case using this logic i need to maintain order to get correct data. how can i dot it ? is there a better way to&amp;nbsp; transform json-stat 2 filte to SparkDataFrame. I would like to have a code that will adapt to the response we get from api, since column names can vary.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;If you want to try you can send a post request to:&lt;/P&gt;&lt;P&gt;url -&amp;gt;&amp;nbsp;&lt;SPAN&gt;&lt;A href="https://data.ssb.no/api/v0/no/table/13472/" target="_blank"&gt;https://data.ssb.no/api/v0/no/table/13472/&lt;/A&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;body-&amp;gt;&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;{&lt;/SPAN&gt;&lt;SPAN&gt;"query"&lt;/SPAN&gt;&lt;SPAN&gt;:[{&lt;/SPAN&gt;&lt;SPAN&gt;"code"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"Region"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"selection"&lt;/SPAN&gt;&lt;SPAN&gt;:{&lt;/SPAN&gt;&lt;SPAN&gt;"filter"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"agg:KommSummer"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"values"&lt;/SPAN&gt;&lt;SPAN&gt;:[&lt;/SPAN&gt;&lt;SPAN&gt;"K-1875"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5501"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5503"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5510"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5512"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5514"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5516"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5518"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5520"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5522"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5524"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5526"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5528"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5530"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5532"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5534"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5536"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5538"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5540"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5542"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5544"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5546"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5601"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5603"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5605"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5607"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5610"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5612"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5614"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5616"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5618"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5620"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5622"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5624"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5626"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5628"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5630"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5632"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5634"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-5636"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-21-22"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-23"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"K-Rest"&lt;/SPAN&gt;&lt;SPAN&gt;]}},{&lt;/SPAN&gt;&lt;SPAN&gt;"code"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"Sektor"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"selection"&lt;/SPAN&gt;&lt;SPAN&gt;:{&lt;/SPAN&gt;&lt;SPAN&gt;"filter"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"item"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"values"&lt;/SPAN&gt;&lt;SPAN&gt;:[&lt;/SPAN&gt;&lt;SPAN&gt;"ALLE"&lt;/SPAN&gt;&lt;SPAN&gt;]}},{&lt;/SPAN&gt;&lt;SPAN&gt;"code"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"ContentsCode"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"selection"&lt;/SPAN&gt;&lt;SPAN&gt;:{&lt;/SPAN&gt;&lt;SPAN&gt;"filter"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"item"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"values"&lt;/SPAN&gt;&lt;SPAN&gt;:[&lt;/SPAN&gt;&lt;SPAN&gt;"SysselEtterArbste"&lt;/SPAN&gt;&lt;SPAN&gt;]}}],&lt;/SPAN&gt;&lt;SPAN&gt;"response"&lt;/SPAN&gt;&lt;SPAN&gt;:{&lt;/SPAN&gt;&lt;SPAN&gt;"format"&lt;/SPAN&gt;&lt;SPAN&gt;:&lt;/SPAN&gt;&lt;SPAN&gt;"json-stat2"&lt;/SPAN&gt;&lt;SPAN&gt;}}&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;thanks in advance.&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Nuno&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Wed, 15 May 2024 11:27:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-transform-json-stat-2-filte-to-sparkdataframe-how-to-keep/m-p/69078#M33799</guid>
      <dc:creator>NTRT</dc:creator>
      <dc:date>2024-05-15T11:27:43Z</dc:date>
    </item>
    <item>
      <title>Re: how to transform json-stat 2 filte to SparkDataFrame ? how to keep order on MapType structure ?</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-transform-json-stat-2-filte-to-sparkdataframe-how-to-keep/m-p/69167#M33833</link>
      <description>&lt;P&gt;MapType does not maintain order (json itself too).&lt;BR /&gt;Can you apply the ordering yourself afterwards?&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 16 May 2024 13:45:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-transform-json-stat-2-filte-to-sparkdataframe-how-to-keep/m-p/69167#M33833</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2024-05-16T13:45:18Z</dc:date>
    </item>
  </channel>
</rss>

