<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Databricks Mosaic's grid_polyfill() is taking longer to explode the index when run using PySpark in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/databricks-mosaic-s-grid-polyfill-is-taking-longer-to-explode/m-p/51468#M29158</link>
    <description>&lt;P&gt;&lt;STRONG&gt;PySpark Configuration&lt;/STRONG&gt;: pyspark --packages io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-storage-s3-dynamodb:2.4.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" --executor-memory 10g --driver-memory 16g&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;EMR Configuration:&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;m5a.16xlarge master and m5a.4xlarge core fleets scalable up to 120 instances&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Data&lt;/STRONG&gt;: 12000 records, 80 MB in size&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Observations&lt;/STRONG&gt;: exploding 1000 records takes 2 min, and exploding 5000 records takes 33 min&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Pseudo Code&lt;/STRONG&gt;:&lt;/P&gt;&lt;PRE&gt;&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.conf &lt;SPAN class=""&gt;import&lt;/SPAN&gt; SparkConf
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql &lt;SPAN class=""&gt;import&lt;/SPAN&gt; SparkSession
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql.functions &lt;SPAN class=""&gt;import&lt;/SPAN&gt; lit, col, explode, explode_outer
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; mosaic &lt;SPAN class=""&gt;import&lt;/SPAN&gt; enable_mosaic, st_geomfromwkb, st_geomfromwkt, st_aswkt, st_isvalid, grid_polyfill

conf = SparkConf().setAll([(&lt;SPAN class=""&gt;"spark.databricks.labs.mosaic.jar.autoattach"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;'false'&lt;/SPAN&gt;)])

spark=SparkSession.builder.config(conf=conf).getOrCreate()
enable_mosaic(spark)

polygon_df= df.withColumn(&lt;SPAN class=""&gt;"index_array"&lt;/SPAN&gt;, grid_polyfill(col(&lt;SPAN class=""&gt;"GEOMETRY"&lt;/SPAN&gt;), lit(&lt;SPAN class=""&gt;11&lt;/SPAN&gt;)))
exploded_df= polygon_df.withColumn(&lt;SPAN class=""&gt;"index_id"&lt;/SPAN&gt;, explode_outer(col(&lt;SPAN class=""&gt;"index_array"&lt;/SPAN&gt;))).select(&lt;SPAN class=""&gt;"GEOMETRY"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"index_id"&lt;/SPAN&gt;)

exploded_df.write.&lt;SPAN class=""&gt;format&lt;/SPAN&gt;(&lt;SPAN class=""&gt;"delta"&lt;/SPAN&gt;).option(&lt;SPAN class=""&gt;"overwrite"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"true"&lt;/SPAN&gt;).save(&lt;SPAN class=""&gt;"&amp;lt;s3_path&amp;gt;"&lt;/SPAN&gt;)&lt;/PRE&gt;&lt;P&gt;Any suggestions for optimizing the explode function?&lt;/P&gt;</description>
    <pubDate>Tue, 14 Nov 2023 05:05:34 GMT</pubDate>
    <dc:creator>KiranKondamadug</dc:creator>
    <dc:date>2023-11-14T05:05:34Z</dc:date>
    <item>
      <title>Databricks Mosaic's grid_polyfill() is taking longer to explode the index when run using PySpark</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-mosaic-s-grid-polyfill-is-taking-longer-to-explode/m-p/51468#M29158</link>
      <description>&lt;P&gt;&lt;STRONG&gt;PySpark Configuration&lt;/STRONG&gt;: pyspark --packages io.delta:delta-core_2.12:2.4.0,org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-storage-s3-dynamodb:2.4.0 --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" --executor-memory 10g --driver-memory 16g&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;EMR Configuration:&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;m5a.16xlarge master and m5a.4xlarge core fleets scalable up to 120 instances&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Data&lt;/STRONG&gt;: 12000 records, 80 MB in size&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Observations&lt;/STRONG&gt;: exploding 1000 records takes 2 min, and exploding 5000 records takes 33 min&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Pseudo Code&lt;/STRONG&gt;:&lt;/P&gt;&lt;PRE&gt;&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.conf &lt;SPAN class=""&gt;import&lt;/SPAN&gt; SparkConf
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql &lt;SPAN class=""&gt;import&lt;/SPAN&gt; SparkSession
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; pyspark.sql.functions &lt;SPAN class=""&gt;import&lt;/SPAN&gt; lit, col, explode, explode_outer
&lt;SPAN class=""&gt;from&lt;/SPAN&gt; mosaic &lt;SPAN class=""&gt;import&lt;/SPAN&gt; enable_mosaic, st_geomfromwkb, st_geomfromwkt, st_aswkt, st_isvalid, grid_polyfill

conf = SparkConf().setAll([(&lt;SPAN class=""&gt;"spark.databricks.labs.mosaic.jar.autoattach"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;'false'&lt;/SPAN&gt;)])

spark=SparkSession.builder.config(conf=conf).getOrCreate()
enable_mosaic(spark)

polygon_df= df.withColumn(&lt;SPAN class=""&gt;"index_array"&lt;/SPAN&gt;, grid_polyfill(col(&lt;SPAN class=""&gt;"GEOMETRY"&lt;/SPAN&gt;), lit(&lt;SPAN class=""&gt;11&lt;/SPAN&gt;)))
exploded_df= polygon_df.withColumn(&lt;SPAN class=""&gt;"index_id"&lt;/SPAN&gt;, explode_outer(col(&lt;SPAN class=""&gt;"index_array"&lt;/SPAN&gt;))).select(&lt;SPAN class=""&gt;"GEOMETRY"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"index_id"&lt;/SPAN&gt;)

exploded_df.write.&lt;SPAN class=""&gt;format&lt;/SPAN&gt;(&lt;SPAN class=""&gt;"delta"&lt;/SPAN&gt;).option(&lt;SPAN class=""&gt;"overwrite"&lt;/SPAN&gt;, &lt;SPAN class=""&gt;"true"&lt;/SPAN&gt;).save(&lt;SPAN class=""&gt;"&amp;lt;s3_path&amp;gt;"&lt;/SPAN&gt;)&lt;/PRE&gt;&lt;P&gt;Any suggestions for optimizing the explode function?&lt;/P&gt;</description>
      <pubDate>Tue, 14 Nov 2023 05:05:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-mosaic-s-grid-polyfill-is-taking-longer-to-explode/m-p/51468#M29158</guid>
      <dc:creator>KiranKondamadug</dc:creator>
      <dc:date>2023-11-14T05:05:34Z</dc:date>
    </item>
  </channel>
</rss>

