<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Spark dataframe performing poorly in Warehousing &amp; Analytics</title>
    <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91937#M1593</link>
    <description>&lt;P&gt;But the issue is coming up while writing the df, sure broadcast might optimize the join, but how will it enhance the data write operation?&lt;/P&gt;</description>
    <pubDate>Thu, 26 Sep 2024 16:31:09 GMT</pubDate>
    <dc:creator>qwerty3</dc:creator>
    <dc:date>2024-09-26T16:31:09Z</dc:date>
    <item>
      <title>Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91459#M1566</link>
      <description>&lt;P&gt;I have huge datasets, transformation, display, print, show are working well on this data when read in a pandas dataframe. But the same dataframe when converted to a spark dataframe, is taking minutes to display even a single row and hours to write the data in a delta table.&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 15:08:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91459#M1566</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-23T15:08:26Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91462#M1567</link>
      <description>&lt;P&gt;Can you please share the code snippet?&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 15:39:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91462#M1567</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-23T15:39:09Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91467#M1568</link>
      <description>&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;df.write.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"delta"&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;saveAsTable&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"test_db.test_spark_df"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;This took 3 mins to get completed with 5 rows and 4 columns&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Actual datasets are not even getting written&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Mon, 23 Sep 2024 16:01:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91467#M1568</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-23T16:01:42Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91468#M1569</link>
      <description>&lt;DIV&gt;&lt;SPAN&gt;df.write.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"delta"&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;saveAsTable&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"test_db.test_spark_df"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;This took 3 mins to get completed with 5 rows and 4 columns&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;Actual datasets are not even getting written&lt;/SPAN&gt;&lt;/DIV&gt;</description>
      <pubDate>Mon, 23 Sep 2024 16:02:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91468#M1569</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-23T16:02:19Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91480#M1570</link>
      <description>&lt;P&gt;3 mins to write 5 rows is no good.&lt;/P&gt;
&lt;P&gt;Are you running this on a shared cluster with so many other jobs? Will it be possible to test this on a personal cluster to isolate the issue?&lt;/P&gt;
&lt;P&gt;try displaying the data frame in one cell display(df) and save the data frame in another cell.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 17:53:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91480#M1570</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-23T17:53:35Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91561#M1572</link>
      <description>&lt;P&gt;The cluster that I was using to execute this was not performing any other tasks, although the azure quota for this cluster family cpu was 83% at the time, I created a new cluster belonging to a family which had all the cores available, there spark is working well. But even at 83% utilization, should that cluster (the one used earlier, with high memory) perform so poorly?&lt;/P&gt;</description>
      <pubDate>Tue, 24 Sep 2024 11:16:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91561#M1572</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-24T11:16:02Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91566#M1573</link>
      <description>&lt;P&gt;It's good to hear it worked on the new cluster family.&lt;/P&gt;
&lt;P&gt;If the quota is already at 83%, the number of nodes your cluster needs is important. If Azure is not able to provision that many resources, it could result in suboptimal performance.&lt;BR /&gt;&lt;BR /&gt;To find this out, please reduce the number of nodes so your cluster can start the job and complete it.&lt;/P&gt;</description>
      <pubDate>Tue, 24 Sep 2024 12:10:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91566#M1573</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-24T12:10:18Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91842#M1578</link>
      <description>&lt;P&gt;Earlier I was using EA family of clusters which were memory optimized, now when I shifted to general purpose compute, the same data is getting written in seconds. Is it that the EA family of memory optimized clusters are not very performant for spark operations?&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 09:40:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91842#M1578</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T09:40:07Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91867#M1580</link>
      <description>&lt;P&gt;For processing 5 rows, EA vs. Non-EA doesn't matter.&lt;/P&gt;
&lt;P&gt;As you mentioned before, it could be non availability of the cluster in the quota.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 10:31:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91867#M1580</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T10:31:38Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91904#M1581</link>
      <description>&lt;P&gt;But even with General Purpose compute (256 GB memory, 64 cores, 8 max worker nodes, working solely on one task, i.e. one notebook) I am not able to write one dataframe as delta table, it contains geospatial data and must have data in lakhs&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:05:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91904#M1581</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T15:05:40Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91917#M1583</link>
      <description>&lt;P&gt;It could be skew, your partitioning, anything.&lt;/P&gt;
&lt;P&gt;Without looking at the script, and knowing the schema, number of rows, and output of Spark UI, it's hard to say what is wrong.&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:39:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91917#M1583</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T15:39:41Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91921#M1586</link>
      <description>&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="qwerty3_0-1727365368492.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/11529i65DE7D17DFAD121C/image-size/medium?v=v2&amp;amp;px=400" role="button" title="qwerty3_0-1727365368492.png" alt="qwerty3_0-1727365368492.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:43:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91921#M1586</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T15:43:02Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91923#M1587</link>
      <description>&lt;P&gt;&lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt; count() is just the action.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;What are the transformations you are doing in the data frame? How many columns, how many rows you are anticipating approximately?&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:47:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91923#M1587</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T15:47:14Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91927#M1589</link>
      <description>&lt;P&gt;I want to write that data in a table, but it always gets stuck, it has 12 columns, the task was getting stuck, that is why I wanted to see count of data&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:52:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91927#M1589</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T15:52:21Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91929#M1590</link>
      <description>&lt;P&gt;One last time, please share the entire script of the data frame so I can see how I can help.&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 15:55:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91929#M1590</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T15:55:46Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91930#M1591</link>
      <description>&lt;DIV&gt;&lt;SPAN&gt;df = spark.sql("SELECT df1.*, df2.*&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; &amp;nbsp; FROM df1&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; &amp;nbsp; JOIN df2&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; &amp;nbsp; ON ST_Intersects(df1.geometry, df2.geometry)")&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;df.write.&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"delta"&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;mode&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"append"&lt;/SPAN&gt;&lt;SPAN&gt;).&lt;/SPAN&gt;&lt;SPAN&gt;saveAsTable&lt;/SPAN&gt;&lt;SPAN&gt;(table_name&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&amp;nbsp;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;df1 has 1500 rows, df2 has 90 lakh rows.&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Thu, 26 Sep 2024 16:02:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91930#M1591</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T16:02:01Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91933#M1592</link>
      <description>&lt;P&gt;df1 has 1500 rows and df2 has 9 million rows, try broadcast join on df1&lt;BR /&gt;&lt;BR /&gt;df= spark.sql("""SELECT /*+ BROADCAST(df1) */&lt;BR /&gt;df1.*,&amp;nbsp;df2.*&amp;nbsp;FROM df1&amp;nbsp;JOIN df2&amp;nbsp; ON ST_Intersects(df1.geometry, df2.geometry) """)&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 16:12:39 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91933#M1592</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T16:12:39Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91937#M1593</link>
      <description>&lt;P&gt;But the issue is coming up while writing the df, sure broadcast might optimize the join, but how will it enhance the data write operation?&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 16:31:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91937#M1593</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T16:31:09Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91939#M1594</link>
      <description>&lt;P&gt;The spark concept is Lazy Transformation. This means all Transformations will be processed when an Action is invoked. In your case, the SQL JOIN is the transformation, and Write is the Action.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;So try the Broadcast join and check the result.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 16:46:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91939#M1594</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-26T16:46:40Z</dc:date>
    </item>
    <item>
      <title>Re: Spark dataframe performing poorly</title>
      <link>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91955#M1595</link>
      <description>&lt;P&gt;Didn't work, still taking 10 minutes to write the df, which is a very long time, considering I have 5000 such chunks to process&lt;/P&gt;</description>
      <pubDate>Thu, 26 Sep 2024 18:50:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/warehousing-analytics/spark-dataframe-performing-poorly/m-p/91955#M1595</guid>
      <dc:creator>qwerty3</dc:creator>
      <dc:date>2024-09-26T18:50:18Z</dc:date>
    </item>
  </channel>
</rss>

