<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Best practices for optimizing Spark jobs in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/92417#M4374</link>
    <description>&lt;P&gt;There are so many.&lt;BR /&gt;Here are a few:&lt;BR /&gt;- look for data skew&lt;BR /&gt;- shuffle as little as possible&lt;BR /&gt;- avoid many small files&lt;BR /&gt;- use Spark and not only pure Python&lt;BR /&gt;- if using an autoscale cluster: check that you don't lose a lot of time scaling up/down&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 01 Oct 2024 10:12:07 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2024-10-01T10:12:07Z</dc:date>
    <item>
      <title>Best practices for optimizing Spark jobs</title>
      <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/92387#M4372</link>
      <description>&lt;P&gt;What are some best practices for optimizing Spark jobs in Databricks, especially when dealing with large datasets? Any tips or resources would be greatly appreciated! I’m trying to analyze data on &lt;A href="https://ambermenu.com.ph/" target="_blank" rel="noopener"&gt;restaurant menu prices&lt;/A&gt; so that insights would be especially helpful!&lt;/P&gt;</description>
      <pubDate>Tue, 01 Oct 2024 06:46:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/92387#M4372</guid>
      <dc:creator>chris0991</dc:creator>
      <dc:date>2024-10-01T06:46:50Z</dc:date>
    </item>
    <item>
      <title>Re: Best practices for optimizing Spark jobs</title>
      <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/92417#M4374</link>
      <description>&lt;P&gt;There are so many.&lt;BR /&gt;Here are a few:&lt;BR /&gt;- look for data skew&lt;BR /&gt;- shuffle as little as possible&lt;BR /&gt;- avoid many small files&lt;BR /&gt;- use Spark and not only pure Python&lt;BR /&gt;- if using an autoscale cluster: check that you don't lose a lot of time scaling up/down&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 01 Oct 2024 10:12:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/92417#M4374</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2024-10-01T10:12:07Z</dc:date>
    </item>
    <item>
      <title>Re: Best practices for optimizing Spark jobs</title>
      <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/125258#M10371</link>
      <description>&lt;P&gt;Good one&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/175186"&gt;@john34567&lt;/a&gt;&amp;nbsp;, made me chuckle, but this is still spam &lt;span class="lia-unicode-emoji" title=":grinning_face_with_smiling_eyes:"&gt;😄&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 15 Jul 2025 09:12:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/125258#M10371</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-07-15T09:12:13Z</dc:date>
    </item>
    <item>
      <title>Re: Best practices for optimizing Spark jobs</title>
      <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/139859#M11043</link>
      <description>&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;P&gt;Optimizing Spark jobs is all about using smart data strategies like minimizing shuffles, tuning partitions, caching only what truly matters, and choosing the right file format to keep workloads efficient and cost-effective, and it reminds me of how planning ahead works just like checking the &lt;A href="http://wetherspoonmenuprices.co.uk/children/" target="_self"&gt;Wetherspoons kids menu&lt;/A&gt; before ordering so everything runs smoother, faster, and without unnecessary delays.&lt;/P&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Thu, 20 Nov 2025 23:33:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/139859#M11043</guid>
      <dc:creator>Nohashah</dc:creator>
      <dc:date>2025-11-20T23:33:47Z</dc:date>
    </item>
    <item>
      <title>Re: Best practices for optimizing Spark jobs</title>
      <link>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/139892#M11046</link>
      <description>&lt;P&gt;In addition to the cool comments above, try to use clusters with VMs enabled for &lt;A href="https://docs.databricks.com/aws/en/optimizations/disk-cache" target="_self"&gt;disk caching&lt;/A&gt; as well. This caches data at the Parquet file level in VM local storage, acting as a great complement to Spark caching.&lt;/P&gt;</description>
      <pubDate>Fri, 21 Nov 2025 08:56:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/best-practices-for-optimizing-spark-jobs/m-p/139892#M11046</guid>
      <dc:creator>Coffee77</dc:creator>
      <dc:date>2025-11-21T08:56:15Z</dc:date>
    </item>
  </channel>
</rss>

