<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: How to optimize jobs performance in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7555#M3388</link>
    <description>&lt;P&gt;Hi Lakshay,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for replying.  One thing I noticed is in the job description in "Spark UI",  each job with the below code takes an average of 15 minutes.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;"save at StoreTransform.scala"&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Not sure whether it is a custom code or a Databricks code.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Rajesh.&lt;/P&gt;</description>
    <pubDate>Fri, 17 Mar 2023 14:47:19 GMT</pubDate>
    <dc:creator>RajeshRK</dc:creator>
    <dc:date>2023-03-17T14:47:19Z</dc:date>
    <item>
      <title>How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7553#M3386</link>
      <description>&lt;P&gt;Hi Team,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We have a complex ETL job running in databricks for 6 hours.  The cluster has the below configuration: &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Minworkers: 16&lt;/P&gt;&lt;P&gt;Maxworkers: 24&lt;/P&gt;&lt;P&gt;Worker and Driver Node Type: Standard_DS14_v2. (16 cores, 128 GB RAM)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I have monitored the job progress in Spark UI for an hour, and my observations are  below: &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;- The jobs are progressing and not stuck for a long time.&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;- The workers nodes scaled up to 24 (max_workers configured)&lt;/P&gt;&lt;P&gt;- Shuffling (Read/Write) happens with a large amount of data.&amp;nbsp; (I Ran this job with spark.sql.shuffle.partitions 4000)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;We are expecting the jobs should be completed within 4 hours.    Any suggestions, please, to optimize the performance of the job? &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Rajesh.&lt;/P&gt;</description>
      <pubDate>Fri, 17 Mar 2023 11:29:09 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7553#M3386</guid>
      <dc:creator>RajeshRK</dc:creator>
      <dc:date>2023-03-17T11:29:09Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7554#M3387</link>
      <description>&lt;P&gt;Hi @Rajesh Kannan R​&amp;nbsp;, Can you check the Spark UI for the spark job where the job is spending most of the time. Also, look for any failed spark jobs in Spark UI.&lt;/P&gt;</description>
      <pubDate>Fri, 17 Mar 2023 14:35:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7554#M3387</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-03-17T14:35:33Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7555#M3388</link>
      <description>&lt;P&gt;Hi Lakshay,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for replying.  One thing I noticed is in the job description in "Spark UI",  each job with the below code takes an average of 15 minutes.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;"save at StoreTransform.scala"&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Not sure whether it is a custom code or a Databricks code.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Rajesh.&lt;/P&gt;</description>
      <pubDate>Fri, 17 Mar 2023 14:47:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7555#M3388</guid>
      <dc:creator>RajeshRK</dc:creator>
      <dc:date>2023-03-17T14:47:19Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7556#M3389</link>
      <description>&lt;P&gt;Hi @Rajesh Kannan R​&amp;nbsp;, It looks like a custom code. Could you please share a task-level screenshot of one of these stages? &lt;/P&gt;</description>
      <pubDate>Fri, 17 Mar 2023 14:55:14 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7556#M3389</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-03-17T14:55:14Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7557#M3390</link>
      <description>&lt;P&gt;Hi Lakshay,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Unfortunately, I haven't captured it.   I will share if I run the job next time.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Rajesh.&lt;/P&gt;</description>
      <pubDate>Fri, 17 Mar 2023 15:03:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7557#M3390</guid>
      <dc:creator>RajeshRK</dc:creator>
      <dc:date>2023-03-17T15:03:25Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7558#M3391</link>
      <description>&lt;P&gt;Sure. You can also try the below suggestions:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;Use compute-optimized Node Type. Currently, you are using a Memory-optimized one.&lt;/LI&gt;&lt;LI&gt;Run the job with spark.sql.shuffle.partitions auto&lt;/LI&gt;&lt;/OL&gt;</description>
      <pubDate>Fri, 17 Mar 2023 19:06:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7558#M3391</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-03-17T19:06:13Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7559#M3392</link>
      <description>&lt;P&gt;Hi @Rajesh Kannan R​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for your question! To assist you better, please take a moment to review the answer and let me know if it best fits your needs.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Please help us select the best solution by clicking on "Select As Best" if it does.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Your feedback will help us ensure that we are providing the best possible service to you. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 18 Mar 2023 06:08:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7559#M3392</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-03-18T06:08:46Z</dc:date>
    </item>
    <item>
      <title>Re: How to optimize jobs performance</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7560#M3393</link>
      <description>&lt;P&gt;@Lakshay Goel​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Hi Lakshay,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;It takes a couple of days to test this recommendation.   I will try the job execution with new recommendations and update this thread.  &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Rajesh.&lt;/P&gt;</description>
      <pubDate>Tue, 21 Mar 2023 06:54:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-optimize-jobs-performance/m-p/7560#M3393</guid>
      <dc:creator>RajeshRK</dc:creator>
      <dc:date>2023-03-21T06:54:37Z</dc:date>
    </item>
  </channel>
</rss>

