<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Predicting compute required to run Spark jobs in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/69764#M33911</link>
    <description>&lt;P&gt;I'm working on a project to predict compute (cores) required to run Spark jobs. Has anyone worked on this or something similar before? How did you get started?&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Sun, 19 May 2024 08:25:10 GMT</pubDate>
    <dc:creator>kseyser</dc:creator>
    <dc:date>2024-05-19T08:25:10Z</dc:date>
    <item>
      <title>Predicting compute required to run Spark jobs</title>
      <link>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/69764#M33911</link>
      <description>&lt;P&gt;I'm working on a project to predict compute (cores) required to run Spark jobs. Has anyone worked on this or something similar before? How did you get started?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sun, 19 May 2024 08:25:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/69764#M33911</guid>
      <dc:creator>kseyser</dc:creator>
      <dc:date>2024-05-19T08:25:10Z</dc:date>
    </item>
    <item>
      <title>Re: Predicting compute required to run Spark jobs</title>
      <link>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/69924#M33925</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/105458"&gt;@kseyser&lt;/a&gt;&amp;nbsp;good day,&lt;/P&gt;
&lt;P&gt;This documentation might help you in your use-case:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/compute/cluster-config-best-practices.html#compute-sizing-considerations" target="_blank"&gt;https://docs.databricks.com/en/compute/cluster-config-best-practices.html#compute-sizing-considerations&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;Kind regards,&lt;/P&gt;
&lt;P&gt;Yesh&lt;/P&gt;</description>
      <pubDate>Mon, 20 May 2024 07:47:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/69924#M33925</guid>
      <dc:creator>Yeshwanth</dc:creator>
      <dc:date>2024-05-20T07:47:43Z</dc:date>
    </item>
    <item>
      <title>Re: Predicting compute required to run Spark jobs</title>
      <link>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/70170#M34026</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/32523"&gt;@Yeshwanth&lt;/a&gt;, thank you for directing me to the documentation. I don't know much about computations, so I'm still figuring things out. So is there a straightforward (standard) way to calculate the compute (no. of cores &amp;amp; memory) required to run Spark jobs based on the data volume of the job, frequency of the jobs, and number of jobs? I read that the data is generally partitioned into 128 MB partitions and the executor memory is divided into 300 MB reserved memory, 60% execution memory, and 40% storage memory. How would this help me calculate the compute for a data size of, say, 1.5 TB?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 21 May 2024 22:38:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/predicting-compute-required-to-run-spark-jobs/m-p/70170#M34026</guid>
      <dc:creator>kseyser</dc:creator>
      <dc:date>2024-05-21T22:38:20Z</dc:date>
    </item>
  </channel>
</rss>

