<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103030#M41300</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/15469"&gt;@RiyazAliM&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I just want to decrease the job time, so that's why I was using repartition, batch size, and num partitions, but it did not work. Can you please suggest the correct code? These are my worker node details:&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;1-2 Workers&lt;/SPAN&gt;&lt;SPAN class=""&gt;&lt;SPAN&gt;32-64&amp;nbsp;GB Memory&lt;/SPAN&gt;&lt;SPAN&gt;4-8&amp;nbsp;Cores&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;1 Driver&lt;/SPAN&gt;&lt;SPAN class=""&gt;32&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;GB Memory,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;4&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Cores&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;Runtime&lt;/SPAN&gt;&lt;SPAN class=""&gt;14.3.x-scala2.12&lt;/SPAN&gt;&lt;/DIV&gt;</description>
    <pubDate>Mon, 23 Dec 2024 14:04:32 GMT</pubDate>
    <dc:creator>vijaypodili</dc:creator>
    <dc:date>2024-12-23T14:04:32Z</dc:date>
    <item>
      <title>databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102875#M41261</link>
      <description>&lt;PRE&gt;df_CorpBond= spark.read.&lt;SPAN class=""&gt;format(&lt;SPAN class=""&gt;"parquet").option(&lt;SPAN class=""&gt;"header", &lt;SPAN class=""&gt;"true").load(&lt;SPAN class=""&gt;f"/mnt/&lt;SPAN class=""&gt;{container_name}/raw_data/dsl.corporate.parquet")
df_CorpBond.repartition(20).write\
    .&lt;SPAN class=""&gt;format(&lt;SPAN class=""&gt;"jdbc")\
    .option(&lt;SPAN class=""&gt;"url", url_connector)\
    .option(&lt;SPAN class=""&gt;"dbtable", &lt;SPAN class=""&gt;"MarkIt_CorpBonds")\
    .option(&lt;SPAN class=""&gt;"user", user)\
    .option(&lt;SPAN class=""&gt;"password", pwd)\
    .option(&lt;SPAN class=""&gt;"driver", &lt;SPAN class=""&gt;"com.microsoft.sqlserver.jdbc.SQLServerDriver")\
    .option(&lt;SPAN class=""&gt;"numPartitions", &lt;SPAN class=""&gt;100)\
    .option(&lt;SPAN class=""&gt;"batchsize", &lt;SPAN class=""&gt;100000)\
    .mode(&lt;SPAN class=""&gt;"overwrite")\
    .save()&lt;BR /&gt;&lt;BR /&gt;this&amp;nbsp;is&amp;nbsp;my&amp;nbsp;code&amp;nbsp;to&amp;nbsp;load&amp;nbsp;2.3&amp;nbsp;gb&amp;nbsp;blob&amp;nbsp;data&amp;nbsp;into&amp;nbsp;ssms&amp;nbsp;table&amp;nbsp;job&amp;nbsp;will&amp;nbsp;take&amp;nbsp;more&amp;nbsp;than&amp;nbsp;2&amp;nbsp;hours&amp;nbsp;my&amp;nbsp;cluster&amp;nbsp;size&amp;nbsp;is&amp;nbsp;94gb&amp;nbsp;and&amp;nbsp;have&amp;nbsp;1&amp;nbsp;driver &amp;nbsp;node&amp;nbsp;and&amp;nbsp;2 worker node how we can optimize the code &amp;nbsp;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/PRE&gt;</description>
      <pubDate>Sat, 21 Dec 2024 06:35:17 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102875#M41261</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-21T06:35:17Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102892#M41263</link>
      <description>&lt;P&gt;Is this in comparison with other job runs, or in comparison with an all-purpose cluster?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Sat, 21 Dec 2024 14:16:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102892#M41263</guid>
      <dc:creator>Walter_C</dc:creator>
      <dc:date>2024-12-21T14:16:41Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102895#M41264</link>
      <description>&lt;P&gt;I'm trying to say my cluster has enough storage space (94 GB), so it can easily handle 2.3 GB of data, but my job is taking a long time&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="vijaypodili_0-1734790771188.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/13645iF49F7BD9992B68E7/image-size/medium?v=v2&amp;amp;px=400" role="button" title="vijaypodili_0-1734790771188.png" alt="vijaypodili_0-1734790771188.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Jobs 2&amp;nbsp; and 3 completed within 3 min&amp;nbsp;&lt;/P&gt;&lt;P&gt;but job 4 is taking longer to complete its tasks&lt;/P&gt;</description>
      <pubDate>Sat, 21 Dec 2024 14:21:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102895#M41264</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-21T14:21:04Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102900#M41265</link>
      <description>&lt;P&gt;I'm trying to say my cluster has enough storage space (94 GB), so it can easily handle 2.3 GB of data, but my job is taking a long time&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="vijaypodili_0-1734792013095.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/13647i543D2F3A2F8AB5A3/image-size/medium?v=v2&amp;amp;px=400" role="button" title="vijaypodili_0-1734792013095.png" alt="vijaypodili_0-1734792013095.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Jobs 2&amp;nbsp; and 3 completed within 3 min&amp;nbsp;&lt;/P&gt;&lt;P&gt;but job 4 is taking longer to complete its tasks&lt;/P&gt;</description>
      <pubDate>Sat, 21 Dec 2024 14:41:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102900#M41265</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-21T14:41:12Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102953#M41281</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/115094"&gt;@vijaypodili&lt;/a&gt;, I'm wondering why you repartitioned the df to 20 and then set the num partitions to 100. Also, I see that your cluster has 94 gigs, but how many cores does your cluster have?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 23 Dec 2024 06:03:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/102953#M41281</guid>
      <dc:creator>RiyazAliM</dc:creator>
      <dc:date>2024-12-23T06:03:44Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103030#M41300</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/15469"&gt;@RiyazAliM&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I just want to decrease the job time, so that's why I was using repartition, batch size, and num partitions, but it did not work. Can you please suggest the correct code? These are my worker node details:&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;1-2 Workers&lt;/SPAN&gt;&lt;SPAN class=""&gt;&lt;SPAN&gt;32-64&amp;nbsp;GB Memory&lt;/SPAN&gt;&lt;SPAN&gt;4-8&amp;nbsp;Cores&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;1 Driver&lt;/SPAN&gt;&lt;SPAN class=""&gt;32&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;GB Memory,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;4&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;Cores&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;Runtime&lt;/SPAN&gt;&lt;SPAN class=""&gt;14.3.x-scala2.12&lt;/SPAN&gt;&lt;/DIV&gt;</description>
      <pubDate>Mon, 23 Dec 2024 14:04:32 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103030#M41300</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-23T14:04:32Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103093#M41329</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/115094"&gt;@vijaypodili&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Ideally, given you have 8 cores (for 2 workers), repartition/numPartition should be a multiple of 8 (no. of cores).&lt;/P&gt;&lt;P&gt;The concern here is that I don't see any transformation in the code snippet shared that could potentially trigger a long-running job. I strongly believe writing it to the table in SSMS is what is taking longer in this case.&amp;nbsp;&lt;/P&gt;&lt;P&gt;For the job that's taking time to execute, would you be able to share the DAG from the Spark UI?&lt;/P&gt;&lt;P&gt;Also, check the below &lt;A href="https://stackoverflow.com/questions/55708079/spark-optimise-writing-a-dataframe-to-sql-server" target="_self"&gt;StackOverFlow&lt;/A&gt;, where folks suggest using various connectors to improve write performance.&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 07:11:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103093#M41329</guid>
      <dc:creator>RiyazAliM</dc:creator>
      <dc:date>2024-12-24T07:11:43Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103115#M41333</link>
      <description>&lt;P&gt;I removed the numPartitions, batch size, and repartition as well; the job takes almost 3 hrs to write data into SSMS tables&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 24 Dec 2024 10:30:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103115#M41333</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-24T10:30:51Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103222#M41362</link>
      <description>&lt;P&gt;Instead of removing them, try tweaking the num partitions, repartition and shuffle partitions to see if it increases the write speed. The Spark UI &amp;amp; DAG flow will tell us exactly what the execution plan is, and we can see what's taking time to load the table to SSMS.&lt;/P&gt;</description>
      <pubDate>Thu, 26 Dec 2024 09:22:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103222#M41362</guid>
      <dc:creator>RiyazAliM</dc:creator>
      <dc:date>2024-12-26T09:22:19Z</dc:date>
    </item>
    <item>
      <title>Re: databricks job taking longer time to load 2.3 gb data from bolb to ssms table</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103503#M41460</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/15469"&gt;@RiyazAliM&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;This is my DAG diagram&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="vijaypodili_0-1735540926974.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/13766i5064AAC158AD1DF1/image-size/medium?v=v2&amp;amp;px=400" role="button" title="vijaypodili_0-1735540926974.png" alt="vijaypodili_0-1735540926974.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="vijaypodili_1-1735540954147.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/13767i5B72C305821E01D1/image-size/medium?v=v2&amp;amp;px=400" role="button" title="vijaypodili_1-1735540954147.png" alt="vijaypodili_1-1735540954147.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;The file size is 3.5 GB, and in the future we need to load 14 GB as well&lt;/P&gt;</description>
      <pubDate>Mon, 30 Dec 2024 06:43:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-job-taking-longer-time-to-load-2-3-gb-data-from-bolb/m-p/103503#M41460</guid>
      <dc:creator>vijaypodili</dc:creator>
      <dc:date>2024-12-30T06:43:55Z</dc:date>
    </item>
  </channel>
</rss>

