<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Java heap issue, GC allocation failure while writing data from mysql to adls in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91035#M38065</link>
    <description>&lt;P&gt;Multiple things.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;First very obvious thing: "5000000" - &lt;SPAN class=""&gt;It's not surprising that you run OOM when loading such a huge amount of records.&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;The DBR has a built-in &lt;A href="https://docs.databricks.com/en/connect/external-systems/mysql.html#using-the-mysql-connector-in-databricks-runtime" target="_self"&gt;MySQL driver&lt;/A&gt;, use it instead&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;Use &lt;A href="https://docs.databricks.com/en/connect/external-systems/jdbc.html#control-number-of-rows-fetched-per-query" target="_self"&gt;fetchSize&lt;/A&gt; to control the number of records. Start e.g. with 1000 and measure the performance. Adapt it if needed&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;If the source table has a partition column, use it as &lt;A href="https://docs.databricks.com/en/connect/external-systems/jdbc.html#control-parallelism-for-jdbc-queries" target="_self"&gt;described here&lt;/A&gt;.&lt;/SPAN&gt;&lt;/LI&gt;&lt;/OL&gt;</description>
    <pubDate>Thu, 19 Sep 2024 09:36:47 GMT</pubDate>
    <dc:creator>Witold</dc:creator>
    <dc:date>2024-09-19T09:36:47Z</dc:date>
    <item>
      <title>Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90970#M38045</link>
      <description>&lt;P&gt;Hi Team,&lt;/P&gt;&lt;P&gt;I am reading 60-80 million rows from a MySQL server and writing them to ADLS in Parquet format, but I am getting Java heap issues, GC allocation failures and out-of-memory errors.&lt;/P&gt;&lt;P&gt;Below is my cluster configuration:&amp;nbsp;&lt;/P&gt;&lt;P&gt;Driver - 56GB Ram, 16 core&lt;/P&gt;&lt;P&gt;Worker - 56GB Ram, 16 core&lt;/P&gt;&lt;P&gt;Autoscaling enabled with min 4 workers to max 8 workers&amp;nbsp;&lt;/P&gt;&lt;P&gt;Could you please help to resolve the issue?&lt;/P&gt;&lt;P&gt;After reading the data from the MySQL server&amp;nbsp;&lt;/P&gt;&lt;P&gt;df.count() gives me the result, but df.write fails with the above-mentioned error.&lt;/P&gt;&lt;P&gt;I have tried df.repartition() with values from 128 to 1024 but no luck; I also tried salting, but it did not work for df.write.parquet.&lt;/P&gt;</description>
      <pubDate>Wed, 18 Sep 2024 20:52:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90970#M38045</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-18T20:52:23Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90971#M38046</link>
      <description>&lt;P&gt;team help me to resolve the problem&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 18 Sep 2024 20:54:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90971#M38046</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-18T20:54:58Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90974#M38048</link>
      <description>&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="IMG_4903.jpeg" style="width: 1179px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/11331i1690F522646101A4/image-size/medium?v=v2&amp;amp;px=400" role="button" title="IMG_4903.jpeg" alt="IMG_4903.jpeg" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;every time I am seeing data behaviour like this&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 18 Sep 2024 21:33:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/90974#M38048</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-18T21:33:55Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91004#M38059</link>
      <description>&lt;P&gt;How do you read and write the records? Which cluster size do you use?&lt;/P&gt;</description>
      <pubDate>Thu, 19 Sep 2024 08:17:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91004#M38059</guid>
      <dc:creator>Witold</dc:creator>
      <dc:date>2024-09-19T08:17:28Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91026#M38063</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;below is the code&lt;/P&gt;&lt;P&gt;driver = "com.mysql.cj.jdbc.Driver"&lt;BR /&gt;database_host = "ip address"&lt;BR /&gt;database_port = "3306"&lt;BR /&gt;database_name = "database"&lt;BR /&gt;table = "table"&lt;BR /&gt;user = "user"&lt;BR /&gt;password = "0password"&lt;BR /&gt;url = f"jdbc:mysql://{database_host}:{database_port}/{database_name}?zeroDateTimeBehavior=CONVERT_TO_NULL"&lt;BR /&gt;remote_table1 =&lt;BR /&gt;spark.read&lt;BR /&gt;.format("jdbc")&lt;BR /&gt;.option("driver", driver)&lt;BR /&gt;.option("url", url)&lt;BR /&gt;.option("query", "select * from database.table where deleted =0")&lt;BR /&gt;.option("user", user)&lt;BR /&gt;.option("password", password)&lt;BR /&gt;.option("maxRowsInMemory",&lt;BR /&gt;5000000)&lt;/P&gt;&lt;P&gt;remote_table1.write.format("parquet").partitionBy("name").save("/mt/sm/process/replica/datalake/myuday/datalake/2024/09/19")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;cluster config-&lt;/P&gt;&lt;P&gt;worker type - Standard_DS13_v2(56 ram 8 cores) min worker 8 max 12&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;driver type - Standard_DS13_v2(56 ram 8 cores) min worker 8 max 12&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 19 Sep 2024 09:18:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91026#M38063</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-19T09:18:46Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91035#M38065</link>
      <description>&lt;P&gt;Multiple things.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;First very obvious thing: "5000000" - &lt;SPAN class=""&gt;It's not surprising that you run OOM when loading such a huge amount of records.&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;The DBR has a built-in &lt;A href="https://docs.databricks.com/en/connect/external-systems/mysql.html#using-the-mysql-connector-in-databricks-runtime" target="_self"&gt;MySQL driver&lt;/A&gt;, use it instead&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;Use &lt;A href="https://docs.databricks.com/en/connect/external-systems/jdbc.html#control-number-of-rows-fetched-per-query" target="_self"&gt;fetchSize&lt;/A&gt; to control the number of records. Start e.g. with 1000 and measure the performance. Adapt it if needed&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;If the source table has a partition column, use it as &lt;A href="https://docs.databricks.com/en/connect/external-systems/jdbc.html#control-parallelism-for-jdbc-queries" target="_self"&gt;described here&lt;/A&gt;.&lt;/SPAN&gt;&lt;/LI&gt;&lt;/OL&gt;</description>
      <pubDate>Thu, 19 Sep 2024 09:36:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91035#M38065</guid>
      <dc:creator>Witold</dc:creator>
      <dc:date>2024-09-19T09:36:47Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91038#M38068</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;thanks for the reply&lt;/P&gt;&lt;P&gt;let me try the approaches which you mentioned and see the performance. will update you shortly.&lt;/P&gt;</description>
      <pubDate>Thu, 19 Sep 2024 09:59:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91038#M38068</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-19T09:59:19Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91047#M38071</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/107959"&gt;@Witold&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;After trying&amp;nbsp;&lt;BR /&gt;table = (spark.read&lt;BR /&gt;.format("jdbc")&lt;BR /&gt;.option("url", "&amp;lt;jdbc-url&amp;gt;")&lt;BR /&gt;.option("dbtable", "&amp;lt;table-name&amp;gt;")&lt;BR /&gt;.option("user", "&amp;lt;username&amp;gt;")&lt;BR /&gt;.option("password", "&amp;lt;password&amp;gt;")&lt;BR /&gt;.option("fetchSize", "1000") -- to 50000&lt;BR /&gt;.load()&lt;BR /&gt;)&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;job is taking lots of time&amp;nbsp; and not even reading 10 million records.&lt;BR /&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Capture.PNG" style="width: 886px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/11345iF02F077B000F3863/image-size/large?v=v2&amp;amp;px=999" role="button" title="Capture.PNG" alt="Capture.PNG" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 19 Sep 2024 10:53:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91047#M38071</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-19T10:53:55Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91074#M38073</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/107959"&gt;@Witold&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;Now i am able to read the data but one issue i am seeing is that out of 8 executor 3 are getting success in just 2-3 sec and rest 5 are running why this behavior.&lt;/P&gt;&lt;P&gt;below is the code&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;remote_table1 &lt;/SPAN&gt;&lt;SPAN&gt;=&lt;/SPAN&gt;&lt;SPAN&gt; (spark.read&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;format&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"jdbc"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"url"&lt;/SPAN&gt;&lt;SPAN&gt;, url)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"dbtable"&lt;/SPAN&gt;&lt;SPAN&gt;,&lt;/SPAN&gt;&lt;SPAN&gt;"(select * from db.table) temp"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"user"&lt;/SPAN&gt;&lt;SPAN&gt;, user)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"password"&lt;/SPAN&gt;&lt;SPAN&gt;, password)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"fetchSize"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"50000"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"lowerBound"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"1"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; 
.&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"upperBound"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"12"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"partitionColumn"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;"month"&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;option&lt;/SPAN&gt;&lt;SPAN&gt;(&lt;/SPAN&gt;&lt;SPAN&gt;"numPartitions"&lt;/SPAN&gt;&lt;SPAN&gt;, &lt;/SPAN&gt;&lt;SPAN&gt;8&lt;/SPAN&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;&amp;nbsp; .&lt;/SPAN&gt;&lt;SPAN&gt;load&lt;/SPAN&gt;&lt;SPAN&gt;()&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV&gt;&lt;SPAN&gt;also please find snapshot of execution&amp;nbsp;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Capture2.PNG" style="width: 999px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/11347i4587381FD2E7F522/image-size/large?v=v2&amp;amp;px=999" role="button" title="Capture2.PNG" alt="Capture2.PNG" /&gt;&lt;/span&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Thu, 19 Sep 2024 14:03:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91074#M38073</guid>
      <dc:creator>sshukla</dc:creator>
      <dc:date>2024-09-19T14:03:37Z</dc:date>
    </item>
    <item>
      <title>Re: Java heap issue, GC allocation failure while writing data from mysql to adls</title>
      <link>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91078#M38074</link>
      <description>&lt;P&gt;Hello good man&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 19 Sep 2024 14:18:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/java-heap-issue-gc-allocation-failure-while-writing-data-from/m-p/91078#M38074</guid>
      <dc:creator>shaza606</dc:creator>
      <dc:date>2024-09-19T14:18:06Z</dc:date>
    </item>
  </channel>
</rss>

