<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: OOM while loading a lot of data through JDBC in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49446#M28558</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt; any idea?&lt;/P&gt;</description>
    <pubDate>Wed, 18 Oct 2023 07:12:33 GMT</pubDate>
    <dc:creator>-werners-</dc:creator>
    <dc:date>2023-10-18T07:12:33Z</dc:date>
    <item>
      <title>OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49096#M28476</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="java"&gt;    
    public void bigDataTest() throws Exception {
        int rowsCount = 100_000;
        int colSize = 1024;
        int colCount = 12;

        

        String colValue = "'"+"x".repeat(colSize)+"'";
        String query = "select explode(sequence(1, "+rowsCount+"))," +
                String.join(",", Collections.nCopies(colCount, colValue));

        try (                
                Connection conn = dataSource.getConnection()
        ) {
            PreparedStatement ps = conn.prepareStatement(query, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
            ps.setFetchSize(1);
            ResultSet rs = ps.executeQuery();

            int count = 0;
            while(rs.next()) {
                if(count++ % 100 == 0) {
                    LOG.info("Count = {}", count);
                }
            }
        }
    }&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;With -Xmx200m I can read about 50_000 rows and after that I receive "Exception in thread "pool-12-thread-50" Exception in thread "pool-12-thread-1" java.lang.OutOfMemoryError: Java heap space&lt;BR /&gt;java.lang.OutOfMemoryError: Java heap space"&lt;/P&gt;&lt;P&gt;The memory profile is a classic OOM pattern:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2023-10-13 at 08.10.08.png" style="width: 490px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/4402i361A8F48796BB5BD/image-size/large/is-moderation-mode/true?v=v2&amp;amp;px=999" role="button" title="Screenshot 2023-10-13 at 08.10.08.png" alt="Screenshot 2023-10-13 at 08.10.08.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;What I can see in the heap dump:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2023-10-13 at 08.12.52.png" style="width: 700px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/4403i773B7500F03036EE/image-size/large/is-moderation-mode/true?v=v2&amp;amp;px=999" role="button" title="Screenshot 2023-10-13 at 08.12.52.png" alt="Screenshot 2023-10-13 at 08.12.52.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 13 Oct 2023 08:31:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49096#M28476</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-13T08:31:38Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49097#M28477</link>
      <description>&lt;PRE&gt;&lt;SPAN&gt;DATABRICKS_JDBC_URL &lt;/SPAN&gt;= &lt;SPAN&gt;"jdbc:databricks://xxx.cloud.databricks.com:443/default;" &lt;/SPAN&gt;+&lt;BR /&gt;        &lt;SPAN&gt;"transportMode=http;" &lt;/SPAN&gt;+&lt;BR /&gt;        &lt;SPAN&gt;"ssl=1;" &lt;/SPAN&gt;+&lt;BR /&gt;        &lt;SPAN&gt;"httpPath=sql/protocolv1/o/xxxxx;AuthMech=3;MaxConsecutiveResultFileDownloadRetries=50;fetchsize=1"&lt;/SPAN&gt;&lt;/PRE&gt;&lt;P&gt;Without a custom&amp;nbsp;&lt;SPAN&gt;MaxConsecutiveResultFileDownloadRetries I received a 500638 JDBC error and could read only about 20_000 rows&lt;/SPAN&gt;&lt;/P&gt;&lt;PRE&gt;databricksDriver = &lt;SPAN&gt;"com.databricks:databricks-jdbc:2.6.33"&lt;/SPAN&gt;&lt;/PRE&gt;</description>
      <pubDate>Fri, 13 Oct 2023 08:30:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49097#M28477</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-13T08:30:21Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49108#M28478</link>
      <description>&lt;P&gt;I'd first ingest the raw data onto a data lake (using some ingest tool, databricks is not the best for this imo), then process the data using databricks.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Oct 2023 09:28:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49108#M28478</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-10-13T09:28:11Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49122#M28479</link>
      <description>&lt;P&gt;Perhaps for some use cases this will be the solution.&lt;/P&gt;&lt;P&gt;But it does not change the fact that there is a memory leak bug in the driver.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Oct 2023 10:13:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49122#M28479</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-13T10:13:42Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49131#M28485</link>
      <description>&lt;P&gt;not necessarily a memory leak.&amp;nbsp; possibly the raw data is fetched and the query is processed in memory.&amp;nbsp; don't know if that is the case though.&lt;/P&gt;</description>
      <pubDate>Fri, 13 Oct 2023 12:10:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49131#M28485</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-10-13T12:10:47Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49247#M28497</link>
      <description>&lt;P&gt;Ok, let's call it a temporary minor memory starvation issue causing the virtual machine to crash.&lt;/P&gt;</description>
      <pubDate>Mon, 16 Oct 2023 07:28:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49247#M28497</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-16T07:28:54Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49248#M28498</link>
      <description>&lt;P&gt;And here's another extremely minor issue leading to uncontrolled reproduction of threads. &lt;A href="https://community.databricks.com/t5/data-engineering/thread-leakage-when-connection-cannot-be-established/td-p/39435" target="_blank"&gt;https://community.databricks.com/t5/data-engineering/thread-leakage-when-connection-cannot-be-established/td-p/39435&lt;/A&gt;&lt;BR /&gt;For some reason nobody responds to it either....&lt;/P&gt;</description>
      <pubDate>Mon, 16 Oct 2023 07:31:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49248#M28498</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-16T07:31:37Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49261#M28506</link>
      <description>&lt;P&gt;That is, at least I think, because the jdbc driver is not part of the databricks platform itself (and closed source afaik).&lt;BR /&gt;Chances are small that someone in the community knows the ins and outs of the driver code.&lt;BR /&gt;Now, if you are convinced that there is an actual bug in the databricks driver, I suggest you open a ticket at databricks so someone can look into it.&lt;BR /&gt;Because maybe you stumbled upon something here.&lt;/P&gt;</description>
      <pubDate>Mon, 16 Oct 2023 09:19:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49261#M28506</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-10-16T09:19:56Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49380#M28541</link>
      <description>&lt;P&gt;I solved this issue, but it requires changing several classes. The final result:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2023-10-17 at 09.13.52.png" style="width: 711px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/4476i5C4CF6DB4C4AE5EB/image-size/large/is-moderation-mode/true?v=v2&amp;amp;px=999" role="button" title="Screenshot 2023-10-17 at 09.13.52.png" alt="Screenshot 2023-10-17 at 09.13.52.png" /&gt;&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 17 Oct 2023 13:05:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49380#M28541</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-17T13:05:20Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49439#M28556</link>
      <description>&lt;P&gt;Nice!&lt;BR /&gt;You might wanna share your improvements with the driver devs.&lt;/P&gt;</description>
      <pubDate>Wed, 18 Oct 2023 05:36:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49439#M28556</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-10-18T05:36:03Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49442#M28557</link>
      <description>&lt;P&gt;Yes, I really want to, but I have absolutely no idea how to send these edits to them.&lt;/P&gt;&lt;P&gt;They do not have a public repository or public ticket system.&lt;/P&gt;</description>
      <pubDate>Wed, 18 Oct 2023 06:35:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49442#M28557</guid>
      <dc:creator>krocodl</dc:creator>
      <dc:date>2023-10-18T06:35:21Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49446#M28558</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt; any idea?&lt;/P&gt;</description>
      <pubDate>Wed, 18 Oct 2023 07:12:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/49446#M28558</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-10-18T07:12:33Z</dc:date>
    </item>
    <item>
      <title>Re: OOM while loading a lot of data through JDBC</title>
      <link>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/105645#M42224</link>
      <description>&lt;P&gt;Hi, I am from databricks eng and we have had the driver developer look into this, and they could not reproduce it. A couple of things to note:&lt;/P&gt;&lt;P&gt;1. 2.6.33 is a pretty old driver that does not have Cloud Fetch support.&lt;/P&gt;&lt;P&gt;2. Nowadays, later versions have Cloud Fetch enabled by default. The shape of the client/server interactions has changed significantly in the later versions. Can you try the new versions?&lt;/P&gt;&lt;P&gt;3. If you do not mind, would you share your client-side fix, so I can pass it on to the driver developers to take a look and see whether it is still relevant to include as an improvement in the later versions?&lt;/P&gt;&lt;P&gt;Thanks for your patience and support!&lt;/P&gt;&lt;P&gt;eng-partner-eco-help@databricks.com&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 14 Jan 2025 20:08:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/oom-while-loading-a-lot-of-data-through-jdbc/m-p/105645#M42224</guid>
      <dc:creator>yunbodeng</dc:creator>
      <dc:date>2025-01-14T20:08:33Z</dc:date>
    </item>
  </channel>
</rss>

