<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Internal errors when running SQLs in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119357#M45851</link>
    <description>&lt;P&gt;We are running Databricks on GCP with a classic SQL warehouse. It's on the current version (&lt;SPAN&gt;v 2025.15&lt;/SPAN&gt;).&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;We have a pipeline that runs DBT on top of the SQL warehouse.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Since the 9th of May, our queries have been failing intermittently with internal errors from Databricks that look like this. We were getting these kinds of issues before, but they were one-off. But now they are hampering our production pipeline.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;How can this issue be fixed?&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thank you in advance for the help.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 4.51.49 pm.png" style="width: 960px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16879i14C9404809A49E33/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 4.51.49 pm.png" alt="Screenshot 2025-05-15 at 4.51.49 pm.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 5.23.57 pm.png" style="width: 704px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16881i6D5CA56A9FF05FDC/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 5.23.57 pm.png" alt="Screenshot 2025-05-15 at 5.23.57 pm.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 5.24.12 pm.png" style="width: 950px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16880iC4920CAFA0FB965D/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 5.24.12 pm.png" alt="Screenshot 2025-05-15 at 5.24.12 pm.png" /&gt;&lt;/span&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 15 May 2025 15:29:54 GMT</pubDate>
    <dc:creator>utkarshamone</dc:creator>
    <dc:date>2025-05-15T15:29:54Z</dc:date>
    <item>
      <title>Internal errors when running SQLs</title>
      <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119357#M45851</link>
      <description>&lt;P&gt;We are running Databricks on GCP with a classic SQL warehouse. It's on the current version (&lt;SPAN&gt;v 2025.15&lt;/SPAN&gt;).&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;We have a pipeline that runs DBT on top of the SQL warehouse.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Since the 9th of May, our queries have been failing intermittently with internal errors from Databricks that look like this. We were getting these kinds of issues before, but they were one-off. But now they are hampering our production pipeline.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;How can this issue be fixed?&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Thank you in advance for the help.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 4.51.49 pm.png" style="width: 960px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16879i14C9404809A49E33/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 4.51.49 pm.png" alt="Screenshot 2025-05-15 at 4.51.49 pm.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 5.23.57 pm.png" style="width: 704px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16881i6D5CA56A9FF05FDC/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 5.23.57 pm.png" alt="Screenshot 2025-05-15 at 5.23.57 pm.png" /&gt;&lt;/span&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="Screenshot 2025-05-15 at 5.24.12 pm.png" style="width: 950px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/16880iC4920CAFA0FB965D/image-size/large?v=v2&amp;amp;px=999" role="button" title="Screenshot 2025-05-15 at 5.24.12 pm.png" alt="Screenshot 2025-05-15 at 5.24.12 pm.png" /&gt;&lt;/span&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 15 May 2025 15:29:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119357#M45851</guid>
      <dc:creator>utkarshamone</dc:creator>
      <dc:date>2025-05-15T15:29:54Z</dc:date>
    </item>
    <item>
      <title>Re: Internal errors when running SQLs</title>
      <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119377#M45858</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/154000"&gt;@utkarshamone&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The error messages you've shared—such as:&lt;/P&gt;&lt;P&gt;-- [INTERNAL_ERROR] Query could not be scheduled: HTTP Response code: 503&lt;BR /&gt;-- ExecutorLostFailure ... exited with code 134, sigabrt&lt;BR /&gt;-- Internal error&lt;/P&gt;&lt;P&gt;—indicate that your Databricks SQL warehouse on GCP (v2025.15) is encountering intermittent internal issues likely tied to:&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Root Causes&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;1. Databricks Platform Instability or Bugs (Post May 9 Update)&lt;/STRONG&gt;&lt;BR /&gt;-- Since you're observing a change in behavior after May 9, it's possible the recent version or backend updates introduced bugs or instability.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;2. SQL Warehouse Resource Exhaustion or Scheduling Delay&lt;/STRONG&gt;&lt;BR /&gt;-- Code 503 is often due to temporary overload or service unavailability.&lt;BR /&gt;-- The sigabrt + ExecutorLostFailure may be from exceeding memory limits or a critical failure in executor management.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;3. Concurrency and Load Patterns&lt;/STRONG&gt;&lt;BR /&gt;If your DBT runs or other jobs were scaled up or changed recently, they might be exceeding the SQL&lt;BR /&gt;warehouse's concurrency or memory capacity.&lt;/P&gt;&lt;P&gt;&lt;BR /&gt;&lt;STRONG&gt;Recommended Actions&lt;/STRONG&gt;&lt;BR /&gt;&lt;STRONG&gt;1. Switch to Pro SQL Warehouse (If Not Already)&lt;/STRONG&gt;&lt;BR /&gt;-- Classic SQL Warehouses are more prone to instability.&lt;BR /&gt;-- Pro or Serverless SQL Warehouses offer auto-scaling, better fault tolerance, and enhanced scheduling.&lt;BR /&gt;&lt;STRONG&gt;2. Enable Query Retry in DBT&lt;/STRONG&gt;&lt;BR /&gt;-- Add automatic retry logic around DBT SQL models using macros or a retry decorator for flaky jobs.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;3. 
Increase Warehouse Size / Concurrency Slots&lt;/STRONG&gt;&lt;BR /&gt;If you're seeing resource contention, increase the SQL warehouse size to provide more memory and better scheduling.&lt;BR /&gt;&lt;STRONG&gt;4. Check DBT Query Footprint&lt;/STRONG&gt;&lt;BR /&gt;DESCRIBE HISTORY &amp;lt;table&amp;gt;;&lt;/P&gt;&lt;P&gt;or query system.query_log to investigate any long-running or memory-intensive queries introduced recently.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;5. Open a Databricks Support Ticket&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 15 May 2025 18:01:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119377#M45858</guid>
      <dc:creator>lingareddy_Alva</dc:creator>
      <dc:date>2025-05-15T18:01:12Z</dc:date>
    </item>
    <item>
      <title>Re: Internal errors when running SQLs</title>
      <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119533#M45900</link>
      <description>&lt;P class=""&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/154000"&gt;@utkarshamone&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P class=""&gt;We faced a similar issue and I wanted to share our findings, which might help clarify what’s going on.&lt;/P&gt;&lt;P class=""&gt;We’re using a &lt;SPAN class=""&gt;&lt;STRONG&gt;Classic SQL Warehouse&lt;/STRONG&gt;&lt;/SPAN&gt;&amp;nbsp;size L (v2025.15), and executing a &lt;SPAN class=""&gt;&lt;STRONG&gt;dbt pipeline&lt;/STRONG&gt;&lt;/SPAN&gt; on top of it.&lt;/P&gt;&lt;P class=""&gt;Our dbt jobs started to &lt;SPAN class=""&gt;&lt;STRONG&gt;fail&lt;/STRONG&gt;&lt;/SPAN&gt;&amp;nbsp;with &lt;SPAN class=""&gt;&lt;STRONG&gt;internal Databricks errors&amp;nbsp;&lt;/STRONG&gt;&lt;/SPAN&gt;and are affecting our &lt;SPAN class=""&gt;&lt;STRONG&gt;production pipeline too&lt;/STRONG&gt;&lt;/SPAN&gt;.&lt;BR /&gt;&lt;BR /&gt;&lt;STRONG&gt;Then I checked the pipeline in depth and saw the following in the query profile and Spark UI&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Classic Warehouse (FAILED)&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;Execution details:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;Fixed 256 shuffle partitions&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;Fails in: PhotonUnionShuffleExchangeSink&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;Peak memory total ≈ 91.9 GiB&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;0 rows output&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;Multiple executors exited with code &lt;STRONG&gt;134 (SIGABRT)&lt;/STRONG&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Spill = 0 bytes&lt;/STRONG&gt; (crashes before spilling)&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Dead executors&lt;/STRONG&gt;, hundreds of failed tasks&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Off‑heap memory peak&lt;/STRONG&gt; = 7–8 GiB before crash&lt;/LI&gt;&lt;LI&gt;Input: &lt;STRONG&gt;213 GiB read, 671 M rows&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Task time in Photon = 18 
%&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;EM&gt;My analysis:&lt;/EM&gt; Photon may underestimate memory requirements during the union shuffle. One partition becomes too large (“elephant”), exceeds executor memory, malloc fails, and triggers SIGABRT.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;SPAN class=""&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Serverless Warehouse (SUCCEEDED)&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;Execution details:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;AQE enabled&lt;/STRONG&gt;, partitions dynamically adjusted (~2,000 early, coalesced later)&lt;/LI&gt;&lt;LI&gt;Sort operators: &lt;STRONG&gt;52 GiB / 46 GiB&lt;/STRONG&gt; total&lt;/LI&gt;&lt;LI&gt;ShuffleExchange: &lt;STRONG&gt;Peak memory = 18 GiB&lt;/STRONG&gt;, &lt;STRONG&gt;Peak per-task ≈ 280 MiB&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;No executor losses&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Spill = 0 bytes&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Failed Tasks = 0&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;Runtime: &lt;STRONG&gt;1 min 46 s&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Task time in Photon = 99 %&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;EM&gt;My analysis:&lt;/EM&gt; AQE + newer Photon version effectively balances partitions and avoids memory hotspots.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;We reported this to Databricks support. They confirmed:&lt;BR /&gt;&lt;BR /&gt;&lt;EM&gt;"Engineering identified the root cause and has prepared a fix.&lt;/EM&gt;&lt;BR /&gt;&lt;EM&gt;&lt;SPAN class=""&gt;It will be included in the &lt;/SPAN&gt;&lt;STRONG&gt;next maintenance cycle, scheduled for end of May 2025&lt;/STRONG&gt;&lt;SPAN class=""&gt;."&lt;BR /&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/EM&gt;&lt;/P&gt;&lt;P class=""&gt;Until the fix is deployed:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;SPAN class=""&gt;&lt;STRONG&gt;Check the query profile and Spark UI&lt;/STRONG&gt;&lt;/SPAN&gt;&amp;nbsp;to identify where the hotspot 
occurs&lt;/LI&gt;&lt;LI&gt;&lt;SPAN class=""&gt;&lt;STRONG&gt;Switch to Serverless SQL Warehouse&amp;nbsp;provisionally&lt;/STRONG&gt;&lt;/SPAN&gt; for production dbt pipelines (stable + memory-safe)&lt;/LI&gt;&lt;LI&gt;Reevaluate using Classic at the &lt;SPAN class=""&gt;&lt;STRONG&gt;end of May&lt;/STRONG&gt;&lt;/SPAN&gt;, once the new version is available&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;Hope this helps! &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;BR /&gt;&lt;BR /&gt;Isi&lt;/P&gt;</description>
      <pubDate>Sat, 17 May 2025 18:45:59 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119533#M45900</guid>
      <dc:creator>Isi</dc:creator>
      <dc:date>2025-05-17T18:45:59Z</dc:date>
    </item>
    <item>
      <title>Re: Internal errors when running SQLs</title>
      <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119582#M45922</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/24053"&gt;@lingareddy_Alva&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;Thank you for the quick reply and suggestions&lt;/P&gt;</description>
      <pubDate>Mon, 19 May 2025 07:06:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119582#M45922</guid>
      <dc:creator>utkarshamone</dc:creator>
      <dc:date>2025-05-19T07:06:18Z</dc:date>
    </item>
    <item>
      <title>Re: Internal errors when running SQLs</title>
      <link>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119699#M45947</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/145555"&gt;@Isi&lt;/a&gt;&amp;nbsp;&lt;BR /&gt;Thanks for your reply!&lt;/P&gt;&lt;P&gt;Will look into changing the warehouse type&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 20 May 2025 07:31:22 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/internal-errors-when-running-sqls/m-p/119699#M45947</guid>
      <dc:creator>utkarshamone</dc:creator>
      <dc:date>2025-05-20T07:31:22Z</dc:date>
    </item>
  </channel>
</rss>

