<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Performance Issues with Writing Large DataFrames to Managed Tables in Databricks (3.5B+ Rows) in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128829#M48352</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/149511"&gt;@kanikvijay9&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Thanks for sharing the solution with us! Could you mark your answer as the solution to the thread? It helps other members find the correct answer faster &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
    <pubDate>Tue, 19 Aug 2025 09:31:42 GMT</pubDate>
    <dc:creator>szymon_dybczak</dc:creator>
    <dc:date>2025-08-19T09:31:42Z</dc:date>
    <item>
      <title>Performance Issues with Writing Large DataFrames to Managed Tables in Databricks (3.5B+ Rows)</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128256#M48190</link>
      <description>&lt;P&gt;Hi Community,&lt;/P&gt;&lt;P&gt;I'm working on a large-scale data processing job in Databricks and facing performance and stability issues during the write operations. Here's a detailed breakdown of my use case and environment:&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;Use Case Overview:&lt;/STRONG&gt;&lt;/H3&gt;&lt;H3&gt;&lt;STRONG&gt;Primary Data Frames:&lt;/STRONG&gt;&lt;/H3&gt;&lt;OL&gt;&lt;LI&gt;&lt;STRONG&gt;First Data Frame: Created by reading from 4 external tables (each ~7B rows, 7–15 columns). Final DF size:&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;3.5B+ rows, mostly strings, nulls, and Booleans.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Second Data Frame: Created from a managed table in Databricks catalog. Size:&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;3.8B+ rows, 10–15 columns.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Processing Steps:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Writing both primary Data Frames to&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;temporary managed tables.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;Performing&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;multi-key joins&lt;SPAN&gt;&amp;nbsp;using Spark SQL.&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/LI&gt;&lt;LI&gt;Final join with another external table (~5B rows, 297.8GiB across 3444 files).&lt;/LI&gt;&lt;LI&gt;Final output is written to a&lt;SPAN&gt;&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;managed Delta table&lt;SPAN&gt;&amp;nbsp;using&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;Liquid Clustering.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;STRONG&gt;Cluster Configuration:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;Databricks Runtime: 16.4 LTS (includes Apache Spark 3.5.2, Scala 2.12)&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Driver: Standard_F32s_v4 (160 GB RAM, 32 
Cores)&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Worker: Standard_E32d_v4 (160 GB RAM, 32 Cores)&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Min Workers: 2 |&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;Max Workers: 18 |&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;Current Workers: 6&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;&lt;STRONG&gt;Unity Catalog Enabled&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;STRONG&gt;Spark Configurations:&lt;/STRONG&gt;&lt;/P&gt;&lt;DIV&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;Modified Successfully:&lt;/STRONG&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-center" image-alt="kanikvijay9_0-1755015948307.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19035iDB8F77FEA8CDC0C5/image-size/medium?v=v2&amp;amp;px=400" role="button" title="kanikvijay9_0-1755015948307.png" alt="kanikvijay9_0-1755015948307.png" /&gt;&lt;/span&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;Attempted but Failed (Memory Error):&lt;/STRONG&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-center" image-alt="kanikvijay9_1-1755015978065.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19036i5078132DF2749284/image-size/medium?v=v2&amp;amp;px=400" role="button" title="kanikvijay9_1-1755015978065.png" alt="kanikvijay9_1-1755015978065.png" /&gt;&lt;/span&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;Error Message:&lt;/STRONG&gt;&lt;UL&gt;&lt;LI&gt;&lt;STRONG&gt;com.databricks.backend.cluster.IllegalSparkContainerMemoryException: INVALID_PARAMETER_VALUE: Specified heap memory (102400 MB) and off heap memory (92475 MB) is above the maximum executor memory (123300 MB) allowed for node type 
Standard_E20d_v4.&lt;/STRONG&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;H3&gt;&lt;STRONG&gt;Write Logic:&lt;/STRONG&gt;&lt;/H3&gt;&lt;P&gt;&lt;SPAN&gt;&lt;!--       ScriptorStartFragment       --&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;DIV class=""&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-center" image-alt="kanikvijay9_0-1755016092143.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/19037iAE8C6D8BE26A7E0C/image-size/medium?v=v2&amp;amp;px=400" role="button" title="kanikvijay9_0-1755016092143.png" alt="kanikvijay9_0-1755016092143.png" /&gt;&lt;/span&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&lt;H3&gt;&lt;STRONG&gt;What I’ve Tried:&lt;/STRONG&gt;&lt;/H3&gt;&lt;OL&gt;&lt;LI&gt;&lt;STRONG&gt;Caching/Persisting: Attempted to cache/persist intermediate Data Frames, but due to the massive size, it was not feasible and led to memory pressure.&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;&lt;STRONG&gt;Partitioning During Write: Used&lt;SPAN&gt;&amp;nbsp;partition By&lt;SPAN&gt;&amp;nbsp;on multiple columns to optimize write performance, but still facing executor failures and long write times.&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;STRONG&gt;Issues Faced:&lt;/STRONG&gt;&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;STRONG&gt;Executor Failures&lt;SPAN&gt;: While writing the first primary Data Frame to temporary managed tables,&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;60%+ executors fail&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Write Time: Writing intermediate Data Frames (each taking&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;8–10 hours) is extremely slow and 
unstable.&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;&lt;STRONG&gt;Final Join: Joining with the 5B-row external table is compounding the performance issues.&lt;/STRONG&gt;&lt;/STRONG&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;STRONG&gt;Looking for Help On:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Best practices for writing large Data Frames efficiently.&lt;/LI&gt;&lt;LI&gt;Optimizing memory and executor configurations for large-scale joins and writes.&lt;/LI&gt;&lt;LI&gt;Suggestions for improving write performance with Delta + Liquid Clustering.&lt;/LI&gt;&lt;LI&gt;Any known limitations or tuning tips for Databricks Runtime 10.4 LTS with Spark 3.2.2.&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;Any insights, suggestions, or shared experiences would be greatly appreciated!&lt;/P&gt;&lt;P&gt;Thanks in advance&amp;nbsp;&lt;BR /&gt;&lt;STRONG&gt;Kanik Vijay&lt;BR /&gt;&lt;/STRONG&gt;&lt;/P&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Tue, 12 Aug 2025 16:30:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128256#M48190</guid>
      <dc:creator>kanikvijay9</dc:creator>
      <dc:date>2025-08-12T16:30:43Z</dc:date>
    </item>
    <item>
      <title>Re: Performance Issues with Writing Large DataFrames to Managed Tables in Databricks (3.5B+ Rows)</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128827#M48350</link>
      <description>&lt;P&gt;I found the solution. Please refer to the links below for the details:&lt;/P&gt;&lt;UL class=""&gt;&lt;LI&gt;LinkedIn Post:&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;A class="" href="https://www.linkedin.com/posts/activity-7363497408925745154-LaaL?utm_source=share&amp;amp;utm_medium=member_desktop&amp;amp;rcm=ACoAACTtno0BU78QJcWz-X3GHtKRvhXxf5fod90" target="_blank" rel="noopener ugc nofollow"&gt;https://www.linkedin.com/posts/activity-7363497408925745154-LaaL?utm_source=share&amp;amp;utm_medium=member_desktop&amp;amp;rcm=ACoAACTtno0BU78QJcWz-X3GHtKRvhXxf5fod90&lt;/A&gt;&lt;/LI&gt;&lt;LI&gt;Medium Blog:&amp;nbsp;&lt;A href="https://medium.com/@kanik.work/from-hours-to-minutes-optimizing-large-scale-writes-in-databricks-with-liquid-clustering-fbaa94c64f51" target="_blank" rel="noopener"&gt;https://medium.com/@kanik.work/from-hours-to-minutes-optimizing-large-scale-writes-in-databricks-with-liquid-clustering-fbaa94c64f51&lt;/A&gt;&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Tue, 19 Aug 2025 09:27:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128827#M48350</guid>
      <dc:creator>kanikvijay9</dc:creator>
      <dc:date>2025-08-19T09:27:45Z</dc:date>
    </item>
    <item>
      <title>Re: Performance Issues with Writing Large DataFrames to Managed Tables in Databricks (3.5B+ Rows)</title>
      <link>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128829#M48352</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/149511"&gt;@kanikvijay9&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;Thanks for sharing the solution with us! Could you mark your answer as the solution to the thread? It helps other members find the correct answer faster &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 19 Aug 2025 09:31:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/performance-issues-with-writing-large-dataframes-to-managed/m-p/128829#M48352</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-08-19T09:31:42Z</dc:date>
    </item>
  </channel>
</rss>

