<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Selective overwrite on Partition and Liquid clustered tables in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/selective-overwrite-on-partition-and-liquid-clustered-tables/m-p/159968#M54829</link>
    <description>&lt;P&gt;Hi, the current way is not optimal. You can follow below&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;INSERT query ran with mostly 43 tasks, creating 43 output files. Since the&amp;nbsp;&lt;/SPAN&gt;Liquid clustered table has no organization (&lt;SPAN class=""&gt;clusterBy "[]") -&lt;/SPAN&gt;&lt;SPAN&gt;&amp;nbsp;dates are randomly scattered across files.&lt;/SPAN&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Partition table did a clean partition-level swap as all&amp;nbsp;date='2026-06-02' data is isolated by partitioning&lt;/LI&gt;&lt;LI&gt;In Liquid Clustering with Auto -&amp;nbsp;Data is&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;scattered&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;- mostly both dates mixed across files.&amp;nbsp;&lt;STRONG&gt;REPLACE&lt;/STRONG&gt; USING scanned all 86 files, identified which 43 files contained date='2026-06-02' followed by rewrite. It's an expensive operation (&lt;SPAN class=""&gt;numDeletedRows: 30M&lt;/SPAN&gt;)&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;You can follow below to reduce file count during writes&lt;/P&gt;&lt;P&gt;1. &lt;U&gt;&lt;STRONG&gt;Control in Configuration&lt;/STRONG&gt;&lt;/U&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;SET spark.sql.shuffle.partitions = 16

INSERT INTO &amp;lt;target&amp;gt; REPLACE USING (BED)
SELECT &amp;lt;cols&amp;gt; FROM &amp;lt;table&amp;gt;&lt;/LI-CODE&gt;&lt;P&gt;2. &lt;U&gt;&lt;STRONG&gt;Control in Repartition&lt;/STRONG&gt;&lt;/U&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;INSERT INTO &amp;lt;target&amp;gt; REPLACE USING (BED)
SELECT /*+ REPARTITION(16) */ &amp;lt;col&amp;gt; FROM &amp;lt;table&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;&lt;STRONG&gt;Partitioning is simpler and faster&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;than liquid clustering in this case (With 2 date values and full replacements).&amp;nbsp;&lt;STRONG&gt;Liquid clustering&lt;/STRONG&gt; &lt;STRONG&gt;wins&lt;/STRONG&gt; when you have high cardinality or multi-dimensional queries.&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Sat, 20 Jun 2026 18:07:11 GMT</pubDate>
    <dc:creator>balajij8</dc:creator>
    <dc:date>2026-06-20T18:07:11Z</dc:date>
    <item>
      <title>Selective overwrite on Partition and Liquid clustered tables</title>
      <link>https://community.databricks.com/t5/data-engineering/selective-overwrite-on-partition-and-liquid-clustered-tables/m-p/159917#M54819</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;I have created 2 identical tables but one is partitioned and the one is a Liquid Clustered with Auto Clustering.&lt;/P&gt;&lt;P&gt;I inserted 30M rows x 2 (60M) for two dates , date 1 = 2026-06-01 and date = 2026-06-02 , then I overwrite the date&amp;nbsp;2026-06-02 with a selective overwrite statement&amp;nbsp;&lt;/P&gt;&lt;DIV&gt;&lt;DIV&gt;&lt;EM&gt;REPLACE USING (BED)&lt;/EM&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;&lt;SPAN class=""&gt;For partition table , history shows ,&lt;/SPAN&gt;&lt;/P&gt;&lt;TABLE border="1" width="100%"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&lt;STRONG&gt;&lt;SPAN class=""&gt;operationParameters&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/TD&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;TD width="50%"&gt;&lt;P&gt;&lt;SPAN class=""&gt;object&lt;/SPAN&gt;&lt;/P&gt;&lt;UL class=""&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;mode&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"Overwrite"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;statsOnLoad&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"false"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;partitionBy&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"[]"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&lt;STRONG&gt;operationMetrics&lt;/STRONG&gt;&lt;/TD&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;TD width="50%"&gt;&lt;SPAN class=""&gt;object&lt;/SPAN&gt;&lt;UL class=""&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numFiles&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"43"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numRemovedFiles&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"43"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numRemovedBytes&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"6090798096"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numDeletionVectorsRemoved&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"0"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numOutputRows&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"30000000"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numOutputBytes&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"6101142740"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;&lt;SPAN class=""&gt;For Liquid Clustered table&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;TABLE border="1" width="100%"&gt;&lt;TBODY&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&lt;STRONG&gt;&lt;SPAN class=""&gt;operationParameters&lt;/SPAN&gt;&lt;/STRONG&gt;&lt;/TD&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;TD width="50%"&gt;&lt;SPAN class=""&gt;object&lt;/SPAN&gt;&lt;UL class=""&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;partitionBy&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"[]"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;clusterBy&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"[]"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;mode&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"Overwrite"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;clusteringOnWriteStatus&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;null&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;statsOnLoad&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"false"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;replaceUsingCols&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"(BED)"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&lt;STRONG&gt;operationMetrics&lt;/STRONG&gt;&lt;/TD&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;/TR&gt;&lt;TR&gt;&lt;TD width="50%"&gt;&amp;nbsp;&lt;/TD&gt;&lt;TD width="50%"&gt;&lt;SPAN class=""&gt;object&lt;/SPAN&gt;&lt;UL class=""&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numFiles&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"43"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numRemovedFiles&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"43"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numRemovedBytes&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"5779903887"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numCopiedRows&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"0"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numDeletionVectorsAdded&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"0"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numDeletedRows&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"30000000"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numDeletionVectorsRemoved&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"0"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numAddedChangeFiles&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"0"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numOutputRows&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"30000000"&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;LI&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;numOutputBytes&lt;/SPAN&gt;&lt;SPAN class=""&gt;:&amp;nbsp;&lt;/SPAN&gt;&lt;DIV class=""&gt;"5779715681"&lt;/DIV&gt;&lt;DIV class=""&gt;&amp;nbsp;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/TD&gt;&lt;/TR&gt;&lt;/TBODY&gt;&lt;/TABLE&gt;&lt;P&gt;Its overwriting 43 files (Total 86) ?, is this optimal ?&lt;/P&gt;&lt;P&gt;Is there a way to improve performance by reducing the # of files ?&lt;/P&gt;&lt;P&gt;I am using&amp;nbsp;&lt;/P&gt;&lt;P&gt;INSERT INTO &amp;lt;target&amp;gt;&amp;nbsp; REPLACE USING (col)&amp;nbsp;&lt;/P&gt;&lt;P&gt;SELECT &amp;lt;cols&amp;gt; FROM &amp;lt;table&amp;gt;&amp;nbsp;&lt;/P&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&amp;nbsp;&lt;/DIV&gt;&lt;/DIV&gt;</description>
      <pubDate>Fri, 19 Jun 2026 16:08:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/selective-overwrite-on-partition-and-liquid-clustered-tables/m-p/159917#M54819</guid>
      <dc:creator>RGSLCA</dc:creator>
      <dc:date>2026-06-19T16:08:27Z</dc:date>
    </item>
    <item>
      <title>Re: Selective overwrite on Partition and Liquid clustered tables</title>
      <link>https://community.databricks.com/t5/data-engineering/selective-overwrite-on-partition-and-liquid-clustered-tables/m-p/159968#M54829</link>
      <description>&lt;P&gt;Hi, the current way is not optimal. You can follow below&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;INSERT query ran with mostly 43 tasks, creating 43 output files. Since the&amp;nbsp;&lt;/SPAN&gt;Liquid clustered table has no organization (&lt;SPAN class=""&gt;clusterBy "[]") -&lt;/SPAN&gt;&lt;SPAN&gt;&amp;nbsp;dates are randomly scattered across files.&lt;/SPAN&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Partition table did a clean partition-level swap as all&amp;nbsp;date='2026-06-02' data is isolated by partitioning&lt;/LI&gt;&lt;LI&gt;In Liquid Clustering with Auto -&amp;nbsp;Data is&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;scattered&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;- mostly both dates mixed across files.&amp;nbsp;&lt;STRONG&gt;REPLACE&lt;/STRONG&gt; USING scanned all 86 files, identified which 43 files contained date='2026-06-02' followed by rewrite. It's an expensive operation (&lt;SPAN class=""&gt;numDeletedRows: 30M&lt;/SPAN&gt;)&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;You can follow below to reduce file count during writes&lt;/P&gt;&lt;P&gt;1. &lt;U&gt;&lt;STRONG&gt;Control in Configuration&lt;/STRONG&gt;&lt;/U&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;SET spark.sql.shuffle.partitions = 16

INSERT INTO &amp;lt;target&amp;gt; REPLACE USING (BED)
SELECT &amp;lt;cols&amp;gt; FROM &amp;lt;table&amp;gt;&lt;/LI-CODE&gt;&lt;P&gt;2. &lt;U&gt;&lt;STRONG&gt;Control in Repartition&lt;/STRONG&gt;&lt;/U&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;INSERT INTO &amp;lt;target&amp;gt; REPLACE USING (BED)
SELECT /*+ REPARTITION(16) */ &amp;lt;col&amp;gt; FROM &amp;lt;table&amp;gt;;&lt;/LI-CODE&gt;&lt;P&gt;&lt;STRONG&gt;Partitioning is simpler and faster&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;than liquid clustering in this case (With 2 date values and full replacements).&amp;nbsp;&lt;STRONG&gt;Liquid clustering&lt;/STRONG&gt; &lt;STRONG&gt;wins&lt;/STRONG&gt; when you have high cardinality or multi-dimensional queries.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Sat, 20 Jun 2026 18:07:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/selective-overwrite-on-partition-and-liquid-clustered-tables/m-p/159968#M54829</guid>
      <dc:creator>balajij8</dc:creator>
      <dc:date>2026-06-20T18:07:11Z</dc:date>
    </item>
  </channel>
</rss>

