<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Feature request delta tables : drop duplicate rows in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/75505#M34971</link>
    <description>&lt;P&gt;This is basically wiping and rewriting the whole table. Obviously it's a very easy solution, but very expensive.&lt;/P&gt;&lt;P&gt;There's a reason why the "usual" solutions are very complex, because they only target the duplicated rows.&lt;/P&gt;</description>
    <pubDate>Sun, 23 Jun 2024 14:55:27 GMT</pubDate>
    <dc:creator>Victor_D</dc:creator>
    <dc:date>2024-06-23T14:55:27Z</dc:date>
    <item>
      <title>Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4306#M1061</link>
      <description>&lt;P&gt;A deltaTable.dropDuplicates(columns) would be a very nice feature, simplifying the complex procedures that are suggested online.  &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Or am I missing any existing procedures that can be done without merge operations or similar?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 16 May 2023 13:10:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4306#M1061</guid>
      <dc:creator>MRTN</dc:creator>
      <dc:date>2023-05-16T13:10:35Z</dc:date>
    </item>
    <item>
      <title>Re: Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4307#M1062</link>
      <description>&lt;P&gt;It would be helpful. Currently, the best way is just to read the table as a dataframe and use PySpark dropDuplicates().&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;# Load the table
df = spark.table("yourtable")
&amp;nbsp;
# Drop duplicates based on the Id and Name columns
df = df.dropDuplicates(["Id", "Name"])
&amp;nbsp;
# Overwrite the original table with the resulting dataframe
df.write.mode("overwrite").saveAsTable("yourtable")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 16 May 2023 17:14:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4307#M1062</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2023-05-16T17:14:34Z</dc:date>
    </item>
    <item>
      <title>Re: Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4308#M1063</link>
      <description>&lt;P&gt;I created a feature request in the delta table project: &lt;A href="https://github.com/delta-io/delta/issues/1767" alt="https://github.com/delta-io/delta/issues/1767" target="_blank"&gt;[Feature Request] data deduplication on existing delta table · Issue #1767 · delta-io/delta (github.com)&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 16 May 2023 21:43:21 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/4308#M1063</guid>
      <dc:creator>MRTN</dc:creator>
      <dc:date>2023-05-16T21:43:21Z</dc:date>
    </item>
    <item>
      <title>Re: Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/56438#M30554</link>
      <description>&lt;P&gt;This worked perfectly, and much easier than all the complex solutions that are suggested online.&lt;/P&gt;</description>
      <pubDate>Thu, 04 Jan 2024 15:29:42 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/56438#M30554</guid>
      <dc:creator>DRGutierrez</dc:creator>
      <dc:date>2024-01-04T15:29:42Z</dc:date>
    </item>
    <item>
      <title>Re: Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/75505#M34971</link>
      <description>&lt;P&gt;This is basically wiping and rewriting the whole table. Obviously it's a very easy solution, but very expensive.&lt;/P&gt;&lt;P&gt;There's a reason why the "usual" solutions are very complex, because they only target the duplicated rows.&lt;/P&gt;</description>
      <pubDate>Sun, 23 Jun 2024 14:55:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/75505#M34971</guid>
      <dc:creator>Victor_D</dc:creator>
      <dc:date>2024-06-23T14:55:27Z</dc:date>
    </item>
    <item>
      <title>Re: Feature request delta tables : drop duplicate rows</title>
      <link>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/102307#M41064</link>
      <description>&lt;P&gt;Is this still the best method?&lt;/P&gt;</description>
      <pubDate>Mon, 16 Dec 2024 21:17:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/feature-request-delta-tables-drop-duplicate-rows/m-p/102307#M41064</guid>
      <dc:creator>akshaybhan92</dc:creator>
      <dc:date>2024-12-16T21:17:56Z</dc:date>
    </item>
  </channel>
</rss>

