<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Remove Duplicate rows in tables in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91405#M38163</link>
    <description>&lt;P&gt;Hello&lt;/P&gt;&lt;P&gt;I've seen posts that show how to remove duplicates, something like this:&lt;/P&gt;&lt;P&gt;MERGE into [deltatable] as target&lt;/P&gt;&lt;P&gt;USING ( select *, ROW_NUMBER() OVER (Partition By [primary keys] Order By [date] desc) as rn&amp;nbsp; from [deltatable] qualify rn&amp;gt; 1 ) as source&lt;/P&gt;&lt;P&gt;ON [merge primary keys and date column between source and target]&lt;/P&gt;&lt;P&gt;WHEN MATCHED THEN DELETE&lt;/P&gt;&lt;P&gt;Problem is this is reliant upon the rows having something to order by as a differentiator.&lt;/P&gt;&lt;P&gt;If I use the same column to order then use ON source.&amp;lt;PK&amp;gt; = target.&amp;lt;PK&amp;gt;, it just removes every row (so same as just a straight delete)&lt;/P&gt;&lt;P&gt;&amp;nbsp;My data is an exact copy so I have 7 duplicated rows in the table&lt;/P&gt;&lt;P&gt;This is pretty easy to delete in SQL Server but I cannot find an easy method in databricks&lt;/P&gt;&lt;P&gt;Do I have to maybe delete and compare versions then insert from a previous version with a distinct?&lt;/P&gt;&lt;P&gt;Or create a temp table, insert with distinct then truncate source and copy back?&lt;/P&gt;&lt;P&gt;Both these methods seem long-winded&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Any thoughts?&lt;/P&gt;</description>
    <pubDate>Mon, 23 Sep 2024 10:07:54 GMT</pubDate>
    <dc:creator>dpc</dc:creator>
    <dc:date>2024-09-23T10:07:54Z</dc:date>
    <item>
      <title>Remove Duplicate rows in tables</title>
      <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91405#M38163</link>
      <description>&lt;P&gt;Hello&lt;/P&gt;&lt;P&gt;I've seen posts that show how to remove duplicates, something like this:&lt;/P&gt;&lt;P&gt;MERGE into [deltatable] as target&lt;/P&gt;&lt;P&gt;USING ( select *, ROW_NUMBER() OVER (Partition By [primary keys] Order By [date] desc) as rn&amp;nbsp; from [deltatable] qualify rn&amp;gt; 1 ) as source&lt;/P&gt;&lt;P&gt;ON [merge primary keys and date column between source and target]&lt;/P&gt;&lt;P&gt;WHEN MATCHED THEN DELETE&lt;/P&gt;&lt;P&gt;Problem is this is reliant upon the rows having something to order by as a differentiator.&lt;/P&gt;&lt;P&gt;If I use the same column to order then use ON source.&amp;lt;PK&amp;gt; = target.&amp;lt;PK&amp;gt;, it just removes every row (so same as just a straight delete)&lt;/P&gt;&lt;P&gt;&amp;nbsp;My data is an exact copy so I have 7 duplicated rows in the table&lt;/P&gt;&lt;P&gt;This is pretty easy to delete in SQL Server but I cannot find an easy method in databricks&lt;/P&gt;&lt;P&gt;Do I have to maybe delete and compare versions then insert from a previous version with a distinct?&lt;/P&gt;&lt;P&gt;Or create a temp table, insert with distinct then truncate source and copy back?&lt;/P&gt;&lt;P&gt;Both these methods seem long-winded&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Any thoughts?&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 10:07:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91405#M38163</guid>
      <dc:creator>dpc</dc:creator>
      <dc:date>2024-09-23T10:07:54Z</dc:date>
    </item>
    <item>
      <title>Re: Remove Duplicate rows in tables</title>
      <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91448#M38166</link>
      <description>&lt;P&gt;Please try this pyspark solution.&amp;nbsp;&lt;BR /&gt;df = spark.table("yourtablename")&lt;BR /&gt;df = df.dropDuplicates()&lt;/P&gt;
&lt;P&gt;or use the Row_number() query and use DELETE statement where&amp;nbsp;rn &amp;gt; 1&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 14:00:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91448#M38166</guid>
      <dc:creator>gchandra</dc:creator>
      <dc:date>2024-09-23T14:00:35Z</dc:date>
    </item>
    <item>
      <title>Re: Remove Duplicate rows in tables</title>
      <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91457#M38167</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/108745"&gt;@dpc&lt;/a&gt;&amp;nbsp;,&lt;BR /&gt;&lt;BR /&gt;if you like using SQL:&lt;/P&gt;&lt;P&gt;1. Test data:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# Sample data
data = [("1", "A"), ("1", "A"), ("2", "B"), ("2", "B"), ("3", "C")]

# Create DataFrame
df = spark.createDataFrame(data, ["id", "value"])

# Write to Delta table
df.write.format("delta").mode("overwrite").saveAsTable("duplicates")&lt;/LI-CODE&gt;&lt;P&gt;2. Removing the duplicates from the table&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
CREATE OR REPLACE TEMP VIEW dedups AS
SELECT DISTINCT * FROM duplicates;

-- Overwrite the existing Delta table with distinct rows
INSERT OVERWRITE duplicates
SELECT * FROM dedups;

SELECT *
FROM duplicates;&lt;/LI-CODE&gt;&lt;P&gt;3. No duplicates in duplicates:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="filipniziol_0-1727102570106.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/11434i8D0DE76BD208FFD8/image-size/medium?v=v2&amp;amp;px=400" role="button" title="filipniziol_0-1727102570106.png" alt="filipniziol_0-1727102570106.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 23 Sep 2024 14:43:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91457#M38167</guid>
      <dc:creator>filipniziol</dc:creator>
      <dc:date>2024-09-23T14:43:07Z</dc:date>
    </item>
    <item>
      <title>Re: Remove Duplicate rows in tables</title>
      <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91749#M38254</link>
      <description>&lt;P&gt;Thanks. This looks more straight forward than I thought&lt;/P&gt;&lt;P&gt;I'll give it a go in the morning and see if it works&lt;/P&gt;</description>
      <pubDate>Wed, 25 Sep 2024 16:15:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91749#M38254</guid>
      <dc:creator>dpc</dc:creator>
      <dc:date>2024-09-25T16:15:57Z</dc:date>
    </item>
    <item>
      <title>Re: Remove Duplicate rows in tables</title>
      <link>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91750#M38255</link>
      <description>&lt;P&gt;I originally tried this:&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Row_number() query and use DELETE statement where&amp;nbsp;rn &amp;gt; 1&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;It removes all rows no matter how I piece the code together&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;Found this method works well in SQL Server but not databricks&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 25 Sep 2024 16:17:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/remove-duplicate-rows-in-tables/m-p/91750#M38255</guid>
      <dc:creator>dpc</dc:creator>
      <dc:date>2024-09-25T16:17:48Z</dc:date>
    </item>
  </channel>
</rss>

