<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Large MERGE Statements - 500+ lines of code! in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10640#M5785</link>
    <description>&lt;P&gt;Thanks, this does make sense.&lt;/P&gt;&lt;P&gt;I have a new lead to chase .. &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Much appreciated. &lt;span class="lia-unicode-emoji" title=":smiling_face_with_smiling_eyes:"&gt;😊&lt;/span&gt; &lt;/P&gt;</description>
    <pubDate>Fri, 27 Jan 2023 14:24:40 GMT</pubDate>
    <dc:creator>StevenW</dc:creator>
    <dc:date>2023-01-27T14:24:40Z</dc:date>
    <item>
      <title>Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10630#M5775</link>
      <description>&lt;P&gt;I'm new to databricks. (Not new to DB's - 10+ year DB Developer).&lt;/P&gt;&lt;P&gt;How do you generate a MERGE statement in DataBricks?&amp;nbsp;&lt;/P&gt;&lt;P&gt;Trying to manually maintain a 500+ or 1000+ lines in a MERGE statement doesn't make much sense? Working with Large Tables of between 200 - 500 columns.&lt;/P&gt;</description>
      <pubDate>Thu, 26 Jan 2023 14:20:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10630#M5775</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-26T14:20:27Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10631#M5776</link>
      <description>&lt;P&gt;In my opinion, when possible MERGE statement should be on the primary key. If not possible you can create your own unique key (by concatenate some fields and eventually hashing them) and then use it in merge logic. &lt;/P&gt;</description>
      <pubDate>Thu, 26 Jan 2023 16:40:17 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10631#M5776</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2023-01-26T16:40:17Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10632#M5777</link>
      <description>&lt;P&gt;Thanks.. but that's not really what I'm asking...&lt;/P&gt;&lt;P&gt;How many columns can a MERGE statement manage, before maintenance becomes a nightmare?&lt;/P&gt;</description>
      <pubDate>Thu, 26 Jan 2023 16:48:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10632#M5777</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-26T16:48:25Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10633#M5778</link>
      <description>&lt;P&gt;It was tested with up to 4000 columns. Here are tests up to 1000 columns after the update in December 2021 &lt;A href="https://github.com/delta-io/delta/pull/584" target="test_blank"&gt;https://github.com/delta-io/delta/pull/584&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Also, remember that stats are precalculated for the first 32 columns (you can change it to more in settings). So it would be good to have fields on which you merge conditions in the first 32 columns.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper" image-alt="image.png"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/785i8A32F2B72A6AC16B/image-size/large?v=v2&amp;amp;px=999" role="button" title="image.png" alt="image.png" /&gt;&lt;/span&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 26 Jan 2023 16:54:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10633#M5778</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2023-01-26T16:54:51Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10634#M5779</link>
      <description>&lt;P&gt;Interesting...I had  not yet considered the performance issues. I didn't think there would be any .. will clearly need to come back to this one... &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I'm currently only concerned with the maintenance of a piece of code (the 500+ lines of the MERGE statement). Do you just "eyeball" the changes and hope for the best, or is there a more structured way to maintain large MERGE statements?&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 08:19:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10634#M5779</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-27T08:19:11Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10635#M5780</link>
      <description>&lt;P&gt;I am still wondering what you mean by 500+ lines of code for a merge.&lt;/P&gt;&lt;P&gt;do you mean the list of columns which should be updated?&lt;/P&gt;&lt;P&gt;If you want to update a subset of columns that can become cumbersome indeed.  But with some coding in scala/python you can create a list of column names which you can then pass to the query.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;If you want to update all, use *&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;MERGE INTO table 
  USING updates 
  ON mgline.RegistrationYear IN ($yearlist) AND
     table.key1 = updates.key1 and 
     table.key2 = updates.key2 and
     table.key3 = updates.key3
  WHEN MATCHED THEN&amp;nbsp;
     UPDATE SET *&amp;nbsp;
  WHEN NOT MATCHED THEN 
     INSERT *&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 13:31:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10635#M5780</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-01-27T13:31:56Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10636#M5781</link>
      <description>&lt;P&gt;Yes, I mean the list of columns becomes large. To then maintain a MERGE statement could be very cumbersome.&lt;/P&gt;&lt;P&gt;Would you happen to have an example of such Python code? This does actually make sense.. if such a list was generated dynamically, and then used for the UPD/INS statements..&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 13:57:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10636#M5781</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-27T13:57:02Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10637#M5782</link>
      <description>&lt;P&gt;I don't have an example at hand, but if you can do a df.columns, that gives you all the cols of the table (in dataframe format of course), then depending on the case you can drop columns or keep a few or ... and then try to use that list for the merge.&lt;/P&gt;&lt;P&gt;TBH I never did that though, I always use update * to avoid the hassle.&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 14:01:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10637#M5782</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-01-27T14:01:43Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10638#M5783</link>
      <description>&lt;P&gt;Would UPDATE SET * not require the source and target columns to have the same names, and the same column order ?&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 14:10:57 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10638#M5783</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-27T14:10:57Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10639#M5784</link>
      <description>&lt;P&gt;don't know about order.&lt;/P&gt;&lt;P&gt;However, I always prepare the incoming data so that it has the same schema as the target.  This makes merges easy.  You indeed do not want to tinker around in a merge statement and typing tons of columns.&lt;/P&gt;&lt;P&gt;Using scala/python it is practically always possible to prepare your data.&lt;/P&gt;&lt;P&gt;Takes some time to learn, but it is worth it.&lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 14:13:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10639#M5784</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-01-27T14:13:51Z</dc:date>
    </item>
    <item>
      <title>Re: Large MERGE Statements - 500+ lines of code!</title>
      <link>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10640#M5785</link>
      <description>&lt;P&gt;Thanks, this does make sense.&lt;/P&gt;&lt;P&gt;I have a new lead to chase .. &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;Much appreciated. &lt;span class="lia-unicode-emoji" title=":smiling_face_with_smiling_eyes:"&gt;😊&lt;/span&gt; &lt;/P&gt;</description>
      <pubDate>Fri, 27 Jan 2023 14:24:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/large-merge-statements-500-lines-of-code/m-p/10640#M5785</guid>
      <dc:creator>StevenW</dc:creator>
      <dc:date>2023-01-27T14:24:40Z</dc:date>
    </item>
  </channel>
</rss>

