<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic cross join issue generating surrogate keys in delta table in Get Started Discussions</title>
    <link>https://community.databricks.com/t5/get-started-discussions/cross-join-issue-generating-surrogate-keys-in-delta-table/m-p/59493#M2501</link>
    <description>&lt;P&gt;I used below code to populate target table, it is working as expected but expect for surrogatekey column. After I inserted dummy entry -1 and did run merge code, it is generating the numbers in surrogatekey column like this 1,3,5,7,9 odd numbers, it is really strange, but if I don't insert -1 entry then it is generating like this 1,2,3,4,5,6,7 but I need dummy entry for all dimensions to capture unknow data. Please help me on this.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;is cross join causing issue? but not sure&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;%sql&lt;BR /&gt;CREATE TABLE xxxx.target_table (&lt;BR /&gt;general_sk BIGINT NOT NULL GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1),&lt;BR /&gt;ethnicity_code STRING,&lt;BR /&gt;ethnicity_description STRING,&lt;BR /&gt;indicator_code STRING,&lt;BR /&gt;indicator_description STRING,&lt;BR /&gt;effective_from DATE,&lt;BR /&gt;effective_to DATE)&lt;/P&gt;&lt;P&gt;%sql&lt;BR /&gt;insert into xxxx.target_table(gneral_sk,ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)&lt;BR /&gt;SELECT -1,-1,'Missing (Not Applicable/ Not Known)',-1,'Missing (Not Applicable/ Not Known)','1900-01-01','9999-12-31'&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;df = spark.sql(f""" select distinct&lt;BR /&gt;et.code as ethnicity_code,&lt;BR /&gt;et.description as ethnicity_description,&lt;BR /&gt;hl.code as indicator_code,&lt;BR /&gt;hl.description as indicator_description&lt;BR /&gt;from&lt;BR /&gt;(select * from xxx.lookup where tablename in ('ethnicity') ) as et&lt;BR /&gt;cross join (select * from xxx.lookup where tablename in ('indicator')) as hl&lt;BR /&gt;""")&lt;/P&gt;&lt;P&gt;df.createOrReplaceTempView("updated_rows")&lt;/P&gt;&lt;P&gt;merge into xxxx.target_table trg&lt;BR /&gt;using (&lt;BR /&gt;select distinct&lt;BR /&gt;md5(concat(ethnicity_code,indicator_code)) as merge_key,&lt;BR /&gt;ethnicity_code,&lt;BR /&gt;ethnicity_description,&lt;BR /&gt;indicator_code,&lt;BR /&gt;indicator_description&lt;BR /&gt;&lt;BR /&gt;from&lt;BR /&gt;updated_rows st&lt;BR /&gt;union all&lt;BR /&gt;select distinct&lt;BR /&gt;null as merge_key,&lt;BR /&gt;st.ethnicity_code,&lt;BR /&gt;st.ethnicity_description,&lt;BR /&gt;st.indicator_code,&lt;BR /&gt;st.indicator_description&lt;BR /&gt;from&lt;BR /&gt;updated_rows st&lt;BR /&gt;join&lt;BR /&gt;xxxx.target_table trg&lt;BR /&gt;on&lt;BR /&gt;md5(concat(trg.ethnicity_code,trg.indicator_code)) = md5(concat(st.ethnicity_code,st.indicator_code))&lt;BR /&gt;where coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')&lt;BR /&gt;and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1')&lt;BR /&gt;&lt;BR /&gt;) st on md5(concat(coalesce(trg.ethnicity_code,'-1') , coalesce(trg.indicator_code,'-1'))) = st.merge_key&lt;BR /&gt;when matched and coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')&lt;BR /&gt;and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1') then&lt;BR /&gt;update set trg.effective_to = current_date()-1&lt;BR /&gt;when not matched then&lt;BR /&gt;insert (ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)&lt;BR /&gt;values (st.ethnicity_code,st.ethnicity_description,st.indicator_code,st.indicator_description,current_date(),'9999-12-31')&lt;/P&gt;</description>
    <pubDate>Tue, 06 Feb 2024 17:16:25 GMT</pubDate>
    <dc:creator>Ramacloudworld</dc:creator>
    <dc:date>2024-02-06T17:16:25Z</dc:date>
    <item>
      <title>cross join issue generating surrogate keys in delta table</title>
      <link>https://community.databricks.com/t5/get-started-discussions/cross-join-issue-generating-surrogate-keys-in-delta-table/m-p/59493#M2501</link>
      <description>&lt;P&gt;I used below code to populate target table, it is working as expected but expect for surrogatekey column. After I inserted dummy entry -1 and did run merge code, it is generating the numbers in surrogatekey column like this 1,3,5,7,9 odd numbers, it is really strange, but if I don't insert -1 entry then it is generating like this 1,2,3,4,5,6,7 but I need dummy entry for all dimensions to capture unknow data. Please help me on this.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;is cross join causing issue? but not sure&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;%sql&lt;BR /&gt;CREATE TABLE xxxx.target_table (&lt;BR /&gt;general_sk BIGINT NOT NULL GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1),&lt;BR /&gt;ethnicity_code STRING,&lt;BR /&gt;ethnicity_description STRING,&lt;BR /&gt;indicator_code STRING,&lt;BR /&gt;indicator_description STRING,&lt;BR /&gt;effective_from DATE,&lt;BR /&gt;effective_to DATE)&lt;/P&gt;&lt;P&gt;%sql&lt;BR /&gt;insert into xxxx.target_table(gneral_sk,ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)&lt;BR /&gt;SELECT -1,-1,'Missing (Not Applicable/ Not Known)',-1,'Missing (Not Applicable/ Not Known)','1900-01-01','9999-12-31'&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;df = spark.sql(f""" select distinct&lt;BR /&gt;et.code as ethnicity_code,&lt;BR /&gt;et.description as ethnicity_description,&lt;BR /&gt;hl.code as indicator_code,&lt;BR /&gt;hl.description as indicator_description&lt;BR /&gt;from&lt;BR /&gt;(select * from xxx.lookup where tablename in ('ethnicity') ) as et&lt;BR /&gt;cross join (select * from xxx.lookup where tablename in ('indicator')) as hl&lt;BR /&gt;""")&lt;/P&gt;&lt;P&gt;df.createOrReplaceTempView("updated_rows")&lt;/P&gt;&lt;P&gt;merge into xxxx.target_table trg&lt;BR /&gt;using (&lt;BR /&gt;select distinct&lt;BR /&gt;md5(concat(ethnicity_code,indicator_code)) as merge_key,&lt;BR /&gt;ethnicity_code,&lt;BR /&gt;ethnicity_description,&lt;BR /&gt;indicator_code,&lt;BR /&gt;indicator_description&lt;BR /&gt;&lt;BR /&gt;from&lt;BR /&gt;updated_rows st&lt;BR /&gt;union all&lt;BR /&gt;select distinct&lt;BR /&gt;null as merge_key,&lt;BR /&gt;st.ethnicity_code,&lt;BR /&gt;st.ethnicity_description,&lt;BR /&gt;st.indicator_code,&lt;BR /&gt;st.indicator_description&lt;BR /&gt;from&lt;BR /&gt;updated_rows st&lt;BR /&gt;join&lt;BR /&gt;xxxx.target_table trg&lt;BR /&gt;on&lt;BR /&gt;md5(concat(trg.ethnicity_code,trg.indicator_code)) = md5(concat(st.ethnicity_code,st.indicator_code))&lt;BR /&gt;where coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')&lt;BR /&gt;and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1')&lt;BR /&gt;&lt;BR /&gt;) st on md5(concat(coalesce(trg.ethnicity_code,'-1') , coalesce(trg.indicator_code,'-1'))) = st.merge_key&lt;BR /&gt;when matched and coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')&lt;BR /&gt;and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1') then&lt;BR /&gt;update set trg.effective_to = current_date()-1&lt;BR /&gt;when not matched then&lt;BR /&gt;insert (ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)&lt;BR /&gt;values (st.ethnicity_code,st.ethnicity_description,st.indicator_code,st.indicator_description,current_date(),'9999-12-31')&lt;/P&gt;</description>
      <pubDate>Tue, 06 Feb 2024 17:16:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/cross-join-issue-generating-surrogate-keys-in-delta-table/m-p/59493#M2501</guid>
      <dc:creator>Ramacloudworld</dc:creator>
      <dc:date>2024-02-06T17:16:25Z</dc:date>
    </item>
    <item>
      <title>Re: cross join issue generating surrogate keys in delta table</title>
      <link>https://community.databricks.com/t5/get-started-discussions/cross-join-issue-generating-surrogate-keys-in-delta-table/m-p/100150#M4613</link>
      <description>&lt;P data-unlink="true"&gt;This is expected. Identity columns can have gaps, or per &lt;A href="https://docs.databricks.com/en/delta/generated-columns.html#use-identity-columns-in-delta-lake" target="_self"&gt;the documentation&lt;/A&gt;:&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;EM&gt;Values assigned by identity columns are unique and increment in the direction of the specified step, and in multiples of the specified step size, but &lt;STRONG&gt;are not guaranteed to be contiguous&lt;/STRONG&gt;. For example, with a starting value of&amp;nbsp;&lt;CODE class="docutils literal notranslate"&gt;&lt;SPAN class="pre"&gt;0&lt;/SPAN&gt;&lt;/CODE&gt;&amp;nbsp;and a step size of&amp;nbsp;&lt;CODE class="docutils literal notranslate"&gt;&lt;SPAN class="pre"&gt;2&lt;/SPAN&gt;&lt;/CODE&gt;, all values are positive even numbers &lt;STRONG&gt;but some even numbers might be skipped.&lt;/STRONG&gt;&lt;/EM&gt;&lt;/P&gt;
&lt;P&gt;Can you say more about your requirements to capture unknown data? Maybe there is a different way to accomplish these requirements.&lt;/P&gt;</description>
      <pubDate>Tue, 26 Nov 2024 18:57:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/get-started-discussions/cross-join-issue-generating-surrogate-keys-in-delta-table/m-p/100150#M4613</guid>
      <dc:creator>cgrant</dc:creator>
      <dc:date>2024-11-26T18:57:27Z</dc:date>
    </item>
  </channel>
</rss>

