I used below code to populate target table, it is working as expected but expect for surrogatekey column. After I inserted dummy entry -1 and did run merge code, it is generating the numbers in surrogatekey column like this 1,3,5,7,9 odd numbers, it is really strange, but if I don't insert -1 entry then it is generating like this 1,2,3,4,5,6,7 but I need dummy entry for all dimensions to capture unknow data. Please help me on this.
is cross join causing issue? but not sure
%sql
CREATE TABLE xxxx.target_table (
general_sk BIGINT NOT NULL GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1),
ethnicity_code STRING,
ethnicity_description STRING,
indicator_code STRING,
indicator_description STRING,
effective_from DATE,
effective_to DATE)
%sql
insert into xxxx.target_table(gneral_sk,ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)
SELECT -1,-1,'Missing (Not Applicable/ Not Known)',-1,'Missing (Not Applicable/ Not Known)','1900-01-01','9999-12-31'
df = spark.sql(f""" select distinct
et.code as ethnicity_code,
et.description as ethnicity_description,
hl.code as indicator_code,
hl.description as indicator_description
from
(select * from xxx.lookup where tablename in ('ethnicity') ) as et
cross join (select * from xxx.lookup where tablename in ('indicator')) as hl
""")
df.createOrReplaceTempView("updated_rows")
merge into xxxx.target_table trg
using (
select distinct
md5(concat(ethnicity_code,indicator_code)) as merge_key,
ethnicity_code,
ethnicity_description,
indicator_code,
indicator_description
from
updated_rows st
union all
select distinct
null as merge_key,
st.ethnicity_code,
st.ethnicity_description,
st.indicator_code,
st.indicator_description
from
updated_rows st
join
xxxx.target_table trg
on
md5(concat(trg.ethnicity_code,trg.indicator_code)) = md5(concat(st.ethnicity_code,st.indicator_code))
where coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')
and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1')
) st on md5(concat(coalesce(trg.ethnicity_code,'-1') , coalesce(trg.indicator_code,'-1'))) = st.merge_key
when matched and coalesce(trg.ethnicity_description,'-1') != coalesce(st.ethnicity_description,'-1')
and coalesce(trg.indicator_description,'-1') != coalesce(st.indicator_description,'-1') then
update set trg.effective_to = current_date()-1
when not matched then
insert (ethnicity_code,ethnicity_description,indicator_code,indicator_description,effective_from,effective_to)
values (st.ethnicity_code,st.ethnicity_description,st.indicator_code,st.indicator_description,current_date(),'9999-12-31')