<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: XML DLT Autoloader - Ingestion of XML Files in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71518#M34336</link>
    <description>&lt;P&gt;The Assistant recommended&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
replace(ABSENDER, '&amp;amp;', '&amp;amp;amp;')&lt;/LI-CODE&gt;&lt;P&gt;the output stays:&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;NAME: "Informatik GmbH &amp;amp; Co.KG"&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Mon, 03 Jun 2024 19:10:51 GMT</pubDate>
    <dc:creator>avrm91</dc:creator>
    <dc:date>2024-06-03T19:10:51Z</dc:date>
    <item>
      <title>XML DLT Autoloader - Ingestion of XML Files</title>
      <link>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71193#M34267</link>
      <description>&lt;P&gt;I want to ingest multiple XML files with varying but similar structures without defining a schema.&lt;/P&gt;&lt;P&gt;For example:&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;&amp;lt;?xml version="1.0" encoding="ISO-8859-1"?&amp;gt;
&amp;lt;LIEFERUNG&amp;gt;
  &amp;lt;ABSENDER&amp;gt;
    &amp;lt;RZLZ&amp;gt;R00000001&amp;lt;/RZLZ&amp;gt;
    &amp;lt;NAME&amp;gt;Informatik GmbH &amp;lt;/NAME&amp;gt;
    &amp;lt;STRASSE&amp;gt;Muster-Allee 90&amp;lt;/STRASSE&amp;gt;
    &amp;lt;PLZ&amp;gt;60486&amp;lt;/PLZ&amp;gt;
    &amp;lt;ORT&amp;gt;Frankfurt a.M.&amp;lt;/ORT&amp;gt;
    &amp;lt;LAND&amp;gt;DE&amp;lt;/LAND&amp;gt;
    &amp;lt;KONTAKT&amp;gt;
      &amp;lt;ANREDE&amp;gt;Herr&amp;lt;/ANREDE&amp;gt;
      &amp;lt;VORNAME&amp;gt;Max&amp;lt;/VORNAME&amp;gt;
      &amp;lt;ZUNAME&amp;gt;Mustermann&amp;lt;/ZUNAME&amp;gt;
      &amp;lt;TELEFON&amp;gt;xxxxx/xxxx-xx&amp;lt;/TELEFON&amp;gt;
      &amp;lt;FAX&amp;gt;xxx/xxxxx-xx&amp;lt;/FAX&amp;gt;
      &amp;lt;EMAIL&amp;gt;max.mustermann@informatik.de&amp;lt;/EMAIL&amp;gt;
    &amp;lt;/KONTAKT&amp;gt;
  &amp;lt;/ABSENDER&amp;gt;
  &amp;lt;ERSTELLER&amp;gt;
    &amp;lt;BLZ&amp;gt;0000000000000&amp;lt;/BLZ&amp;gt;
    &amp;lt;NAME&amp;gt;MUSTERBANK&amp;lt;/NAME&amp;gt;
    &amp;lt;STRASSE&amp;gt;MUSTER STR. 1&amp;lt;/STRASSE&amp;gt;
    &amp;lt;PLZ&amp;gt;12345&amp;lt;/PLZ&amp;gt;
    &amp;lt;ORT&amp;gt;Musterhausen&amp;lt;/ORT&amp;gt;
    &amp;lt;LAND&amp;gt;DE&amp;lt;/LAND&amp;gt;
    &amp;lt;KONTAKT&amp;gt;
      &amp;lt;ANREDE&amp;gt;Frau&amp;lt;/ANREDE&amp;gt;
      &amp;lt;VORNAME&amp;gt;Maxime&amp;lt;/VORNAME&amp;gt;
      &amp;lt;ZUNAME&amp;gt;Musterdame&amp;lt;/ZUNAME&amp;gt;
      &amp;lt;ABTEILUNG&amp;gt;Controlling&amp;lt;/ABTEILUNG&amp;gt;
      &amp;lt;TELEFON&amp;gt;xxxxx xx-xxxxx&amp;lt;/TELEFON&amp;gt;
      &amp;lt;FAX&amp;gt;xxxxx xx-xxxxx&amp;lt;/FAX&amp;gt;
      &amp;lt;EMAIL&amp;gt;maxime.musterdame@aol.de&amp;lt;/EMAIL&amp;gt;
    &amp;lt;/KONTAKT&amp;gt;
  &amp;lt;/ERSTELLER&amp;gt;
  &amp;lt;MELDUNG erstellzeit="2020-09-02T11:23:33"&amp;gt;
    ..... 
  &amp;lt;/MELDUNG&amp;gt;
&amp;lt;/LIEFERUNG&amp;gt;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I attempted to load the XML within DLT pipelines using the following code:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
CREATE OR REFRESH STREAMING TABLE conformed
COMMENT "BRONZE" 
AS
SELECT
  *
FROM
  cloud_files(
    "&amp;lt;file-path&amp;gt;",
    "xml",
    map(
      "rowTag",
      "LIEFERUNG",
      "mode",
      "FAILFAST",
      "inferSchema",
      "true",
      "encoding",
      "ISO-8859-1",
      "ignoreNamespace",
      "false",
      "timestampFormat",
      "yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]",
      "timestampNTZFormat",
      "yyyy-MM-dd'T'HH:mm:ss[.SSS]",
      "dateFormat",
      "yyyy-MM-dd",
      "locale",
      "de-DE",
      "readerCaseSensitive",
      "true"
    )
  ) as src&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The result is a table with 3 columns:&lt;/P&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;SPAN class=""&gt;&lt;SPAN class=""&gt;&lt;SPAN class=""&gt;Column&amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; &amp;nbsp; Type&lt;BR /&gt;ABSENDER&amp;nbsp; &amp;nbsp;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;SPAN class=""&gt;&lt;SPAN class=""&gt;&lt;SPAN class=""&gt;string&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;DIV class=""&gt;&lt;SPAN&gt;ERSTELLER&amp;nbsp;&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;string&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN&gt;MELDUNG&amp;nbsp; &amp;nbsp; &amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;string&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN&gt;&lt;SPAN&gt;Within those columns is XML Code, e.g.&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;DIV class=""&gt;&lt;SPAN&gt;&lt;SPAN&gt;ABSENDER:&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;/DIV&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;&amp;lt;RZLZ&amp;gt;R00000001&amp;lt;/RZLZ&amp;gt;
    &amp;lt;NAME&amp;gt;Informatik GmbH &amp;lt;/NAME&amp;gt;
    &amp;lt;STRASSE&amp;gt;Muster-Allee 90&amp;lt;/STRASSE&amp;gt;
    &amp;lt;PLZ&amp;gt;60486&amp;lt;/PLZ&amp;gt;
    &amp;lt;ORT&amp;gt;Frankfurt a.M.&amp;lt;/ORT&amp;gt;
    &amp;lt;LAND&amp;gt;DE&amp;lt;/LAND&amp;gt;
    &amp;lt;KONTAKT&amp;gt;
      &amp;lt;ANREDE&amp;gt;Herr&amp;lt;/ANREDE&amp;gt;
      &amp;lt;VORNAME&amp;gt;Max&amp;lt;/VORNAME&amp;gt;
      &amp;lt;ZUNAME&amp;gt;Mustermann&amp;lt;/ZUNAME&amp;gt;
      &amp;lt;TELEFON&amp;gt;xxxxx/xxxx-xx&amp;lt;/TELEFON&amp;gt;
      &amp;lt;FAX&amp;gt;xxx/xxxxx-xx&amp;lt;/FAX&amp;gt;
      &amp;lt;EMAIL&amp;gt;max.mustermann@informatik.de&amp;lt;/EMAIL&amp;gt;
    &amp;lt;/KONTAKT&amp;gt;&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Now the questions:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Is it best practice for BRONZE to store the XML like this, since it will have the fewest "issues" with schema changes because the schema only changes within these tags?&lt;/LI&gt;&lt;LI&gt;Nevertheless, how can I explode the XML in the BRONZE DLT step with SQL without defining the schema?&lt;/LI&gt;&lt;LI&gt;And how would I explode the XML within SILVER without defining the schema?&lt;/LI&gt;&lt;/UL&gt;&lt;P data-unlink="true"&gt;I already tried the following code:&amp;nbsp;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;SELECT
  from_xml(CONCAT('&amp;lt;ABSENDER&amp;gt;', ABSENDER, '&amp;lt;/ABSENDER&amp;gt;'), 
    schema_of_xml(CONCAT('&amp;lt;ABSENDER&amp;gt;', '&amp;lt;RZLZ&amp;gt;R00000001&amp;lt;/RZLZ&amp;gt;
      &amp;lt;NAME&amp;gt;Informatik GmbH &amp;lt;/NAME&amp;gt;
      &amp;lt;STRASSE&amp;gt;Muster-Allee 90&amp;lt;/STRASSE&amp;gt;
      &amp;lt;PLZ&amp;gt;60486&amp;lt;/PLZ&amp;gt;
      &amp;lt;ORT&amp;gt;Frankfurt a.M.&amp;lt;/ORT&amp;gt;
      &amp;lt;LAND&amp;gt;DE&amp;lt;/LAND&amp;gt;
      &amp;lt;KONTAKT&amp;gt;
        &amp;lt;ANREDE&amp;gt;Herr&amp;lt;/ANREDE&amp;gt;
        &amp;lt;VORNAME&amp;gt;Max&amp;lt;/VORNAME&amp;gt;
        &amp;lt;ZUNAME&amp;gt;Mustermann&amp;lt;/ZUNAME&amp;gt;
        &amp;lt;TELEFON&amp;gt;xxxxx/xxxx-xx&amp;lt;/TELEFON&amp;gt;
        &amp;lt;FAX&amp;gt;xxx/xxxxx-xx&amp;lt;/FAX&amp;gt;
        &amp;lt;EMAIL&amp;gt;max.mustermann@informatik.de&amp;lt;/EMAIL&amp;gt;
      &amp;lt;/KONTAKT&amp;gt;', '&amp;lt;/ABSENDER&amp;gt;')))  AS ABSENDER_XML
FROM conformed&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;But it gave me just a bad result with most of the attributes not mapped.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;object
KONTAKT: null
LAND: null
NAME: null
ORT: null
PLZ: null
RZLZ: "R00000001"
STRASSE: null&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Fri, 31 May 2024 14:01:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71193#M34267</guid>
      <dc:creator>avrm91</dc:creator>
      <dc:date>2024-05-31T14:01:43Z</dc:date>
    </item>
    <item>
      <title>Re: XML DLT Autoloader - Ingestion of XML Files</title>
      <link>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71489#M34323</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/9"&gt;@Retired_mod&lt;/a&gt;&amp;nbsp;Thanks a lot.&lt;BR /&gt;I found an issue in&amp;nbsp;from_xml&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;function.&lt;BR /&gt;I posted above:&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;SELECT
  from_xml(CONCAT('&amp;lt;ABSENDER&amp;gt;', ABSENDER, '&amp;lt;/ABSENDER&amp;gt;'), 
    schema_of_xml('
		&amp;lt;ABSENDER&amp;gt;
			&amp;lt;RZLZ&amp;gt;R00000001&amp;lt;/RZLZ&amp;gt;
			&amp;lt;NAME&amp;gt;Informatik GmbH&amp;lt;/NAME&amp;gt;
			&amp;lt;STRASSE&amp;gt;Muster-Allee 90&amp;lt;/STRASSE&amp;gt;
			&amp;lt;PLZ&amp;gt;60486&amp;lt;/PLZ&amp;gt;
			&amp;lt;ORT&amp;gt;Frankfurt a.M.&amp;lt;/ORT&amp;gt;
			&amp;lt;LAND&amp;gt;DE&amp;lt;/LAND&amp;gt;
			&amp;lt;KONTAKT&amp;gt;
				&amp;lt;ANREDE&amp;gt;Herr&amp;lt;/ANREDE&amp;gt;
				&amp;lt;VORNAME&amp;gt;Max&amp;lt;/VORNAME&amp;gt;
				&amp;lt;ZUNAME&amp;gt;Mustermann&amp;lt;/ZUNAME&amp;gt;
				&amp;lt;TELEFON&amp;gt;xxxxx/xxxx-xx&amp;lt;/TELEFON&amp;gt;
				&amp;lt;FAX&amp;gt;xxx/xxxxx-xx&amp;lt;/FAX&amp;gt;
				&amp;lt;EMAIL&amp;gt;max.mustermann@informatik.de&amp;lt;/EMAIL&amp;gt;
			&amp;lt;/KONTAKT&amp;gt;
		&amp;lt;/ABSENDER&amp;gt;'))  AS ABSENDER_XML
FROM conformed&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;But it gave me just a bad result with most of the attributes not mapped.&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;object
KONTAKT: null
LAND: null
NAME: null
ORT: null
PLZ: null
RZLZ: "R00000001"
STRASSE: null&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;This is not the real data as I "anonymized" the data before posting here.&lt;BR /&gt;So, the reason is that the real&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;&amp;lt;NAME&amp;gt;&lt;/SPAN&gt;&lt;SPAN&gt;Informatik GmbH &lt;/SPAN&gt;&lt;SPAN&gt;&amp;amp;&lt;/SPAN&gt;&lt;SPAN&gt; Co.KG&lt;/SPAN&gt;&lt;SPAN&gt;&amp;lt;/NAME&amp;gt; contains an "&amp;amp;", which makes the from_xml&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;function fail.&amp;nbsp;&lt;BR /&gt;I used a simple replace to "fix" it (to make it work for now):&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;%sql
replace(ABSENDER, '&amp;amp;', '')&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Now the object looks like this:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;object
KONTAKT: 
	ANREDE: "Herr"
	EMAIL: "max.mustermann@informatik.de"
	FAX: "xxx/xxxxx-xx"
	TELEFON: "xxxxx/xxxx-xx"
	VORNAME: "Max"
	ZUNAME: "Mustermann"
LAND: "DE"
NAME: "Informatik GmbH  Co.KG"
ORT: "Frankfurt a.M."
PLZ: "60486"
RZLZ: "R00000001"
STRASSE: "Muster-Allee 90"&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;&lt;SPAN&gt;I found it after several hours of debugging. Sadly, it gave me no error; it just did not map the object correctly.&lt;BR /&gt;I hope you can address this, as the documentation says this feature is in public preview:&lt;BR /&gt;&lt;A href="https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/functions/from_xml" target="_blank" rel="noopener"&gt;from_xml function - Azure Databricks - Databricks SQL | Microsoft Learn&lt;/A&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 03 Jun 2024 14:29:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71489#M34323</guid>
      <dc:creator>avrm91</dc:creator>
      <dc:date>2024-06-03T14:29:35Z</dc:date>
    </item>
    <item>
      <title>Re: XML DLT Autoloader - Ingestion of XML Files</title>
      <link>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71518#M34336</link>
      <description>&lt;P&gt;The Assistant recommended&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;%sql
replace(ABSENDER, '&amp;amp;', '&amp;amp;amp;')&lt;/LI-CODE&gt;&lt;P&gt;the output stays:&lt;/P&gt;&lt;LI-CODE lang="markup"&gt;NAME: "Informatik GmbH &amp;amp; Co.KG"&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 03 Jun 2024 19:10:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/xml-dlt-autoloader-ingestion-of-xml-files/m-p/71518#M34336</guid>
      <dc:creator>avrm91</dc:creator>
      <dc:date>2024-06-03T19:10:51Z</dc:date>
    </item>
  </channel>
</rss>

