<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: can we use spark-xml with delta live tables ? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23327#M16078</link>
    <description>&lt;P&gt;This is a tough one since the only magic command available is %pip, but spark-xml is a maven package. The only way I found to do this was to install the spark-xml jar from the maven repo using the databricks-cli. You can reference the cluster ID using spark.conf.get("spark.databricks.clusterUsageTags.clusterId"), something not well documented in the databricks cli documentation. This is not secure/production ready, but is a good starting point.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Found this post last week and couldn't find a solution. So here is my submission &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;@dlt.table(
  name="xmldata",
  comment="Some XML Data")
def dlt_xmldata():    
&amp;nbsp;
    host = ""
    token = ""
    clusterid = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
    path = ""
    rowTag=""
&amp;nbsp;
    import subprocess
&amp;nbsp;
    pysh = """
    pip install databricks-cli
    rm ~/.databrickscfg
    touch ~/.databrickscfg
    echo "[DEFAULT]" &amp;gt;&amp;gt; ~/.databrickscfg
    echo "host = {1}" &amp;gt;&amp;gt; ~/.databrickscfg
    echo "token = {2}" &amp;gt;&amp;gt; ~/.databrickscfg
    export DATABRICKS_CONFIG_FILE=~/.databrickscfg
&amp;nbsp;
    databricks libraries install --cluster-id {0} --maven-coordinates "com.databricks:spark-xml_2.12:0.14.0"
    databricks libraries list --cluster-id {0}
    """
    
    subprocess.run(pysh.format(clusterid,host,token),
        shell=True, check=True,
        executable='/bin/bash')
&amp;nbsp;
    return spark.read.format("xml").option("rowTag",rowTag).option("nullValue","").load(path)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Mon, 16 May 2022 17:48:18 GMT</pubDate>
    <dc:creator>Zachary_Higgins</dc:creator>
    <dc:date>2022-05-16T17:48:18Z</dc:date>
    <item>
      <title>can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23322#M16073</link>
      <description>&lt;P&gt;Hi&lt;/P&gt;&lt;P&gt;Is there a way to use spark-xml with delta live tables (Azure Databricks)?&lt;/P&gt;&lt;P&gt;I've tried something like this without any success for the moment&lt;/P&gt;&lt;P&gt;CREATE LIVE TABLE df17&amp;nbsp;&lt;/P&gt;&lt;P&gt;USING com.databricks.spark.xml&lt;/P&gt;&lt;P&gt;AS SELECT * FROM cloud_files("/mnt/dev/bronze/xml/s4327994", "xml")&lt;/P&gt;&lt;P&gt;Can we load this library with dlt?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 08 Apr 2022 06:29:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23322#M16073</guid>
      <dc:creator>JeromeB974</dc:creator>
      <dc:date>2022-04-08T06:29:31Z</dc:date>
    </item>
    <item>
      <title>Re: can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23323#M16074</link>
      <description>&lt;P&gt;@Jerome BASTIDE​&amp;nbsp;,&amp;nbsp;Custom implementations are more straightforward in python. You can read whatever. Just return DataFrame.&lt;/P&gt;&lt;P&gt;Autoloader doesn't support XML, so you need to load XML the traditional way.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;@dlt.view
def dlt_dev_bronze():
  return spark.read.option("rowTag", "tag").xml("dbfs:/mnt/dev/bronze/xml/s4327994")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 11 Apr 2022 20:38:40 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23323#M16074</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-04-11T20:38:40Z</dc:date>
    </item>
    <item>
      <title>Re: can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23325#M16076</link>
      <description>&lt;P&gt;Hi&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;No, I didn't succeed in making it work in either SQL or Python.&lt;/P&gt;&lt;P&gt;It seems to require spark-xml, and I didn't find a way to use it with delta live tables.&lt;/P&gt;&lt;P&gt;I will try autoloader in binary.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards.&lt;/P&gt;</description>
      <pubDate>Thu, 28 Apr 2022 16:13:29 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23325#M16076</guid>
      <dc:creator>JeromeB974</dc:creator>
      <dc:date>2022-04-28T16:13:29Z</dc:date>
    </item>
    <item>
      <title>Re: can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23327#M16078</link>
      <description>&lt;P&gt;This is a tough one since the only magic command available is %pip, but spark-xml is a maven package. The only way I found to do this was to install the spark-xml jar from the maven repo using the databricks-cli. You can reference the cluster ID using spark.conf.get("spark.databricks.clusterUsageTags.clusterId"), something not well documented in the databricks cli documentation. This is not secure/production ready, but is a good starting point.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Found this post last week and couldn't find a solution. So here is my submission &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;@dlt.table(
  name="xmldata",
  comment="Some XML Data")
def dlt_xmldata():    
&amp;nbsp;
    host = ""
    token = ""
    clusterid = spark.conf.get("spark.databricks.clusterUsageTags.clusterId")
    path = ""
    rowTag=""
&amp;nbsp;
    import subprocess
&amp;nbsp;
    pysh = """
    pip install databricks-cli
    rm ~/.databrickscfg
    touch ~/.databrickscfg
    echo "[DEFAULT]" &amp;gt;&amp;gt; ~/.databrickscfg
    echo "host = {1}" &amp;gt;&amp;gt; ~/.databrickscfg
    echo "token = {2}" &amp;gt;&amp;gt; ~/.databrickscfg
    export DATABRICKS_CONFIG_FILE=~/.databrickscfg
&amp;nbsp;
    databricks libraries install --cluster-id {0} --maven-coordinates "com.databricks:spark-xml_2.12:0.14.0"
    databricks libraries list --cluster-id {0}
    """
    
    subprocess.run(pysh.format(clusterid,host,token),
        shell=True, check=True,
        executable='/bin/bash')
&amp;nbsp;
    return spark.read.format("xml").option("rowTag",rowTag).option("nullValue","").load(path)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 16 May 2022 17:48:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23327#M16078</guid>
      <dc:creator>Zachary_Higgins</dc:creator>
      <dc:date>2022-05-16T17:48:18Z</dc:date>
    </item>
    <item>
      <title>Re: can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23328#M16079</link>
      <description>&lt;P&gt;Also need to give credit where credit is due regarding the idea to setup databricks-cli from the notebook: &lt;A href="https://stackoverflow.com/questions/55837773/how-to-fix-command-not-found-error-in-databricks-when-creating-a-secret-scope" alt="https://stackoverflow.com/questions/55837773/how-to-fix-command-not-found-error-in-databricks-when-creating-a-secret-scope" target="_blank"&gt;How to fix 'command not found' error in Databricks when creating a secret scope - Stack Overflow&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 16 May 2022 18:57:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23328#M16079</guid>
      <dc:creator>Zachary_Higgins</dc:creator>
      <dc:date>2022-05-16T18:57:35Z</dc:date>
    </item>
    <item>
      <title>Re: can we use spark-xml with delta live tables ?</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23330#M16081</link>
      <description>&lt;P&gt;Just following up. My submission is a bad solution and shouldn't be implemented. This broke the moment we used %pip to install additional libraries.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I sent our wishes to the Databricks reps we work with, but at this time there doesn't seem to be a good way to support XML. In our case, we added a workflow task (scheduled job) to load these XML documents into a delta table, and work the delta tables as one of the sources in our DLT pipeline.&lt;/P&gt;</description>
      <pubDate>Wed, 01 Jun 2022 20:42:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-use-spark-xml-with-delta-live-tables/m-p/23330#M16081</guid>
      <dc:creator>Zachary_Higgins</dc:creator>
      <dc:date>2022-06-01T20:42:44Z</dc:date>
    </item>
  </channel>
</rss>

