<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic BUG: Unity Catalog kills UDF in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/bug-unity-catalog-kills-udf/m-p/67977#M33501</link>
    <description>&lt;P&gt;We have UDFs in a few locations, and today we noticed that their performance has collapsed. This seems to be caused by Unity Catalog.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Test environment 1:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Databricks Runtime Environment: 14.3 / 15.1&lt;/LI&gt;&lt;LI&gt;Compute: 1 master, 4 nodes&lt;/LI&gt;&lt;LI&gt;Policy: Unrestricted&lt;/LI&gt;&lt;LI&gt;Access Mode: Shared&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;STRONG&gt;Test environment 2:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Databricks Runtime Environment: 14.3 / 15.1&lt;/LI&gt;&lt;LI&gt;Compute: Single Node&lt;/LI&gt;&lt;LI&gt;Policy: Unrestricted&lt;/LI&gt;&lt;LI&gt;Access Mode: Single user&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;STRONG&gt;Code:&lt;/STRONG&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import pandas as pd
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Create test dataframe:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
sdf = spark.createDataFrame(df)
sdf.writeTo('test.playground.abcd').createOrReplace()

# Now load from unity catalog and apply UDF:
def squared(x):
    return x * x

squared_udf = F.udf(squared, T.LongType())

sdf_2 = spark.read.table('test.playground.abcd')
sdf_2.withColumn('sq', squared_udf('A')).display()&lt;/LI-CODE&gt;&lt;P&gt;&lt;STRONG&gt;Performance:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Test environment 1: 2 min 55s&lt;/LI&gt;&lt;LI&gt;Test environment 2: 8s&lt;/LI&gt;&lt;/UL&gt;</description>
    <pubDate>Thu, 02 May 2024 21:19:11 GMT</pubDate>
    <dc:creator>Erik_L</dc:creator>
    <dc:date>2024-05-02T21:19:11Z</dc:date>
    <item>
      <title>BUG: Unity Catalog kills UDF</title>
      <link>https://community.databricks.com/t5/data-engineering/bug-unity-catalog-kills-udf/m-p/67977#M33501</link>
      <description>&lt;P&gt;We have UDFs in a few locations, and today we noticed that their performance has collapsed. This seems to be caused by Unity Catalog.&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Test environment 1:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Databricks Runtime Environment: 14.3 / 15.1&lt;/LI&gt;&lt;LI&gt;Compute: 1 master, 4 nodes&lt;/LI&gt;&lt;LI&gt;Policy: Unrestricted&lt;/LI&gt;&lt;LI&gt;Access Mode: Shared&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;STRONG&gt;Test environment 2:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Databricks Runtime Environment: 14.3 / 15.1&lt;/LI&gt;&lt;LI&gt;Compute: Single Node&lt;/LI&gt;&lt;LI&gt;Policy: Unrestricted&lt;/LI&gt;&lt;LI&gt;Access Mode: Single user&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;&lt;STRONG&gt;Code:&lt;/STRONG&gt;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;import pandas as pd
import numpy as np
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Create test dataframe:
df = pd.DataFrame(np.random.randint(0,100,size=(100, 4)), columns=list('ABCD'))
sdf = spark.createDataFrame(df)
sdf.writeTo('test.playground.abcd').createOrReplace()

# Now load from unity catalog and apply UDF:
def squared(x):
    return x * x

squared_udf = F.udf(squared, T.LongType())

sdf_2 = spark.read.table('test.playground.abcd')
sdf_2.withColumn('sq', squared_udf('A')).display()&lt;/LI-CODE&gt;&lt;P&gt;&lt;STRONG&gt;Performance:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Test environment 1: 2 min 55s&lt;/LI&gt;&lt;LI&gt;Test environment 2: 8s&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Thu, 02 May 2024 21:19:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/bug-unity-catalog-kills-udf/m-p/67977#M33501</guid>
      <dc:creator>Erik_L</dc:creator>
      <dc:date>2024-05-02T21:19:11Z</dc:date>
    </item>
  </channel>
</rss>

