<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Very Slow UDF Execution on One Cluster Compared to Another with Similar Config in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121449#M46454</link>
    <description>&lt;P class=""&gt;Hi all,&lt;/P&gt;&lt;P class=""&gt;I’m experiencing a significant slowdown in Python UDF execution on a particular cluster. The same code runs much faster on another cluster with very similar hardware and policy settings.&lt;BR /&gt;&lt;BR /&gt;This cell takes 2–3 minutes on the problematic cluster, but only 10–30 seconds on the previous cluster we had in the workspace without Unity Catalog (UC).&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# example from https://docs.databricks.com/aws/en/udf/unity-catalog

def squared(s):
    return s * s

spark.udf.register("squaredWithPython", squared)
spark.range(1, 20).createOrReplaceTempView("test")

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

squared_udf = udf(squared, LongType())
df = spark.table("test")
display(df.select("id", squared_udf("id").alias("id_squared")))

# This cell takes 2–3 minutes on the problematic cluster, but only 10–30 seconds on the other.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;The cell with an intentionally incorrect schema error runs instantly with an error the first time, but starting from the second or third run, it can take up to 10 minutes to fail.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import udf
from pyspark.sql import types as T

def test_func():
    return 0     

# correct schema
# schema = T.IntegerType()

# schema with intentional error
schema = T.StructType()

test_udf = udf(test_func, schema)

df_test = spark.createDataFrame([("test",)], ["col1"])
display(
    df_test.withColumn("udf_result", test_udf())
)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;&lt;STRONG&gt;Cluster config:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;Policy: Unrestricted&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Node type: rd-fleet.xlarge (32 GB, 4 Cores)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Workers: Min 1, Max 2 (current: 1)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Driver: rd-fleet.xlarge (32 GB, 4 Cores)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Access mode: Standard (Shared)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Runtime: 15.4 LTS (Spark 3.5.0, Scala 2.12)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Autoscaling: Enabled&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Photon: Off&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Auto-termination: 20 min&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P class=""&gt;&lt;STRONG&gt;Notes:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;All timings are observed when the cluster is already running and there are no other jobs or notebooks running in parallel.&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;No matter what I tried — such as renaming the UDF or using the &lt;SPAN class=""&gt;udf&lt;/SPAN&gt; decorator — after the first quick run with the schema error, all further runs of such cell take an extremely long time before the error is shown.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Detaching and re-attaching the notebook does not help. 
I need to restart the cluster to resolve the issue for a single cell run, but the problem returns after running the cell again.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;I don’t have access to cluster logs on the problematic cluster, but I can create new clusters for jobs and view their logs.&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;I tried creating a new cluster with similar default configurations.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;I observed the same issue with the simple code in the first code block above: it took 2–3 minutes to run.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;When running as a job, it fails on a JVM exception, so I haven’t found a way to make the cells with the wrong schema run twice to test if the long computation time occurs on subsequent runs.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P class=""&gt;&lt;STRONG&gt;Questions:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;What can cause such slowdowns (10x or more) for simple UDFs or error feedback?&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Which logs/events should I look for if I can get job logs from runs where the issue is also observed?&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Any tips for diagnosing this further?&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;</description>
    <pubDate>Wed, 11 Jun 2025 10:38:25 GMT</pubDate>
    <dc:creator>alexbarev</dc:creator>
    <dc:date>2025-06-11T10:38:25Z</dc:date>
    <item>
      <title>Very Slow UDF Execution on One Cluster Compared to Another with Similar Config</title>
      <link>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121449#M46454</link>
      <description>&lt;P class=""&gt;Hi all,&lt;/P&gt;&lt;P class=""&gt;I’m experiencing a significant slowdown in Python UDF execution on a particular cluster. The same code runs much faster on another cluster with very similar hardware and policy settings.&lt;BR /&gt;&lt;BR /&gt;This cell takes 2–3 minutes on the problematic cluster, but only 10–30 seconds on the previous cluster we had in the workspace without Unity Catalog (UC).&lt;/P&gt;&lt;LI-CODE lang="python"&gt;# example from https://docs.databricks.com/aws/en/udf/unity-catalog

def squared(s):
    return s * s

spark.udf.register("squaredWithPython", squared)
spark.range(1, 20).createOrReplaceTempView("test")

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

squared_udf = udf(squared, LongType())
df = spark.table("test")
display(df.select("id", squared_udf("id").alias("id_squared")))

# This cell takes 2–3 minutes on the problematic cluster, but only 10–30 seconds on the other.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;The cell with an intentionally incorrect schema error runs instantly with an error the first time, but starting from the second or third run, it can take up to 10 minutes to fail.&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.functions import udf
from pyspark.sql import types as T

def test_func():
    return 0     

# correct schema
# schema = T.IntegerType()

# schema with intentional error
schema = T.StructType()

test_udf = udf(test_func, schema)

df_test = spark.createDataFrame([("test",)], ["col1"])
display(
    df_test.withColumn("udf_result", test_udf())
)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;&amp;nbsp;&lt;/P&gt;&lt;P class=""&gt;&lt;STRONG&gt;Cluster config:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;Policy: Unrestricted&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Node type: rd-fleet.xlarge (32 GB, 4 Cores)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Workers: Min 1, Max 2 (current: 1)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Driver: rd-fleet.xlarge (32 GB, 4 Cores)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Access mode: Standard (Shared)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Runtime: 15.4 LTS (Spark 3.5.0, Scala 2.12)&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Autoscaling: Enabled&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Photon: Off&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Auto-termination: 20 min&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P class=""&gt;&lt;STRONG&gt;Notes:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;All timings are observed when the cluster is already running and there are no other jobs or notebooks running in parallel.&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;No matter what I tried — such as renaming the UDF or using the &lt;SPAN class=""&gt;udf&lt;/SPAN&gt; decorator — after the first quick run with the schema error, all further runs of such cell take an extremely long time before the error is shown.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Detaching and re-attaching the notebook does not help. 
I need to restart the cluster to resolve the issue for a single cell run, but the problem returns after running the cell again.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;I don’t have access to cluster logs on the problematic cluster, but I can create new clusters for jobs and view their logs.&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;I tried creating a new cluster with similar default configurations.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;I observed the same issue with the simple code in the first code block above: it took 2–3 minutes to run.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;When running as a job, it fails on a JVM exception, so I haven’t found a way to make the cells with the wrong schema run twice to test if the long computation time occurs on subsequent runs.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P class=""&gt;&lt;STRONG&gt;Questions:&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P class=""&gt;What can cause such slowdowns (10x or more) for simple UDFs or error feedback?&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Which logs/events should I look for if I can get job logs from runs where the issue is also observed?&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P class=""&gt;Any tips for diagnosing this further?&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Wed, 11 Jun 2025 10:38:25 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121449#M46454</guid>
      <dc:creator>alexbarev</dc:creator>
      <dc:date>2025-06-11T10:38:25Z</dc:date>
    </item>
    <item>
      <title>Re: Very Slow UDF Execution on One Cluster Compared to Another with Similar Config</title>
      <link>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121473#M46462</link>
      <description>&lt;P&gt;Our infra team told me we might be facing a strange Databricks bug. It happens only in our team workspace. Other teams do not experience the bug with clusters that have identical configuration.&lt;BR /&gt;&lt;BR /&gt;Also when I run jobs with identical settings as in our cluster, but&amp;nbsp;&lt;STRONG&gt;Policy: Job Compute - Single node&lt;/STRONG&gt; and&lt;STRONG&gt; Access mode: Dedicated (formerly Single user) –&lt;/STRONG&gt; the issue disappears.&lt;STRONG&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/STRONG&gt;But when I use here&amp;nbsp;&lt;STRONG&gt;Access mode: Standard (formerly: Shared) – &lt;/STRONG&gt;as in our cluster – the problem persists.&lt;/P&gt;</description>
      <pubDate>Wed, 11 Jun 2025 14:24:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121473#M46462</guid>
      <dc:creator>alexbarev</dc:creator>
      <dc:date>2025-06-11T14:24:50Z</dc:date>
    </item>
    <item>
      <title>Re: Very Slow UDF Execution on One Cluster Compared to Another with Similar Config</title>
      <link>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121590#M46498</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/168328"&gt;@alexbarev&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;The slowdown is likely due to using Python UDFs on a Shared (Standard) access mode cluster with Unity Catalog, which adds extra security and isolation overhead. Using a Dedicated access mode cluster removes the extra isolation overhead from Unity Catalog, which typically resolves the UDF performance issues.&lt;/P&gt;&lt;P&gt;To further improve performance:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;Enable spark.sql.execution.pythonUDF.arrow.enabled = true in cluster settings.&lt;/LI&gt;&lt;LI&gt;Check the Spark UI for task delays or scheduler bottlenecks related to UDFs.&lt;/LI&gt;&lt;LI&gt;Review job logs for high serialization/deserialization times.&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Thu, 12 Jun 2025 11:11:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/very-slow-udf-execution-on-one-cluster-compared-to-another-with/m-p/121590#M46498</guid>
      <dc:creator>SP_6721</dc:creator>
      <dc:date>2025-06-12T11:11:35Z</dc:date>
    </item>
  </channel>
</rss>

