<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Pandas_UDF not working on shared access mode but works on personal cluster in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74887#M34807</link>
    <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The "dense_vector" column does not output on show(). Instead I get the error below. Any idea why it doesn't work on the shared access mode? Any alternatives?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from fastembed import TextEmbedding, SparseTextEmbedding
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
import pandas as pd
from pyspark.sql.functions import col

@pandas_udf(ArrayType(FloatType()))
def generate_dense_embeddings(contents: pd.Series) -&amp;gt;  pd.Series:
    small_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir="/tmp/local_cache/")
    dense_embeddings_list = small_embedding_model.embed(contents)
    return pd.Series(list(dense_embeddings_list))

df=df.limit(50)
df.show(10)
embeddings = df.withColumn("dense_vector", generate_dense_embeddings(col("content")))
embeddings.show(10)&lt;/LI-CODE&gt;&lt;LI-CODE lang="markup"&gt;Py4JJavaError: An error occurred while calling o474.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 4 times, most recent failure: Lost task 0.3 in stage 21.0 (TID 28) (172.16.2.140 executor 0): org.apache.spark.SparkRuntimeException: [UDF_ERROR.ENV_LOST] Execution of function generate_dense_embeddings(content#73) failed  - the execution environment was lost during execution. This may be caused by the code crashing or the process exiting prematurely.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Tue, 18 Jun 2024 15:53:28 GMT</pubDate>
    <dc:creator>Awoke101</dc:creator>
    <dc:date>2024-06-18T15:53:28Z</dc:date>
    <item>
      <title>Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74887#M34807</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;The "dense_vector" column does not output on show(). Instead I get the error below. Any idea why it doesn't work on the shared access mode? Any alternatives?&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from fastembed import TextEmbedding, SparseTextEmbedding
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
import pandas as pd
from pyspark.sql.functions import col

@pandas_udf(ArrayType(FloatType()))
def generate_dense_embeddings(contents: pd.Series) -&amp;gt;  pd.Series:
    small_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir="/tmp/local_cache/")
    dense_embeddings_list = small_embedding_model.embed(contents)
    return pd.Series(list(dense_embeddings_list))

df=df.limit(50)
df.show(10)
embeddings = df.withColumn("dense_vector", generate_dense_embeddings(col("content")))
embeddings.show(10)&lt;/LI-CODE&gt;&lt;LI-CODE lang="markup"&gt;Py4JJavaError: An error occurred while calling o474.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 4 times, most recent failure: Lost task 0.3 in stage 21.0 (TID 28) (172.16.2.140 executor 0): org.apache.spark.SparkRuntimeException: [UDF_ERROR.ENV_LOST] Execution of function generate_dense_embeddings(content#73) failed  - the execution environment was lost during execution. This may be caused by the code crashing or the process exiting prematurely.&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jun 2024 15:53:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74887#M34807</guid>
      <dc:creator>Awoke101</dc:creator>
      <dc:date>2024-06-18T15:53:28Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74904#M34809</link>
      <description>&lt;P&gt;Not 100% sure but I'm guessing it is because of the &lt;STRONG&gt;cache_dir.&amp;nbsp;&lt;/STRONG&gt;The Shared access clusters are meant for UC and should point to UC volumes instead of local paths. Can you try to change it to a UC volume?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jun 2024 17:58:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74904#M34809</guid>
      <dc:creator>jacovangelder</dc:creator>
      <dc:date>2024-06-18T17:58:51Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74922#M34815</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102253"&gt;@jacovangelder&lt;/a&gt;&amp;nbsp;It throws the same error without cache_dir but will try with UC volumes.&lt;/P&gt;</description>
      <pubDate>Wed, 19 Jun 2024 05:58:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74922#M34815</guid>
      <dc:creator>Awoke101</dc:creator>
      <dc:date>2024-06-19T05:58:06Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74933#M34817</link>
      <description>&lt;P&gt;Got this error along with the one above, even though the model is cached in the UC Volume.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from fastembed import TextEmbedding, SparseTextEmbedding
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
import pandas as pd

@pandas_udf(ArrayType(FloatType()))
def generate_dense_embeddings(contents: pd.Series) -&amp;gt;  pd.Series:
    small_embedding_model = TextEmbedding(model_name='BAAI/bge-base-en',cache_dir="/Volumes/qdrant_cache/default/model_cache/")
    dense_embeddings_list = small_embedding_model.embed(contents)
    return pd.Series(list(dense_embeddings_list))

from pyspark.sql.functions import col

df=df.limit(50)
df.show(10)
embeddings = df.withColumn("dense_vector", generate_dense_embeddings(col("content")))
embeddings.show(10)&lt;/LI-CODE&gt;&lt;LI-CODE lang="markup"&gt;Write not supported
Files in Repos are currently read-only. Please try writing to /tmp/&amp;lt;filename&amp;gt;. Alternatively, contact your Databricks representative to enable programmatically writing to files in a repository.&lt;/LI-CODE&gt;</description>
      <pubDate>Wed, 19 Jun 2024 06:47:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74933#M34817</guid>
      <dc:creator>Awoke101</dc:creator>
      <dc:date>2024-06-19T06:47:23Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74935#M34818</link>
      <description>&lt;P&gt;Hmm interesting, then it's something else.&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;The below code works for me on a Shared access mode cluster. (I don't know what your input dataset looks like):&lt;/P&gt;&lt;LI-CODE lang="python"&gt;df = spark.sql("SELECT '1' as content")

from fastembed import TextEmbedding, SparseTextEmbedding
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType
import pandas as pd
from pyspark.sql.functions import col

@pandas_udf(ArrayType(FloatType()))
def generate_dense_embeddings(contents: pd.Series) -&amp;gt;  pd.Series:
    small_embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5", cache_dir="/tmp/local_cache/")
    dense_embeddings_list = small_embedding_model.embed(contents)
    return pd.Series(list(dense_embeddings_list))

df=df.limit(50)
df.show(10)
embeddings = df.withColumn("dense_vector", generate_dense_embeddings(col("content")))
embeddings.show(10)&lt;/LI-CODE&gt;&lt;P&gt;Are you sure your cluster setup is sufficient for what you're trying to achieve?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 19 Jun 2024 07:11:04 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74935#M34818</guid>
      <dc:creator>jacovangelder</dc:creator>
      <dc:date>2024-06-19T07:11:04Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74945#M34824</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102253"&gt;@jacovangelder&lt;/a&gt;&amp;nbsp;I think the resources are sufficient since it works on the personal cluster which has fewer resources. I tried to run the code you sent on my shared access mode cluster and it still didn't work. Maybe I need to make some changes to the cluster config? This is my current shared cluster config.&lt;/P&gt;&lt;P&gt;EDIT: Can you also share the version of the Python packages? I had to downgrade my numpy version for DBR to work so that may also be the cause of this issue. Using fastembed v0.3.1 doesn't require a numpy downgrade but it still doesn't work with 13.3 LTS. I am getting warnings due to version incompatibilities in pip.&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jun 2024 04:52:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/74945#M34824</guid>
      <dc:creator>Awoke101</dc:creator>
      <dc:date>2024-06-20T04:52:00Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75104#M34866</link>
      <description>&lt;P&gt;For some reason a moderator is removing my pip freeze? No idea why. Maybe too long/spammy for a comment.&lt;BR /&gt;Anyway, I am using DBR 14.3 LTS with Shared Access Mode. I haven't installed any other version apart from fastembed==0.3.1. Included a screenshot of my cluster config too.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jun 2024 06:05:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75104#M34866</guid>
      <dc:creator>jacovangelder</dc:creator>
      <dc:date>2024-06-20T06:05:05Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75106#M34868</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102253"&gt;@jacovangelder&lt;/a&gt;&amp;nbsp;thanks but the error was solved by adding this in my UDF.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;user = os.environ.get("USER")&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jun 2024 06:04:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75106#M34868</guid>
      <dc:creator>Awoke101</dc:creator>
      <dc:date>2024-06-20T06:04:55Z</dc:date>
    </item>
    <item>
      <title>Re: Pandas_UDF not working on shared access mode but works on personal cluster</title>
      <link>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75108#M34869</link>
      <description>&lt;P&gt;Glad it's resolved &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 20 Jun 2024 06:07:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/pandas-udf-not-working-on-shared-access-mode-but-works-on/m-p/75108#M34869</guid>
      <dc:creator>jacovangelder</dc:creator>
      <dc:date>2024-06-20T06:07:31Z</dc:date>
    </item>
  </channel>
</rss>

