<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Can't use pyspark bucketizer in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133485#M4334</link>
    <description>&lt;P&gt;As title suggests, I am struggling to use pyspark bucketizer as I repeatedly get the following error:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;File &amp;lt;command-8301298062763331&amp;gt;, line 4
      2 from pyspark.ml.feature import Bucketizer
      3 spark = SparkSession.builder.appName("test").getOrCreate()
----&amp;gt; 4 bucketizer = Bucketizer()
File /databricks/python/lib/python3.12/site-packages/pyspark/ml/wrapper.py:87, in JavaWrapper._new_java_obj(java_class, *args)
     84 from pyspark.core.context import SparkContext
     86 sc = SparkContext._active_spark_context
---&amp;gt; 87 assert sc is not None
     89 java_obj = _jvm()
     90 for name in java_class.split("."):&lt;/LI-CODE&gt;&lt;DIV class=""&gt;Minimal reproducible example on serverless compute:&lt;/DIV&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer
spark = SparkSession.builder.appName("test").getOrCreate()
bucketizer = Bucketizer()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 01 Oct 2025 17:25:12 GMT</pubDate>
    <dc:creator>wise_centipede</dc:creator>
    <dc:date>2025-10-01T17:25:12Z</dc:date>
    <item>
      <title>Can't use pyspark bucketizer</title>
      <link>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133485#M4334</link>
      <description>&lt;P&gt;As title suggests, I am struggling to use pyspark bucketizer as I repeatedly get the following error:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;File &amp;lt;command-8301298062763331&amp;gt;, line 4
      2 from pyspark.ml.feature import Bucketizer
      3 spark = SparkSession.builder.appName("test").getOrCreate()
----&amp;gt; 4 bucketizer = Bucketizer()
File /databricks/python/lib/python3.12/site-packages/pyspark/ml/wrapper.py:87, in JavaWrapper._new_java_obj(java_class, *args)
     84 from pyspark.core.context import SparkContext
     86 sc = SparkContext._active_spark_context
---&amp;gt; 87 assert sc is not None
     89 java_obj = _jvm()
     90 for name in java_class.split("."):&lt;/LI-CODE&gt;&lt;DIV class=""&gt;Minimal reproducible example on serverless compute:&lt;/DIV&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer
spark = SparkSession.builder.appName("test").getOrCreate()
bucketizer = Bucketizer()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 01 Oct 2025 17:25:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133485#M4334</guid>
      <dc:creator>wise_centipede</dc:creator>
      <dc:date>2025-10-01T17:25:12Z</dc:date>
    </item>
    <item>
      <title>Re: Can't use pyspark bucketizer</title>
      <link>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133487#M4335</link>
      <description>&lt;P&gt;Can you try providing the mandatory parameters to the Bucketizer? Even though the docs list them as optional, I see it works when you provide the parameters splits, inputCol and outputCol.&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

# Initialize SparkSession with error handling
try:
    spark = SparkSession.builder.appName("BucketizerTest").getOrCreate()
    print(f"Spark version: {spark.version}")  # Verify SparkSession
except Exception as e:
    print(f"Failed to initialize SparkSession: {e}")
    raise

# Create a sample DataFrame
data = [(1, -0.5), (2, 0.0), (3, 1.5), (4, 3.0)]
df = spark.createDataFrame(data, ["id", "value"])

# Define splits for bucketizing
splits = [float("-inf"), 0.0, 1.0, 2.0, float("inf")]

# Initialize Bucketizer with required parameters
try:
    bucketizer = Bucketizer(
        splits=splits,
        inputCol="value",
        outputCol="bucket"
    )
    # Apply Bucketizer to DataFrame
    bucketed_df = bucketizer.transform(df)
    bucketed_df.show()
except Exception as e:
    print(f"Error with Bucketizer: {e}")
    raise

# Optional: Stop SparkSession (only if needed)
# spark.stop()&lt;/LI-CODE&gt;</description>
      <pubDate>Wed, 01 Oct 2025 17:48:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133487#M4335</guid>
      <dc:creator>nayan_wylde</dc:creator>
      <dc:date>2025-10-01T17:48:16Z</dc:date>
    </item>
    <item>
      <title>Re: Can't use pyspark bucketizer</title>
      <link>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133489#M4336</link>
      <description>&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/188255"&gt;@wise_centipede&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;In your Serverless compute select Environment Version: 4 and it will work &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_0-1759342015338.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/20337i49E637E377971F42/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_0-1759342015338.png" alt="szymon_dybczak_0-1759342015338.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_1-1759342093569.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/20338iAAE18ECDF469868C/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_1-1759342093569.png" alt="szymon_dybczak_1-1759342093569.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;With a version below 4 I got the same error as you:&lt;/P&gt;&lt;P&gt;&lt;span class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_2-1759342095917.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/20339iF7FBCA4B96D91E7F/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_2-1759342095917.png" alt="szymon_dybczak_2-1759342095917.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;And when I upgraded the serverless environment to version 4, it works as expected &lt;span class="lia-unicode-emoji" title=":winking_face:"&gt;😉&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;span 
class="lia-inline-image-display-wrapper lia-image-align-inline" image-alt="szymon_dybczak_3-1759342170033.png" style="width: 400px;"&gt;&lt;img src="https://community.databricks.com/t5/image/serverpage/image-id/20340iCDE9F89D4CE75E7E/image-size/medium?v=v2&amp;amp;px=400" role="button" title="szymon_dybczak_3-1759342170033.png" alt="szymon_dybczak_3-1759342170033.png" /&gt;&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 01 Oct 2025 18:10:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/can-t-use-pyspark-bucketizer/m-p/133489#M4336</guid>
      <dc:creator>szymon_dybczak</dc:creator>
      <dc:date>2025-10-01T18:10:02Z</dc:date>
    </item>
  </channel>
</rss>

