Provide the parameters splits, inputCol, and outputCol explicitly when constructing the Bucketizer. Although the documentation lists them as optional, the transform only works when all three are supplied.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

# Initialize SparkSession with error handling.
try:
    spark = SparkSession.builder.appName("BucketizerTest").getOrCreate()
    print(f"Spark version: {spark.version}")  # Verify SparkSession is live.
except Exception as e:
    print(f"Failed to initialize SparkSession: {e}")
    raise

# Create a sample DataFrame of (id, value) rows to bucketize.
data = [(1, -0.5), (2, 0.0), (3, 1.5), (4, 3.0)]
df = spark.createDataFrame(data, ["id", "value"])

# Bucket boundaries: (-inf, 0), [0, 1), [1, 2), [2, +inf).
# Splits must be strictly increasing and cover every input value.
splits = [float("-inf"), 0.0, 1.0, 2.0, float("inf")]

# Bucketizer's splits/inputCol/outputCol are "optional" only in the constructor
# signature; transform() fails without them, so set all three explicitly.
try:
    bucketizer = Bucketizer(
        splits=splits,
        inputCol="value",
        outputCol="bucket",
    )
    # Apply the Bucketizer and display the bucketed column.
    bucketed_df = bucketizer.transform(df)
    bucketed_df.show()
except Exception as e:
    print(f"Error with Bucketizer: {e}")
    raise

# Optional: stop the SparkSession when finished (kept commented so the
# session survives in interactive environments such as notebooks).
# spark.stop()