@pjv Can you please try the following? You'll basically want to have more than a single partition so the UDF work is spread across the worker nodes:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
# Initialize Spark session (if not already done)
spark = SparkSession.builder.appName("AppName").getOrCreate()
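# Placeholder for the function the UDF will wrap (an assumption for
# illustration: substitute your real logic; it just needs to accept the
# input column values and return a string).
def myfunc(*values):
    return " ".join(str(v) for v in values)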
# Create a PySpark DataFrame from your input data
# (create_pyspark_dataframe stands in for however you build your DataFrame)
pyspark_dataframe = create_pyspark_dataframe(some_input_data)
# Repartition the DataFrame to ensure even distribution across worker nodes
num_partitions = 4 # Adjust based on your cluster size
pyspark_dataframe = pyspark_dataframe.repartition(num_partitions)
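# Optional sanity check: getNumPartitions() reports how many partitions the
# DataFrame now has (a quick sketch; you can drop this once verified).
print("Partition count:", pyspark_dataframe.rdd.getNumPartitions())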
# Define your UDF (wrapping myfunc from above; swap in your real function)
my_udf = udf(myfunc, StringType())
# Apply the UDF to the DataFrame; input_data_columns should be a list of
# column names (or Column objects), unpacked as arguments to the UDF
pyspark_dataframe = pyspark_dataframe.withColumn('UDFOutput', my_udf(*input_data_columns))
# Collect the results back to the driver as plain Python strings
output_strings = [x["UDFOutput"] for x in pyspark_dataframe.select("UDFOutput").collect()]
# Confirm that UDF execution was distributed across partitions:
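# A minimal check (assumption: if more than one distinct spark_partition_id
# shows up, the rows, and hence the UDF calls, were spread across partitions).
from pyspark.sql.functions import spark_partition_id
(pyspark_dataframe
 .withColumn("partition_id", spark_partition_id())
 .groupBy("partition_id")
 .count()
 .show())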