<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Linear Regression HELP! Pickle + Broadcast Variable Error in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2813#M95</link>
    <description>&lt;P&gt;Hi @Avkash Kana​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Great to meet you, and thanks for your question!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt; Let's see if your peers in the community have an answer to your question. Thanks.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
    <pubDate>Wed, 21 Jun 2023 03:57:02 GMT</pubDate>
    <dc:creator>Anonymous</dc:creator>
    <dc:date>2023-06-21T03:57:02Z</dc:date>
    <item>
      <title>Linear Regression HELP! Pickle + Broadcast Variable Error</title>
      <link>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2812#M94</link>
      <description>&lt;P&gt;Hi there,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I need some help with this example. We're trying to create a LinearRegression model that can be parallelized across thousands of symbols per date. When we run this, we get a PicklingError.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Any suggestions would be much appreciated!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;K&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Error:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;PicklingError: Could not serialize object: RuntimeError: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Code:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
&amp;nbsp;
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()
&amp;nbsp;
# Create an RDD with your data
data_rdd = spark.sparkContext.parallelize([
    ("symbol1", 1, 2, 3),
    ("symbol2", 4, 5, 6),
    ("symbol3", 7, 8, 9)
])
&amp;nbsp;
# Convert the RDD to a DataFrame
data_df = data_rdd.toDF(["Symbol", "Feature1", "Feature2", "Feature3"])
&amp;nbsp;
# Define the features column
assembler = VectorAssembler(inputCols=["Feature1", "Feature2", "Feature3"], outputCol="features")
&amp;nbsp;
# Fit models on each partition and collect the weights
def fit_model(partition):
    # Create a new linear regression model
    model = LinearRegression(featuresCol="features", labelCol="Symbol")
&amp;nbsp;
    # Create an empty list to store the weights
    weights = []
&amp;nbsp;
    # Convert the partition iterator to a list
    data_list = list(partition)
&amp;nbsp;
    # Convert the list to a DataFrame
    data_partition_df = spark.createDataFrame(data_list, data_df.columns)
&amp;nbsp;
    # Perform vector assembly
    data_partition_df = assembler.transform(data_partition_df)
&amp;nbsp;
    # Fit the model on the partition data
    fitted_model = model.fit(data_partition_df)
&amp;nbsp;
    # Get the model weights
    weights = [fitted_model.coefficients[i] for i in range(len(fitted_model.coefficients))]
&amp;nbsp;
    # Yield the weights
    yield weights
&amp;nbsp;
# Fit models on each partition and collect the weights
partition_weights = data_df.rdd.mapPartitions(fit_model).collect()
&amp;nbsp;
# Create a DataFrame with the collected weights
weights_df = spark.createDataFrame(partition_weights, ["Weight1", "Weight2", "Weight3"])
&amp;nbsp;
# Show the weights DataFrame
weights_df.show()&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 20 Jun 2023 16:47:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2812#M94</guid>
      <dc:creator>Kash</dc:creator>
      <dc:date>2023-06-20T16:47:35Z</dc:date>
    </item>
    <item>
      <title>Re: Linear Regression HELP! Pickle + Broadcast Variable Error</title>
      <link>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2813#M95</link>
      <description>&lt;P&gt;Hi @Avkash Kana​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Great to meet you, and thanks for your question!&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt; Let's see if your peers in the community have an answer to your question. Thanks.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 21 Jun 2023 03:57:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2813#M95</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-06-21T03:57:02Z</dc:date>
    </item>
    <item>
      <title>Re: Linear Regression HELP! Pickle + Broadcast Variable Error</title>
      <link>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2814#M96</link>
      <description>&lt;P&gt;Thanks. We're eagerly waiting to see what the community thinks. We're also open to using Databricks' built-in ML technology, but we're unclear how to apply it to our use case.&lt;/P&gt;</description>
      <pubDate>Wed, 21 Jun 2023 14:13:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2814#M96</guid>
      <dc:creator>Kash</dc:creator>
      <dc:date>2023-06-21T14:13:05Z</dc:date>
    </item>
    <item>
      <title>Re: Linear Regression HELP! Pickle + Broadcast Variable Error</title>
      <link>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2815#M97</link>
      <description>&lt;P&gt;@Vidula Khanna​&amp;nbsp;Can you assist?&lt;/P&gt;</description>
      <pubDate>Thu, 22 Jun 2023 14:45:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/linear-regression-help-pickle-broadcast-variable-error/m-p/2815#M97</guid>
      <dc:creator>Kash</dc:creator>
      <dc:date>2023-06-22T14:45:02Z</dc:date>
    </item>
  </channel>
</rss>

