<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Spark Driver Out of Memory Issue in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21409#M14585</link>
    <description>&lt;P&gt;I am getting the above issue while writing a Spark DF as a Parquet file to AWS S3. I am not actually doing any broadcast join.&lt;/P&gt;</description>
    <pubDate>Sun, 08 May 2022 19:05:48 GMT</pubDate>
    <dc:creator>chandan_a_v</dc:creator>
    <dc:date>2022-05-08T19:05:48Z</dc:date>
    <item>
      <title>Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21405#M14581</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I am executing a simple job in Databricks and getting the error below. I increased the driver size, but I still hit the same issue.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Spark config:&lt;/P&gt;&lt;P&gt;from pyspark.sql import SparkSession&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;spark_session = SparkSession.builder.appName("Demand Forecasting").config("spark.yarn.executor.memoryOverhead", 2048).getOrCreate()&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Driver and worker node type: r5.2xlarge&lt;/P&gt;&lt;P&gt;10 worker nodes.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Error log:&lt;/P&gt;&lt;P&gt;Caused by: org.apache.spark.sql.execution.OutOfMemorySparkException: Size of broadcasted table far exceeds estimates and exceeds limit of spark.driver.maxResultSize=4294967296.&lt;/P&gt;</description>
      <pubDate>Fri, 06 May 2022 06:23:48 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21405#M14581</guid>
      <dc:creator>chandan_a_v</dc:creator>
      <dc:date>2022-05-06T06:23:48Z</dc:date>
    </item>
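    <!-- Editor's note (not part of the original feed): the error in the post above involves two settings, spark.driver.maxResultSize and the SQL planner's broadcast size estimate. A minimal PySpark sketch for inspecting the current values; note that spark.driver.maxResultSize is a static driver property, so on Databricks it is changed in the cluster's Spark config rather than at runtime.

        # Read the current driver result-size limit (fallback string if unset).
        print(spark.sparkContext.getConf().get("spark.driver.maxResultSize", "unset"))
        # Read the auto-broadcast threshold used by the SQL planner (bytes).
        print(spark.conf.get("spark.sql.autoBroadcastJoinThreshold"))
    -->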
    <item>
      <title>Re: Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21406#M14582</link>
      <description>&lt;P&gt;Looking at the error message, you are trying to broadcast a large table. Remove the broadcast statement on the large table and you should be fine.&lt;/P&gt;</description>
      <pubDate>Fri, 06 May 2022 06:54:49 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21406#M14582</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2022-05-06T06:54:49Z</dc:date>
    </item>
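    <!-- Editor's note (not part of the original feed): a hedged sketch of the advice above; `facts`, `large_dim`, and `key` are illustrative names, not from the thread.

        from pyspark.sql.functions import broadcast

        # Before: an explicit broadcast hint collects the whole table on the
        # driver before shipping it to executors, which can exhaust driver memory.
        # result = facts.join(broadcast(large_dim), "key")

        # After: drop the hint and let Spark choose a join strategy
        # (typically a sort-merge join for large tables).
        result = facts.join(large_dim, "key")
    -->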
    <item>
      <title>Re: Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21407#M14583</link>
      <description>&lt;P&gt;In my opinion, on Databricks you don't need to create the session yourself (spark_session = SparkSession.builder.appName("Demand Forecasting").config("spark.yarn.executor.memoryOverhead", 2048).getOrCreate()); the rest is as @Werner Stinckens said.&lt;/P&gt;</description>
      <pubDate>Fri, 06 May 2022 16:04:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21407#M14583</guid>
      <dc:creator>Hubert-Dudek</dc:creator>
      <dc:date>2022-05-06T16:04:50Z</dc:date>
    </item>
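    <!-- Editor's note (not part of the original feed): a minimal sketch of the point above. Databricks notebooks already expose a ready-made SparkSession as `spark`, so the builder call from the question is unnecessary (getOrCreate() would just return the existing session), and spark.yarn.* settings have no effect because Databricks does not run on YARN.

        # Use the provided session directly instead of building a new one.
        df = spark.range(5)
        df.show()
    -->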
    <item>
      <title>Re: Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21408#M14584</link>
      <description>&lt;P&gt;Hi @Werner Stinckens,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I am getting the above issue while writing a Spark DF as a Parquet file to AWS S3. I am not actually doing any broadcast join.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Chandan&lt;/P&gt;</description>
      <pubDate>Sun, 08 May 2022 19:05:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21408#M14584</guid>
      <dc:creator>chandan_a_v</dc:creator>
      <dc:date>2022-05-08T19:05:33Z</dc:date>
    </item>
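    <!-- Editor's note (not part of the original feed): a hedged sketch of why a plain write can still hit a broadcast error. The write is the action that triggers the whole query plan, so a join earlier in the DataFrame's lineage can still be auto-broadcast at write time even without an explicit broadcast() call. The S3 path is illustrative.

        df.write.mode("overwrite").parquet("s3://my-bucket/forecasts/output/")
    -->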
    <item>
      <title>Re: Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21410#M14586</link>
      <description>&lt;P&gt;As Hubert mentioned: you should not create a Spark session on Databricks, it is provided.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The fact that you do not broadcast manually makes me think Spark is using an automatic broadcast join.&lt;/P&gt;&lt;P&gt;There is a KB article about issues with that:&lt;/P&gt;&lt;P&gt;&lt;A href="https://kb.databricks.com/sql/bchashjoin-exceeds-bcjointhreshold-oom.html" alt="https://kb.databricks.com/sql/bchashjoin-exceeds-bcjointhreshold-oom.html" target="_blank"&gt;https://kb.databricks.com/sql/bchashjoin-exceeds-bcjointhreshold-oom.html&lt;/A&gt;&lt;/P&gt;&lt;P&gt;Can you check whether it applies?&lt;/P&gt;</description>
      <pubDate>Wed, 11 May 2022 13:25:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21410#M14586</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2022-05-11T13:25:58Z</dc:date>
    </item>
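    <!-- Editor's note (not part of the original feed): a minimal sketch in line with the linked KB article's topic. Spark auto-broadcasts tables it estimates to be smaller than spark.sql.autoBroadcastJoinThreshold (10 MB by default); when that estimate is far off, the driver can run out of memory. Setting the threshold to -1 disables automatic broadcast joins and forces a shuffle-based join instead.

        spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
    -->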
    <item>
      <title>Re: Spark Driver Out of Memory Issue</title>
      <link>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21412#M14588</link>
      <description>&lt;P&gt;Hi @Kaniz Fatma,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Switching the runtime version to 10.4 fixed the issue for me.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks,&lt;/P&gt;&lt;P&gt;Chandan&lt;/P&gt;</description>
      <pubDate>Thu, 02 Jun 2022 08:51:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/spark-driver-out-of-memory-issue/m-p/21412#M14588</guid>
      <dc:creator>chandan_a_v</dc:creator>
      <dc:date>2022-06-02T08:51:38Z</dc:date>
    </item>
  </channel>
</rss>

