<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Creating a Spark DataFrame from a very large dataset in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37817#M26468</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84918"&gt;@charry&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I would suggest saving the list as a CSV file and then reading it back in Spark using spark.read.csv and saving it in parquet format.&lt;/P&gt;</description>
    <pubDate>Tue, 18 Jul 2023 03:38:51 GMT</pubDate>
    <dc:creator>Tharun-Kumar</dc:creator>
    <dc:date>2023-07-18T03:38:51Z</dc:date>
    <item>
      <title>Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37806#M26464</link>
      <description>&lt;P&gt;I am trying to create a DataFrame using Spark but am having some issues with the amount of data I'm using. I made a list with over 1 million entries through several API calls. The list was above the threshold for&amp;nbsp;&lt;SPAN&gt;spark.rpc.message.maxSize and it was also too large to use broadcasting. I kept on getting OOM errors from using such large amounts of memory. So, I created two separate lists from the data in the original list. When I tried to create the DataFrame again, the size was still too large for the spark.rpc.message.maxSize, and that was using 32 repartitions. My end goal is to join the two tables together in a temporary view and then write to parquet so I can get all the data in a PowerBI report.&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 17 Jul 2023 19:02:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37806#M26464</guid>
      <dc:creator>charry</dc:creator>
      <dc:date>2023-07-17T19:02:36Z</dc:date>
    </item>
    <item>
      <title>Re: Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37817#M26468</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84918"&gt;@charry&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;I would suggest saving the list as a CSV file and then reading it back in Spark using spark.read.csv and saving it in parquet format.&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 03:38:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37817#M26468</guid>
      <dc:creator>Tharun-Kumar</dc:creator>
      <dc:date>2023-07-18T03:38:51Z</dc:date>
    </item>
    <item>
      <title>Re: Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37831#M26476</link>
      <description>&lt;P&gt;Have you tried specifying the schema when creating the DataFrame? Providing the right types can help with the memory.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Furthermore, you could incrementally load your data to a bronze delta table instead of loading the full million rows at once.&amp;nbsp;&lt;/P&gt;&lt;P&gt;Hope this helps!&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 06:41:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37831#M26476</guid>
      <dc:creator>erigaud</dc:creator>
      <dc:date>2023-07-18T06:41:27Z</dc:date>
    </item>
    <item>
      <title>Re: Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37873#M26493</link>
      <description>&lt;P&gt;The best way is indeed to write the extracted data and then read it back into Spark.&amp;nbsp; That way you do not burden Spark with all the API calls.&lt;/P&gt;</description>
      <pubDate>Tue, 18 Jul 2023 15:09:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37873#M26493</guid>
      <dc:creator>-werners-</dc:creator>
      <dc:date>2023-07-18T15:09:01Z</dc:date>
    </item>
    <item>
      <title>Re: Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37942#M26516</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84918"&gt;@charry&lt;/a&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Checking in. If &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/14792"&gt;@-werners-&lt;/a&gt;'s&amp;nbsp;answer helped, would you let us know and mark the answer as best? If not, would you be happy to give us more information?&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Cheers!&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 19 Jul 2023 09:14:56 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37942#M26516</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-07-19T09:14:56Z</dc:date>
    </item>
    <item>
      <title>Re: Creating a Spark DataFrame from a very large dataset</title>
      <link>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37988#M26534</link>
      <description>&lt;P&gt;Hey&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84918"&gt;@charry&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Look at this KB article, this should help address the issue.&lt;/P&gt;&lt;P&gt;&lt;A href="https://kb.databricks.com/execution/spark-serialized-task-is-too-large" target="_blank"&gt;https://kb.databricks.com/execution/spark-serialized-task-is-too-large&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 19 Jul 2023 19:47:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/creating-a-spark-dataframe-from-a-very-large-dataset/m-p/37988#M26534</guid>
      <dc:creator>saipujari_spark</dc:creator>
      <dc:date>2023-07-19T19:47:23Z</dc:date>
    </item>
  </channel>
</rss>

