<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: How to Read Terabytes of data in Databricks in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13192#M7906</link>
    <description>&lt;P&gt;None of the answers are relevant to me&lt;/P&gt;</description>
    <pubDate>Tue, 17 Jan 2023 05:18:41 GMT</pubDate>
    <dc:creator>Abhijeet</dc:creator>
    <dc:date>2023-01-17T05:18:41Z</dc:date>
    <item>
      <title>How to Read Terabytes of data in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13187#M7901</link>
      <description>&lt;P&gt;I want to read 1000 GB of data. Since Spark performs in-memory transformations, do I need worker nodes with a combined memory size of 1000 GB?&lt;/P&gt;&lt;P&gt;Also, I just want to understand whether, while reading, we store all 1000 GB in memory. If so, how is caching a DataFrame different from the above case?&lt;/P&gt;</description>
      <pubDate>Sat, 07 Jan 2023 14:01:59 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13187#M7901</guid>
      <dc:creator>Abhijeet</dc:creator>
      <dc:date>2023-01-07T14:01:59Z</dc:date>
    </item>
    <item>
      <title>Re: How to Read Terabytes of data in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13188#M7902</link>
      <description>&lt;P&gt;In the master and slave node system,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;your data will be divided into chunks of 128 MB.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;So 1000/128 = 7.8125&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;So it will require creating 7-8 partitions of that data, which means you don't need a 1000 GB cluster; 2-3 nodes of 10-30 GB each will work fine.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Let me know if I am wrong here.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thanks&lt;/P&gt;&lt;P&gt;Aviral Bhardwaj&lt;/P&gt;</description>
      <pubDate>Sat, 07 Jan 2023 16:00:30 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13188#M7902</guid>
      <dc:creator>Aviral-Bhardwaj</dc:creator>
      <dc:date>2023-01-07T16:00:30Z</dc:date>
    </item>
    <item>
      <title>Re: How to Read Terabytes of data in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13189#M7903</link>
      <description>&lt;P&gt;The number of partitions will be&lt;/P&gt;&lt;P&gt;1000*1024/128=8000&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;So my question is: all these 8000 partitions combined will be 1000 GB.&lt;/P&gt;&lt;P&gt;And I am creating a data frame from this data.&lt;/P&gt;&lt;P&gt;How is this data loaded? It will somehow need to hold the data in memory.&lt;/P&gt;&lt;P&gt;So I am just trying to understand what happens at the backend and how the data is read (how the nodes manage this load).&lt;/P&gt;</description>
      <pubDate>Sat, 07 Jan 2023 17:02:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13189#M7903</guid>
      <dc:creator>Abhijeet</dc:creator>
      <dc:date>2023-01-07T17:02:52Z</dc:date>
    </item>
    <item>
      <title>Re: How to Read Terabytes of data in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13190#M7904</link>
      <description>&lt;P&gt;Hi @Abhijeet Singh, the link below might help you:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;A href="https://stackoverflow.com/questions/46638901/how-spark-read-a-large-file-petabyte-when-file-can-not-be-fit-in-sparks-main" alt="https://stackoverflow.com/questions/46638901/how-spark-read-a-large-file-petabyte-when-file-can-not-be-fit-in-sparks-main" target="_blank"&gt;Link&lt;/A&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Sun, 08 Jan 2023 07:24:59 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13190#M7904</guid>
      <dc:creator>Ajay-Pandey</dc:creator>
      <dc:date>2023-01-08T07:24:59Z</dc:date>
    </item>
    <item>
      <title>Re: How to Read Terabytes of data in Databricks</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13192#M7906</link>
      <description>&lt;P&gt;None of the answers are relevant to me&lt;/P&gt;</description>
      <pubDate>Tue, 17 Jan 2023 05:18:41 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-to-read-terabytes-of-data-in-databricks/m-p/13192#M7906</guid>
      <dc:creator>Abhijeet</dc:creator>
      <dc:date>2023-01-17T05:18:41Z</dc:date>
    </item>
  </channel>
</rss>

