<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Save to parquet with fixed size in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37543#M26384</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84270"&gt;@erigaud&lt;/a&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Thank you for posting your question in our community! We are happy to assist you.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;To help us provide you with the most accurate information, could you please take a moment to review the responses and select the one that best answers your question?&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;This will also help other community members who may have similar questions in the future. Thank you for your participation and let us know if you need any further assistance!&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
    <pubDate>Thu, 13 Jul 2023 06:44:03 GMT</pubDate>
    <dc:creator>Anonymous</dc:creator>
    <dc:date>2023-07-13T06:44:03Z</dc:date>
    <item>
      <title>Save to parquet with fixed size</title>
      <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37500#M26373</link>
      <description>&lt;P&gt;I have a large dataframe (&amp;gt;1TB) I have to save in parquet format (not delta for this use case). When I save the dataframe using .format("parquet") it results in several parquet files. I want these files to be a specific size (ie not larger than 500Mb each). Is there a way to enforce that?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 12 Jul 2023 11:13:58 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37500#M26373</guid>
      <dc:creator>erigaud</dc:creator>
      <dc:date>2023-07-12T11:13:58Z</dc:date>
    </item>
    <item>
      <title>Re: Save to parquet with fixed size</title>
      <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37507#M26377</link>
      <description>&lt;P&gt;Let's say you want the average partition size to be 400MB, then you can do:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;(df.repartition(1024 * 1024 // 400)
    .write.mode('overwrite')
    .format('parquet')
    .save('path/to/file'))&lt;/LI-CODE&gt;</description>
      <pubDate>Wed, 12 Jul 2023 13:10:50 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37507#M26377</guid>
      <dc:creator>dream</dc:creator>
      <dc:date>2023-07-12T13:10:50Z</dc:date>
    </item>
    <item>
      <title>Re: Save to parquet with fixed size</title>
      <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37509#M26378</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84270"&gt;@erigaud&lt;/a&gt;&amp;nbsp;&amp;nbsp;Good day!&lt;/P&gt;&lt;P&gt;Whenever you are saving the data, you could pass the &lt;STRONG&gt;parquet.block.size &lt;/STRONG&gt;config (measured in bytes) as an option:&lt;/P&gt;&lt;P&gt;&lt;STRONG&gt;Example:&lt;/STRONG&gt;&lt;/P&gt;&lt;P&gt;spark.read.parquet("dbfs:/delta/delta-path/part-xxxx.snappy.parquet").write.mode("overwrite")&lt;STRONG&gt;.option("parquet.block.size", 500 * 1024 * 1024)&lt;/STRONG&gt;.parquet("/tmp/vinay/parquet/blocksize1")&lt;/P&gt;</description>
      <pubDate>Wed, 12 Jul 2023 13:38:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37509#M26378</guid>
      <dc:creator>Vinay_M_R</dc:creator>
      <dc:date>2023-07-12T13:38:06Z</dc:date>
    </item>
    <item>
      <title>Re: Save to parquet with fixed size</title>
      <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37543#M26384</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84270"&gt;@erigaud&lt;/a&gt;&amp;nbsp;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Thank you for posting your question in our community! We are happy to assist you.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;To help us provide you with the most accurate information, could you please take a moment to review the responses and select the one that best answers your question?&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;This will also help other community members who may have similar questions in the future. Thank you for your participation and let us know if you need any further assistance!&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Jul 2023 06:44:03 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37543#M26384</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-07-13T06:44:03Z</dc:date>
    </item>
    <item>
      <title>Re: Save to parquet with fixed size</title>
      <link>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37580#M26389</link>
      <description>&lt;P&gt;In addition to the solutions provided above, we can also control the behavior by specifying maximum records per file if we have a rough estimate of how many records should be written to a file to reach 500 MB size.&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;df.write.option("maxRecordsPerFile", 1000000)&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Jul 2023 16:02:24 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/save-to-parquet-with-fixed-size/m-p/37580#M26389</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-07-13T16:02:24Z</dc:date>
    </item>
  </channel>
</rss>

