<?xml version="1.0" encoding="UTF-8"?>
<!--
  RSS 2.0 feed holding a single channel with a single item, both pointing at
  the same community.databricks.com thread URL.
  Of the namespace prefixes declared on the root, only "dc" is used by the
  elements in this document; "content", "rdf" and "taxo" are declared but
  not referenced here.
-->
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/"
     xmlns:dc="http://purl.org/dc/elements/1.1/"
     xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
     xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/"
     version="2.0">
  <channel>
    <!--
      Channel-level metadata. The link, description, pubDate, dc:creator and
      dc:date below carry the same values as the single item further down;
      only the title differs (it wraps the item title in "topic ... in Data Engineering").
    -->
    <title>topic What is the most efficient way to read in a partitioned parquet file with pyspark? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/what-is-the-most-efficient-way-to-read-in-a-partitioned-parquet/m-p/20738#M14036</link>
    <!-- The description is HTML whose markup is entity-escaped, so a consumer must unescape it to recover the paragraph tags. -->
    <description>&lt;P&gt;I work with parquet files stored in AWS S3 buckets. They are multiple TB in size and partitioned by a numeric column containing integer values between 1 and 200, call it my_partition. I read in and perform compute actions on this data in Databricks with autoscaling turned off.&lt;/P&gt;</description>
    <!-- pubDate (RFC 822 style) and dc:date (ISO 8601, UTC) express the same instant in two formats. -->
    <pubDate>Thu, 24 Jun 2021 15:09:20 GMT</pubDate>
    <dc:creator>User16790091296</dc:creator>
    <dc:date>2021-06-24T15:09:20Z</dc:date>
    <!-- The only entry in the feed. -->
    <item>
      <title>What is the most efficient way to read in a partitioned parquet file with pyspark?</title>
      <link>https://community.databricks.com/t5/data-engineering/what-is-the-most-efficient-way-to-read-in-a-partitioned-parquet/m-p/20738#M14036</link>
      <description>&lt;P&gt;I work with parquet files stored in AWS S3 buckets. They are multiple TB in size and partitioned by a numeric column containing integer values between 1 and 200, call it my_partition. I read in and perform compute actions on this data in Databricks with autoscaling turned off.&lt;/P&gt;</description>
      <pubDate>Thu, 24 Jun 2021 15:09:20 GMT</pubDate>
      <!-- The guid is the same URL as the item link; no isPermaLink attribute is set. -->
      <guid>https://community.databricks.com/t5/data-engineering/what-is-the-most-efficient-way-to-read-in-a-partitioned-parquet/m-p/20738#M14036</guid>
      <dc:creator>User16790091296</dc:creator>
      <dc:date>2021-06-24T15:09:20Z</dc:date>
    </item>
  </channel>
</rss>

