<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <!-- Channel-level metadata. Apart from the title, every value in this run
         (link, description, pubDate, dc:creator, dc:date) is identical to the
         matching field of the "Re:" item further down in this channel. -->
    <title>topic Re: How to efficiently read the data lake files' metadata? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-to-efficiently-read-the-data-lake-files-metadata/m-p/73733#M34657</link>
    <!-- The description is a single text node: the HTML markup inside it is
         written as escaped character entities, so an XML parser does not see
         it as child elements. -->
    <description>&lt;P&gt;Efficiently reading data lake files involves:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Choosing the Right Tools&lt;/STRONG&gt;: Select tools optimized for data lake file formats (e.g., Parquet, ORC) and distributed computing frameworks (e.g., Apache Spark, Apache Flink).&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Partitioning and Indexing&lt;/STRONG&gt;: Partition data logically and create indexes to minimize data scanning.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Optimizing Queries&lt;/STRONG&gt;: Write queries that leverage predicate pushdown, column pruning, and other optimizations provided by the data lake engine.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Parallel Processing&lt;/STRONG&gt;: Utilize parallel processing to distribute workload across multiple nodes or cores, improving read performance.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Caching and Materialization&lt;/STRONG&gt;: Cache frequently accessed data or precompute aggregates to reduce read times for subsequent queries.&lt;/P&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;Krunal Medapara,&lt;/P&gt;&lt;P&gt;CTO&lt;/P&gt;&lt;P&gt;NewEvol&lt;/P&gt;&lt;P&gt;&lt;A href="https://www.newevol.io/product/data-lake-solutions.php" target="_self"&gt;https://www.newevol.io/product/data-lake-solutions.php&lt;/A&gt;&lt;/P&gt;</description>
    <!-- pubDate and dc:date carry the same instant (2024-06-13 07:17:33 UTC) in
         two notations; keep them in sync if either one is ever edited. -->
    <pubDate>Thu, 13 Jun 2024 07:17:33 GMT</pubDate>
    <!-- The dc prefix is bound on the root element to
         http://purl.org/dc/elements/1.1/ -->
    <dc:creator>KrunalMedapara</dc:creator>
    <dc:date>2024-06-13T07:17:33Z</dc:date>
    <!-- Post m-p/20651. The other item in this channel repeats this title with
         a "Re:" prefix, so this is presumably the thread's opening question
         (confirm against the forum if that matters). -->
    <item>
      <title>How to efficiently read the data lake files' metadata?</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-efficiently-read-the-data-lake-files-metadata/m-p/20651#M13963</link>
      <!-- Post body as entity-escaped HTML held in one text node. -->
      <description>&lt;P&gt;&lt;B&gt;I want to read the last modified datetime of the files in data lake in a databricks script. If I could read it efficiently as a column when reading data from data lake, it would be perfect.&lt;/B&gt;&lt;/P&gt;&lt;P&gt;&lt;B&gt;Thank you:)&lt;/B&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 24 Jun 2021 15:17:28 GMT</pubDate>
      <!-- The guid repeats the link URL verbatim; no isPermaLink attribute is
           set on it. -->
      <guid>https://community.databricks.com/t5/data-engineering/how-to-efficiently-read-the-data-lake-files-metadata/m-p/20651#M13963</guid>
      <dc:creator>User16790091296</dc:creator>
      <!-- Same instant as pubDate above, in the dc:date notation. -->
      <dc:date>2021-06-24T15:17:28Z</dc:date>
    </item>
    <!-- Post m-p/73733. Its title is the first item's title with a "Re:"
         prefix, and its link, description, pubDate, dc:creator and dc:date
         are the same values that appear at channel level above. -->
    <item>
      <title>Re: How to efficiently read the data lake files' metadata?</title>
      <link>https://community.databricks.com/t5/data-engineering/how-to-efficiently-read-the-data-lake-files-metadata/m-p/73733#M34657</link>
      <!-- Post body as entity-escaped HTML held in one text node. -->
      <description>&lt;P&gt;Efficiently reading data lake files involves:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Choosing the Right Tools&lt;/STRONG&gt;: Select tools optimized for data lake file formats (e.g., Parquet, ORC) and distributed computing frameworks (e.g., Apache Spark, Apache Flink).&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Partitioning and Indexing&lt;/STRONG&gt;: Partition data logically and create indexes to minimize data scanning.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Optimizing Queries&lt;/STRONG&gt;: Write queries that leverage predicate pushdown, column pruning, and other optimizations provided by the data lake engine.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Parallel Processing&lt;/STRONG&gt;: Utilize parallel processing to distribute workload across multiple nodes or cores, improving read performance.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Caching and Materialization&lt;/STRONG&gt;: Cache frequently accessed data or precompute aggregates to reduce read times for subsequent queries.&lt;/P&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;Krunal Medapara,&lt;/P&gt;&lt;P&gt;CTO&lt;/P&gt;&lt;P&gt;NewEvol&lt;/P&gt;&lt;P&gt;&lt;A href="https://www.newevol.io/product/data-lake-solutions.php" target="_self"&gt;https://www.newevol.io/product/data-lake-solutions.php&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Jun 2024 07:17:33 GMT</pubDate>
      <!-- The guid repeats the link URL verbatim; no isPermaLink attribute is
           set on it. -->
      <guid>https://community.databricks.com/t5/data-engineering/how-to-efficiently-read-the-data-lake-files-metadata/m-p/73733#M34657</guid>
      <dc:creator>KrunalMedapara</dc:creator>
      <!-- Same instant as pubDate above, in the dc:date notation. -->
      <dc:date>2024-06-13T07:17:33Z</dc:date>
    </item>
  </channel>
</rss>

