<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Delta Live Tables: How does it identify new files? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/99498#M40003</link>
    <description>&lt;P&gt;To resolve the issue of excessive directory scanning, I have changed the folder structure to separate historical files from current files and reduce the number of folders and files that the Databricks process monitors.&lt;/P&gt;</description>
    <pubDate>Wed, 20 Nov 2024 10:30:13 GMT</pubDate>
    <dc:creator>dbuschi</dc:creator>
    <dc:date>2024-11-20T10:30:13Z</dc:date>
    <item>
      <title>Delta Live Tables: How does it identify new files?</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/98604#M39752</link>
      <description>&lt;P&gt;Hi,&lt;BR /&gt;I'm importing large numbers of parquet files (ca. 5200 files per day, each landing in a separate folder) into Azure ADLS storage.&lt;BR /&gt;I have a DLT streaming table reading from the root folder.&lt;BR /&gt;I noticed a massive spike in storage account costs due to file system reads.&lt;BR /&gt;Questions: How does DLT identify newly arriving files? Does it always have to monitor the entire folder including all historical files?&lt;BR /&gt;Are there any design patterns to resolve this (e.g., regarding folder structure, archiving of processed files)?&lt;BR /&gt;Many thanks for your help!&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2024 05:43:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/98604#M39752</guid>
      <dc:creator>dbuschi</dc:creator>
      <dc:date>2024-11-13T05:43:16Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Tables: How does it identify new files?</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/98606#M39753</link>
      <description>&lt;P&gt;Please refer to the Auto Loader documentation for details:&amp;nbsp;&lt;A href="https://learn.microsoft.com/en-us/azure/databricks/ingestion/cloud-object-storage/auto-loader/" target="_blank"&gt;https://learn.microsoft.com/en-us/azure/databricks/ingestion/cloud-object-storage/auto-loader/&lt;/A&gt;.&amp;nbsp;You can use &lt;A href="https://learn.microsoft.com/en-us/azure/databricks/ingestion/cloud-object-storage/auto-loader/#incremental-ingestion-using-auto-loader-with-delta-live-tables" target="_self"&gt;Auto Loader&lt;/A&gt; in DLT to detect new files. Our documentation also mentions the file name&amp;nbsp;&lt;SPAN&gt;patterns that work with Auto Loader.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 13 Nov 2024 06:02:11 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/98606#M39753</guid>
      <dc:creator>SparkJun</dc:creator>
      <dc:date>2024-11-13T06:02:11Z</dc:date>
    </item>
    <item>
      <title>Re: Delta Live Tables: How does it identify new files?</title>
      <link>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/99498#M40003</link>
      <description>&lt;P&gt;To resolve the issue of excessive directory scanning, I have changed the folder structure to separate historical files from current files and reduce the number of folders and files that the Databricks process monitors.&lt;/P&gt;</description>
      <pubDate>Wed, 20 Nov 2024 10:30:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/delta-live-tables-how-does-it-identify-new-files/m-p/99498#M40003</guid>
      <dc:creator>dbuschi</dc:creator>
      <dc:date>2024-11-20T10:30:13Z</dc:date>
    </item>
  </channel>
</rss>

