<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Databricks Autoloader processing old files in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117120#M45429</link>
    <description>&lt;P&gt;Hi Prashanth, Auto Loader is reading only new files for me; could you please go through the below script?&lt;/P&gt;&lt;P&gt;df = (&lt;BR /&gt;spark.readStream&lt;BR /&gt;.format("cloudFiles")&lt;BR /&gt;.option("cloudFiles.format", "csv")&lt;BR /&gt;.option("cloudFiles.schemaLocation", "path")&lt;BR /&gt;.option("recursiveFileLookup", "true")&lt;BR /&gt;.option("header", "true")&lt;BR /&gt;.schema(schema)&lt;BR /&gt;.load("path")&lt;BR /&gt;)&lt;/P&gt;&lt;P&gt;df.writeStream \&lt;BR /&gt;.format("delta") \&lt;BR /&gt;.option("checkpointLocation", "path") \&lt;BR /&gt;.option("mergeSchema", "true") \&lt;BR /&gt;.trigger(availableNow=True) \&lt;BR /&gt;.toTable("path")&lt;/P&gt;</description>
    <pubDate>Wed, 30 Apr 2025 09:36:54 GMT</pubDate>
    <dc:creator>RameshChejarla</dc:creator>
    <dc:date>2025-04-30T09:36:54Z</dc:date>
    <item>
      <title>Databricks Autoloader processing old files</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117095#M45426</link>
      <description>&lt;P&gt;I have implemented Databricks Autoloader and found that every time I execute the code, it is still reading all old existing files + new files. As per the concept of Autoloader, it should read and process only new files. Below is the code. Please help me to understand what might have gone wrong&lt;/P&gt;&lt;P&gt;df = (&lt;/P&gt;&lt;P&gt;spark.readStream.format("cloudFiles")&lt;/P&gt;&lt;P&gt;.option("cloudFiles.format","csv")&lt;/P&gt;&lt;P&gt;.option("cloudFiles.schemaLocation",&amp;lt;ADLS_PATH&amp;gt;)&lt;/P&gt;&lt;P&gt;.option("cloudEvolutionMode","rescue")&lt;/P&gt;&lt;P&gt;.option("header",True)&lt;/P&gt;&lt;P&gt;.load("abfss://container2@storageaccount1.dfs.core.windows.net/autoloader/input1/*/")&lt;/P&gt;&lt;P&gt;.writeStream&lt;/P&gt;&lt;P&gt;.option("checkpoint_location","abfss://container2@storageaccount1.dfs.core.windows.net/autoloader/checkpoint1")&lt;/P&gt;&lt;P&gt;.option("mergeSchema",True)&lt;/P&gt;&lt;P&gt;.trigger(availableNow=True)&lt;/P&gt;&lt;P&gt;.toTable("uniform_catalog1.autoloader2.table1")&lt;/P&gt;&lt;P&gt;)&lt;/P&gt;</description>
      <pubDate>Wed, 30 Apr 2025 07:38:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117095#M45426</guid>
      <dc:creator>Prashanth24</dc:creator>
      <dc:date>2025-04-30T07:38:46Z</dc:date>
    </item>
    <item>
      <title>Re: Databricks Autoloader processing old files</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117120#M45429</link>
      <description>&lt;P&gt;Hi Prashanth, Auto Loader is reading only new files for me; could you please go through the below script?&lt;/P&gt;&lt;P&gt;df = (&lt;BR /&gt;spark.readStream&lt;BR /&gt;.format("cloudFiles")&lt;BR /&gt;.option("cloudFiles.format", "csv")&lt;BR /&gt;.option("cloudFiles.schemaLocation", "path")&lt;BR /&gt;.option("recursiveFileLookup", "true")&lt;BR /&gt;.option("header", "true")&lt;BR /&gt;.schema(schema)&lt;BR /&gt;.load("path")&lt;BR /&gt;)&lt;/P&gt;&lt;P&gt;df.writeStream \&lt;BR /&gt;.format("delta") \&lt;BR /&gt;.option("checkpointLocation", "path") \&lt;BR /&gt;.option("mergeSchema", "true") \&lt;BR /&gt;.trigger(availableNow=True) \&lt;BR /&gt;.toTable("path")&lt;/P&gt;</description>
      <pubDate>Wed, 30 Apr 2025 09:36:54 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117120#M45429</guid>
      <dc:creator>RameshChejarla</dc:creator>
      <dc:date>2025-04-30T09:36:54Z</dc:date>
    </item>
    <item>
      <title>Re: Databricks Autoloader processing old files</title>
      <link>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117129#M45432</link>
      <description>&lt;P&gt;Thanks for the reply, Ramesh. Have you tried executing display(df) after the below code to see the results? Our code is almost identical, and when I tried to view the results of df, the contents of both old processed files and new files were displayed&lt;/P&gt;&lt;P&gt;&lt;SPAN&gt;df = (&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;spark.readStream&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.format("cloudFiles")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.option("cloudFiles.format", "csv")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.option("cloudFiles.schemaLocation", "path")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.option("recursiveFileLookup", "true")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.option("header", "true")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.schema(schema)&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;.load("path")&lt;/SPAN&gt;&lt;BR /&gt;&lt;SPAN&gt;)&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 30 Apr 2025 10:36:20 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/databricks-autoloader-processing-old-files/m-p/117129#M45432</guid>
      <dc:creator>Prashanth24</dc:creator>
      <dc:date>2025-04-30T10:36:20Z</dc:date>
    </item>
  </channel>
</rss>

