<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic How can I get date when autoloader processes the file in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8369#M4031</link>
    <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I am running autoloader which is running continuously and checks for new file every 1 minute. I need to store when file was received/processed but its giving me date when autoloader started. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Here is my code.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;df = (spark&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.readStream&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.format("cloudFiles")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.format", "json")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.includeExistingFiles", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.validateOptions", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.region", "us-east-1")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.backfillInterval", "1 day")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.fetchParallelism", 100)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.useNotifications", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.schema(streamSchema)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.load(raw_path)&lt;/P&gt;&lt;P&gt;&lt;B&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn('process_date',lit(date.today()))&lt;/B&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;(df&lt;/P&gt;&lt;P&gt;&amp;nbsp;.writeStream&lt;/P&gt;&lt;P&gt;&amp;nbsp;.format("delta")&lt;/P&gt;&lt;P&gt;&amp;nbsp;.outputMode("append")&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("checkpointLocation", bronze_checkpoint_path)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("path", bronze_path)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("mergeSchema", True)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.trigger(processingTime="1 minute")&amp;nbsp;# or set this to whatever makes sense to the data 
source&lt;/P&gt;&lt;P&gt;&amp;nbsp;.start()&amp;nbsp;&lt;/P&gt;&lt;P&gt;)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Appreciate any help.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Sanjay&lt;/P&gt;</description>
    <pubDate>Thu, 02 Mar 2023 13:48:37 GMT</pubDate>
    <dc:creator>sanjay</dc:creator>
    <dc:date>2023-03-02T13:48:37Z</dc:date>
    <item>
      <title>How can I get date when autoloader processes the file</title>
      <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8369#M4031</link>
      <description>&lt;P&gt;Hi,&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;I am running autoloader which is running continuously and checks for new file every 1 minute. I need to store when file was received/processed but its giving me date when autoloader started. &lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Here is my code.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;df = (spark&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.readStream&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.format("cloudFiles")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.format", "json")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.includeExistingFiles", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.validateOptions", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.region", "us-east-1")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.backfillInterval", "1 day")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.fetchParallelism", 100)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.option("cloudFiles.useNotifications", "true")&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.schema(streamSchema)&lt;/P&gt;&lt;P&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.load(raw_path)&lt;/P&gt;&lt;P&gt;&lt;B&gt;&amp;nbsp;&amp;nbsp;&amp;nbsp;.withColumn('process_date',lit(date.today()))&lt;/B&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;(df&lt;/P&gt;&lt;P&gt;&amp;nbsp;.writeStream&lt;/P&gt;&lt;P&gt;&amp;nbsp;.format("delta")&lt;/P&gt;&lt;P&gt;&amp;nbsp;.outputMode("append")&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("checkpointLocation", bronze_checkpoint_path)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("path", bronze_path)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.option("mergeSchema", True)&lt;/P&gt;&lt;P&gt;&amp;nbsp;.trigger(processingTime="1 minute")&amp;nbsp;# or set this to whatever makes sense to the data 
source&lt;/P&gt;&lt;P&gt;&amp;nbsp;.start()&amp;nbsp;&lt;/P&gt;&lt;P&gt;)&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Appreciate any help.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Sanjay&lt;/P&gt;</description>
      <pubDate>Thu, 02 Mar 2023 13:48:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8369#M4031</guid>
      <dc:creator>sanjay</dc:creator>
      <dc:date>2023-03-02T13:48:37Z</dc:date>
    </item>
    <item>
      <title>Re: How can I get date when autoloader processes the file</title>
      <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8370#M4032</link>
      <description>&lt;P&gt;Hi @Sanjay Jain, you can use the File Metadata column functionality to collect that information.&lt;/P&gt;&lt;P&gt;Reference doc: &lt;A href="https://docs.databricks.com/ingestion/file-metadata-column.html" target="_blank"&gt;https://docs.databricks.com/ingestion/file-metadata-column.html&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 02 Mar 2023 14:55:35 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8370#M4032</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-03-02T14:55:35Z</dc:date>
    </item>
    <item>
      <title>Re: How can I get date when autoloader processes the file</title>
      <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8371#M4033</link>
      <description>&lt;P&gt;Thank you, Lakshay. It's helpful.&lt;/P&gt;&lt;P&gt;Another query related to autoloader:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;How can I delete files automatically once they are processed successfully?&lt;/LI&gt;&lt;/OL&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Regards,&lt;/P&gt;&lt;P&gt;Sanjay&lt;/P&gt;</description>
      <pubDate>Thu, 02 Mar 2023 17:10:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8371#M4033</guid>
      <dc:creator>sanjay</dc:creator>
      <dc:date>2023-03-02T17:10:12Z</dc:date>
    </item>
    <item>
      <title>Re: How can I get date when autoloader processes the file</title>
      <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8372#M4034</link>
      <description>&lt;P&gt;Hi @Sanjay Jain, currently we don't have a way to delete the files automatically. However, we are working on a feature called "CleanSource" which will do this. Currently, it is in private preview. You can explore that option.&lt;/P&gt;&lt;P&gt;The other way is to write a small script that uses the file metadata column information to delete the files periodically.&lt;/P&gt;</description>
      <pubDate>Thu, 02 Mar 2023 19:06:55 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8372#M4034</guid>
      <dc:creator>Lakshay</dc:creator>
      <dc:date>2023-03-02T19:06:55Z</dc:date>
    </item>
    <item>
      <title>Re: How can I get date when autoloader processes the file</title>
      <link>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8373#M4035</link>
      <description>&lt;P&gt;Thank you Lakshay.&lt;/P&gt;</description>
      <pubDate>Fri, 03 Mar 2023 05:47:31 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/how-can-i-get-date-when-autoloader-processes-the-file/m-p/8373#M4035</guid>
      <dc:creator>sanjay</dc:creator>
      <dc:date>2023-03-03T05:47:31Z</dc:date>
    </item>
  </channel>
</rss>

