<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Autoloader cloudFiles.maxFilesPerTrigger ignored with .trigger(availableNow=True)? in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/112798#M44333</link>
    <description>&lt;P&gt;Hi,&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm using the &lt;STRONG&gt;Auto Loader&lt;/STRONG&gt; feature to read streaming data from Delta Lake files and process them in a batch. The trigger is set to &lt;STRONG&gt;availableNow&lt;/STRONG&gt; to include all new data from the checkpoint offset but I limit the number of delta files for the batch to be 10 using the &lt;STRONG&gt;cloudFiles.maxFilesPerTrigger&lt;/STRONG&gt; option. However, the&amp;nbsp; `process_batch` function always reports that it receives the default 1000 files for its batch. Am I misinterpreting the options here?&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col


def process_batch(df: DataFrame, batch_id: int) -&amp;gt; None:
    batch_id: int = batch_id
    num_files: int = df.select("source_file").distinct().count()
    num_rows_total: int = df.count()

    print(
        f"Batch: '{batch_id}' - Processing {num_files:,} delta files with {num_rows_total:,} rows."
    )


spark_session: SparkSession = SparkSession.getActiveSession()

checkpoint_path: str = "/Volumes/checkpoint_path"
table_path: str = "/Volumes/table_path"

df: DataFrame = (
    spark_session.readStream.format(source="delta")
    .option(key="cloudFiles.format", value="delta")
    .option(key="cloudFiles.schemaLocation", value=checkpoint_path)
    .option(key="cloudFiles.maxFilesPerTrigger", value=10)
    .load(path=table_path)
    .select("*", col("_metadata.file_path").alias("source_file"))
)

df.writeStream.trigger(
    availableNow=True
).foreachBatch(func=process_batch).start()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Mon, 17 Mar 2025 12:26:23 GMT</pubDate>
    <dc:creator>johschmidt42</dc:creator>
    <dc:date>2025-03-17T12:26:23Z</dc:date>
    <item>
      <title>Autoloader cloudFiles.maxFilesPerTrigger ignored with .trigger(availableNow=True)?</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/112798#M44333</link>
      <description>&lt;P&gt;Hi,&amp;nbsp;&lt;/P&gt;&lt;P&gt;I'm using the &lt;STRONG&gt;Auto Loader&lt;/STRONG&gt; feature to read streaming data from Delta Lake files and process them in a batch. The trigger is set to &lt;STRONG&gt;availableNow&lt;/STRONG&gt; to include all new data from the checkpoint offset but I limit the number of delta files for the batch to be 10 using the &lt;STRONG&gt;cloudFiles.maxFilesPerTrigger&lt;/STRONG&gt; option. However, the&amp;nbsp; `process_batch` function always reports that it receives the default 1000 files for its batch. Am I misinterpreting the options here?&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col


def process_batch(df: DataFrame, batch_id: int) -&amp;gt; None:
    batch_id: int = batch_id
    num_files: int = df.select("source_file").distinct().count()
    num_rows_total: int = df.count()

    print(
        f"Batch: '{batch_id}' - Processing {num_files:,} delta files with {num_rows_total:,} rows."
    )


spark_session: SparkSession = SparkSession.getActiveSession()

checkpoint_path: str = "/Volumes/checkpoint_path"
table_path: str = "/Volumes/table_path"

df: DataFrame = (
    spark_session.readStream.format(source="delta")
    .option(key="cloudFiles.format", value="delta")
    .option(key="cloudFiles.schemaLocation", value=checkpoint_path)
    .option(key="cloudFiles.maxFilesPerTrigger", value=10)
    .load(path=table_path)
    .select("*", col("_metadata.file_path").alias("source_file"))
)

df.writeStream.trigger(
    availableNow=True
).foreachBatch(func=process_batch).start()&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 17 Mar 2025 12:26:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/112798#M44333</guid>
      <dc:creator>johschmidt42</dc:creator>
      <dc:date>2025-03-17T12:26:23Z</dc:date>
    </item>
    <item>
      <title>Re: Autoloader cloudFiles.maxFilesPerTrigger ignored with .trigger(availableNow=True)?</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/112846#M44349</link>
      <description>&lt;P&gt;It works when changing "cloudFiles.maxFilesPerTrigger" to "maxFilesPerTrigger". But this is unexpected.&lt;/P&gt;</description>
      <pubDate>Mon, 17 Mar 2025 20:55:36 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/112846#M44349</guid>
      <dc:creator>johschmidt42</dc:creator>
      <dc:date>2025-03-17T20:55:36Z</dc:date>
    </item>
    <item>
      <title>Re: Autoloader cloudFiles.maxFilesPerTrigger ignored with .trigger(availableNow=True)?</title>
      <link>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/113687#M44609</link>
      <description>&lt;P&gt;In doc it is: "&lt;SPAN&gt;cloudFiles.maxFilesPerTrigger" &lt;span class="lia-unicode-emoji" title=":confused_face:"&gt;😕&lt;/span&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;A href="https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options" target="_blank"&gt;https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options&lt;/A&gt;&lt;BR /&gt;&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Mar 2025 15:40:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/autoloader-cloudfiles-maxfilespertrigger-ignored-with-trigger/m-p/113687#M44609</guid>
      <dc:creator>p_romm</dc:creator>
      <dc:date>2025-03-26T15:40:33Z</dc:date>
    </item>
  </channel>
</rss>

