<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: can we commit offset in spark structured streaming in databricks. in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/71946#M34442</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/64195"&gt;@Nis&lt;/a&gt;&amp;nbsp;,&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Spark&amp;nbsp;&lt;SPAN&gt;Structured Streaming manages which offsets are consumed internally, rather than relying on the Kafka consumer to do it. Spark manages the source offsets and writes them to the Spark streaming query checkpoint.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;So the answer is no, you cannot commit a Kafka offset through a spark structured streaming query.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;Might be worth checking&amp;nbsp;&lt;A href="https://stackoverflow.com/questions/50844449/how-to-manually-set-group-id-and-commit-kafka-offsets-in-spark-structured-stream" target="_blank"&gt;https://stackoverflow.com/questions/50844449/how-to-manually-set-group-id-and-commit-kafka-offsets-in-spark-structured-stream&lt;/A&gt;.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Thu, 06 Jun 2024 19:20:22 GMT</pubDate>
    <dc:creator>raphaelblg</dc:creator>
    <dc:date>2024-06-06T19:20:22Z</dc:date>
    <item>
      <title>can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/71900#M34432</link>
      <description>&lt;P&gt;We are storing offset details in the checkpoint location and wanted to know whether there is a way to commit the offset once we consume the message from Kafka.&lt;/P&gt;</description>
      <pubDate>Thu, 06 Jun 2024 12:50:30 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/71900#M34432</guid>
      <dc:creator>Nis</dc:creator>
      <dc:date>2024-06-06T12:50:30Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/71946#M34442</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/64195"&gt;@Nis&lt;/a&gt;&amp;nbsp;,&amp;nbsp;&lt;/P&gt;
&lt;P&gt;Spark&amp;nbsp;&lt;SPAN&gt;Structured Streaming manages which offsets are consumed internally, rather than relying on the Kafka consumer to do it. Spark manages the source offsets and writes them to the Spark streaming query checkpoint.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;So the answer is no, you cannot commit a Kafka offset through a spark structured streaming query.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;Might be worth checking&amp;nbsp;&lt;A href="https://stackoverflow.com/questions/50844449/how-to-manually-set-group-id-and-commit-kafka-offsets-in-spark-structured-stream" target="_blank"&gt;https://stackoverflow.com/questions/50844449/how-to-manually-set-group-id-and-commit-kafka-offsets-in-spark-structured-stream&lt;/A&gt;.&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 06 Jun 2024 19:20:22 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/71946#M34442</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2024-06-06T19:20:22Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89002#M37668</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97998"&gt;@raphaelblg&lt;/a&gt;&amp;nbsp;, thanks a lot for providing an elaborate answer. Do you happen to know, by any chance, of some solutions that developers use to track consumer lag when streaming with Spark from a Kafka topic? It's rather essential knowledge to have in order to know if more Spark workers are needed, or more resources, etc.&lt;/P&gt;&lt;P&gt;Thanks in advance!&lt;/P&gt;</description>
      <pubDate>Sat, 07 Sep 2024 11:03:19 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89002#M37668</guid>
      <dc:creator>dmytro</dc:creator>
      <dc:date>2024-09-07T11:03:19Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89078#M37684</link>
      <description>&lt;P&gt;Sorry for taking it off-topic, but this behaviour of Databricks storing the offset on its own and not depending on Kafka's offset used to cause the storage to grow by a lot - I am talking some 2-3 DBR versions back - is that still how it is now, or is there any setting that needs to be enabled to fix this? Will it cause any issues with the history? (I do not have any data on this now; it has been a long time since I worked on such a use case)&lt;/P&gt;</description>
      <pubDate>Sun, 08 Sep 2024 15:33:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89078#M37684</guid>
      <dc:creator>ranged_coop</dc:creator>
      <dc:date>2024-09-08T15:33:43Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89200#M37725</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/47980"&gt;@ranged_coop&lt;/a&gt;&amp;nbsp;Regarding your questions:&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;&lt;STRONG&gt;Is there any setting that needs to be enabled to fix this?&lt;BR /&gt;&lt;/STRONG&gt;There is no setting to change this behavior, as it is a design decision and not an issue. Looks like you're referring to checkpointing. These are the docs:&amp;nbsp;&lt;A href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovering-from-failures-with-checkpointing" target="_blank" rel="noopener"&gt;https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovering-from-failures-with-checkpointing&lt;/A&gt;&lt;STRONG&gt;&lt;BR /&gt;&lt;/STRONG&gt;&lt;/LI&gt;
&lt;LI&gt;&lt;STRONG&gt;Will it cause any issues with the history?&lt;BR /&gt;&lt;/STRONG&gt;&lt;SPAN&gt;Spark structured streaming provides exactly-once processing guarantees. How you process the data depends on the logic implemented in your state management.&lt;/SPAN&gt;&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;&lt;STRONG&gt;Structured Streaming Programming Guide:&lt;/STRONG&gt; &lt;A href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide" target="_blank" rel="noopener"&gt;https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#structured-streaming-programming-guide&lt;/A&gt;&lt;/P&gt;</description>
      <pubDate>Mon, 09 Sep 2024 15:44:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89200#M37725</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2024-09-09T15:44:08Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89201#M37726</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/117735"&gt;@dmytro&lt;/a&gt;&amp;nbsp;yes, it's possible to monitor the consumer lag through the streaming query metrics. Every cluster that runs a spark structured streaming query will log the metrics for each streaming batch in the &lt;A href="https://docs.databricks.com/en/compute/clusters-manage.html#compute-driver-and-worker-logs" target="_blank"&gt;driver logs&lt;/A&gt; and &lt;A href="https://docs.databricks.com/en/compute/troubleshooting/debugging-spark-ui.html#debugging-with-the-apache-spark-ui" target="_blank"&gt;Spark UI&lt;/A&gt;. More details at&amp;nbsp;&lt;A href="https://docs.databricks.com/en/structured-streaming/stream-monitoring.html#monitoring-structured-streaming-queries-on-databricks" target="_blank"&gt;Monitoring Structured Streaming queries on Databricks&lt;/A&gt;.&lt;/P&gt;</description>
      <pubDate>Mon, 09 Sep 2024 15:43:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89201#M37726</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2024-09-09T15:43:02Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89258#M37744</link>
      <description>&lt;P&gt;Thanks Raphael! That's helpful. I'll look into the links.&lt;/P&gt;&lt;P&gt;If I could ask you one more question, do you have any references or links to how upscaling and downscaling of the number of workers and cluster size is done for structured streaming? I have a use-case where the amount of data varies drastically at times and I wanted to use the consumer lag to build some scaling logic based on it.&lt;/P&gt;</description>
      <pubDate>Tue, 10 Sep 2024 08:38:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89258#M37744</guid>
      <dc:creator>dmytro</dc:creator>
      <dc:date>2024-09-10T08:38:26Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89310#M37757</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/117735"&gt;@dmytro&lt;/a&gt;,&amp;nbsp;&lt;BR /&gt;&lt;BR /&gt;Autoscaling is managed by Databricks and its logic is mostly automatic. But if you're planning on structured streaming for production, I suggest you go for a fixed amount of workers and limit your streaming query input rate &lt;STRONG&gt;or&lt;/STRONG&gt;&amp;nbsp;create a DLT pipeline that uses enhanced autoscaling.&amp;nbsp;&lt;/P&gt;
&lt;P&gt;This doc covers the production considerations for structured streaming workloads:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/structured-streaming/production.html" target="_blank"&gt;https://docs.databricks.com/en/structured-streaming/production.html.&lt;/A&gt;&lt;/P&gt;
&lt;P&gt;As mentioned in the docs above, when working with compute auto-scaling, the auto-scaling algorithm will have some difficulties scaling down for structured streaming workloads:&lt;/P&gt;
&lt;P&gt;&amp;nbsp;&lt;/P&gt;
&lt;BLOCKQUOTE&gt;&lt;SPAN&gt;&lt;STRONG&gt;Compute auto-scaling has limitations scaling down cluster size for Structured Streaming workloads.&lt;/STRONG&gt; Databricks recommends using Delta Live Tables with Enhanced Autoscaling for streaming workloads. See&amp;nbsp;&lt;/SPAN&gt;&lt;A class="reference internal" href="https://docs.databricks.com/en/delta-live-tables/auto-scaling.html" target="_blank"&gt;&lt;SPAN class="doc"&gt;Optimize the cluster utilization of Delta Live Tables pipelines with Enhanced Autoscaling&lt;/SPAN&gt;&lt;/A&gt;&lt;SPAN&gt;.&lt;/SPAN&gt;&lt;BR /&gt;&lt;HR /&gt;&lt;/BLOCKQUOTE&gt;
&lt;P&gt;Compute auto-scaling docs:&amp;nbsp;&lt;A href="https://docs.databricks.com/en/compute/configure.html#benefits-of-autoscaling" target="_self"&gt;https://docs.databricks.com/en/compute/configure.html#benefits-of-autoscaling&lt;/A&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 10 Sep 2024 14:38:16 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89310#M37757</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2024-09-10T14:38:16Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89312#M37758</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/47980"&gt;@ranged_coop&lt;/a&gt;&amp;nbsp;In addition to my previous message, c&lt;SPAN&gt;heckpointing&amp;nbsp;&lt;/SPAN&gt;&lt;SPAN&gt;is not a Databricks behavior as you said; checkpointing is part of open source Spark Structured Streaming.&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Tue, 10 Sep 2024 14:42:28 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/89312#M37758</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2024-09-10T14:42:28Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/104826#M41895</link>
      <description>&lt;P&gt;hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/97998"&gt;@raphaelblg&lt;/a&gt;&amp;nbsp;! a quick question: is it possible to write data from a DLT to a Kafka topic? Is this functionality implemented? I've seen that there is a new create_sink() function, but I cannot find any information about it.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2025 02:40:00 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/104826#M41895</guid>
      <dc:creator>dmytro</dc:creator>
      <dc:date>2025-01-09T02:40:00Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/105036#M41975</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/117735"&gt;@dmytro&lt;/a&gt;&amp;nbsp;yes, but this feature is currently in Private Preview. Please submit a support case in&amp;nbsp;&lt;A href="https://help.databricks.com/s/" target="_blank"&gt;https://help.databricks.com/s/&lt;/A&gt; if you have interest in trying out this new feature.&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2025 18:21:12 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/105036#M41975</guid>
      <dc:creator>raphaelblg</dc:creator>
      <dc:date>2025-01-09T18:21:12Z</dc:date>
    </item>
    <item>
      <title>Re: can we commit offset in spark structured streaming in databricks.</title>
      <link>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/105082#M41989</link>
      <description>&lt;P&gt;thanks Raphael, i'll do so.&lt;/P&gt;</description>
      <pubDate>Thu, 09 Jan 2025 21:08:43 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/can-we-commit-offset-in-spark-structured-streaming-in-databricks/m-p/105082#M41989</guid>
      <dc:creator>dmytro</dc:creator>
      <dc:date>2025-01-09T21:08:43Z</dc:date>
    </item>
  </channel>
</rss>

