<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Data processing metrics in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/111159#M43815</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/106294"&gt;@Alberto_Umana&lt;/a&gt;&amp;nbsp;thanks for the reply. Without making the code change you suggested, is it possible to extract these metrics from the logs?&lt;/P&gt;</description>
    <pubDate>Tue, 25 Feb 2025 19:09:10 GMT</pubDate>
    <dc:creator>noorbasha534</dc:creator>
    <dc:date>2025-02-25T19:09:10Z</dc:date>
    <item>
      <title>Data processing metrics</title>
      <link>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/108310#M43031</link>
      <description>&lt;P&gt;Dear all,&lt;/P&gt;&lt;P&gt;What are some proven ways of capturing data processing metrics (number of rows processed/updated/inserted, number of micro-batches, etc.) in a PySpark/SQL notebook, irrespective of whether it uses Auto Loader, Structured Streaming, DLT, etc.?&lt;/P&gt;&lt;P&gt;At the moment, we profile the tables with the "DESCRIBE HISTORY" command and capture these metrics as a reactive step (see the sketch below). But I would like to achieve the same in real time and build an operations dashboard, so operators can proactively find tables with high latency on a particular day (a sudden increase in record volume can impact a whole chain of downstream tables).&lt;/P&gt;
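&lt;P&gt;For context, the reactive check we run today looks roughly like this. The table name is illustrative, and the operationMetrics keys vary by operation type:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql import functions as F

# Delta Lake records per-commit metrics in the table history;
# operationMetrics is a map whose keys depend on the operation.
hist = spark.sql("DESCRIBE HISTORY my_catalog.my_schema.my_table")

(hist
 .filter(F.col("operation").isin("WRITE", "MERGE", "STREAMING UPDATE"))
 .select(
     "version",
     "timestamp",
     "operation",
     F.col("operationMetrics")["numOutputRows"].alias("num_output_rows"),
 )
 .show(truncate=False))&lt;/CODE&gt;&lt;/PRE&gt;
&lt;P&gt;Appreciate your thoughts.&lt;/P&gt;&lt;P&gt;Br,&lt;/P&gt;&lt;P&gt;Noor.&lt;/P&gt;</description>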
      <pubDate>Sat, 01 Feb 2025 17:55:13 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/108310#M43031</guid>
      <dc:creator>noorbasha534</dc:creator>
      <dc:date>2025-02-01T17:55:13Z</dc:date>
    </item>
    <item>
      <title>Re: Data processing metrics</title>
      <link>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/108333#M43039</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/124839"&gt;@noorbasha534&lt;/a&gt;,&lt;/P&gt;
&lt;P&gt;You can use the &lt;CODE&gt;StreamingQueryListener&lt;/CODE&gt; interface to capture metrics such as the number of input rows, processing time, and batch duration. It can be integrated into your PySpark code to log these metrics in real time.&lt;/P&gt;
&lt;P&gt;Example:&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;from pyspark.sql.streaming import StreamingQueryListener

class MyListener(StreamingQueryListener):
    def onQueryStarted(self, event):
        print(f"Query started: {event.id}")

    def onQueryProgress(self, event):
        # event.progress carries per-batch metrics such as numInputRows,
        # inputRowsPerSecond, and batchId
        print(f"Query made progress: {event.progress}")

    def onQueryTerminated(self, event):
        print(f"Query terminated: {event.id}")

# Register the listener before starting the query
spark.streams.addListener(MyListener())

df = spark.readStream.format("cloudFiles") \
    .option("cloudFiles.format", "parquet") \
    .load("s3://your-bucket/path")

query = df.writeStream.format("delta") \
    .option("checkpointLocation", "s3://your-bucket/checkpoints") \
    .start("s3://your-bucket/output")&lt;/CODE&gt;&lt;/PRE&gt;
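&lt;P&gt;If you want these numbers on an operations dashboard, one option is to append each batch's progress to a Delta table from the listener and build the dashboard on top of it. A minimal sketch, assuming a metrics table of your choosing (the name and schema here are just an example):&lt;/P&gt;
&lt;PRE&gt;&lt;CODE&gt;class MetricsToDeltaListener(StreamingQueryListener):
    def onQueryStarted(self, event):
        pass

    def onQueryProgress(self, event):
        p = event.progress
        # Persist one row per micro-batch; the dashboard queries this table
        spark.createDataFrame(
            [(str(p.id), p.name, p.batchId, p.numInputRows, p.timestamp)],
            "query_id string, query_name string, batch_id long, num_input_rows long, ts string",
        ).write.mode("append").saveAsTable("ops.streaming_metrics")

    def onQueryTerminated(self, event):
        pass&lt;/CODE&gt;&lt;/PRE&gt;</description>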
      <pubDate>Sun, 02 Feb 2025 02:37:27 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/108333#M43039</guid>
      <dc:creator>Alberto_Umana</dc:creator>
      <dc:date>2025-02-02T02:37:27Z</dc:date>
    </item>
    <item>
      <title>Re: Data processing metrics</title>
      <link>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/111159#M43815</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/106294"&gt;@Alberto_Umana&lt;/a&gt;&amp;nbsp;thanks for the reply. Without making the code change you suggested, is it possible to extract these metrics from the logs?&lt;/P&gt;</description>
      <pubDate>Tue, 25 Feb 2025 19:09:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/data-processing-metrics/m-p/111159#M43815</guid>
      <dc:creator>noorbasha534</dc:creator>
      <dc:date>2025-02-25T19:09:10Z</dc:date>
    </item>
  </channel>
</rss>