<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Setting up observability for serverless Databricks in Administration &amp; Architecture</title>
    <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129043#M3918</link>
    <description>&lt;P&gt;Databricks deployed on AWS Platform&lt;/P&gt;</description>
    <pubDate>Wed, 20 Aug 2025 19:43:15 GMT</pubDate>
    <dc:creator>APJESK</dc:creator>
    <dc:date>2025-08-20T19:43:15Z</dc:date>
    <item>
      <title>Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129041#M3916</link>
      <description>&lt;P&gt;I’m looking for best practices and guidance on setting up &lt;STRONG&gt;observability for serverless Databricks&lt;/STRONG&gt;. Specifically, I’d like to know:&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;How to capture and monitor &lt;STRONG&gt;system-level metrics&lt;/STRONG&gt; (CPU, memory, network, disk) in a serverless setup.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;How to configure and collect &lt;STRONG&gt;application metrics&lt;/STRONG&gt; (e.g., using Spark listeners, StreamingQueryListener, QueryExecutionListener).&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Best way to manage and forward &lt;STRONG&gt;logs&lt;/STRONG&gt; (driver logs, executor logs, audit logs, event logs) in a serverless environment.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Recommended approaches for integrating with external monitoring tools like &lt;STRONG&gt;Amazon CloudWatch, Datadog, or SIEM platforms&lt;/STRONG&gt;.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Suggestions for building &lt;STRONG&gt;dashboards, alerts, and anomaly detection&lt;/STRONG&gt; to ensure end-to-end observability.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;P&gt;If anyone has implemented observability for &lt;STRONG&gt;serverless Databricks workloads&lt;/STRONG&gt;, I’d really appreciate insights into the architecture, tools used, and lessons learned.&lt;/P&gt;&lt;P&gt;Thanks in advance for your help!&lt;/P&gt;</description>
      <pubDate>Wed, 20 Aug 2025 19:31:01 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129041#M3916</guid>
      <dc:creator>APJESK</dc:creator>
      <dc:date>2025-08-20T19:31:01Z</dc:date>
    </item>
    <item>
      <title>Re: Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129043#M3918</link>
      <description>&lt;P&gt;Databricks deployed on AWS Platform&lt;/P&gt;</description>
      <pubDate>Wed, 20 Aug 2025 19:43:15 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129043#M3918</guid>
      <dc:creator>APJESK</dc:creator>
      <dc:date>2025-08-20T19:43:15Z</dc:date>
    </item>
    <item>
      <title>Re: Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129049#M3920</link>
      <description>&lt;P class=""&gt;&lt;STRONG&gt;Here are few recommended methods:&lt;/STRONG&gt;&lt;/P&gt;&lt;P class=""&gt;&amp;nbsp;&lt;/P&gt;&lt;OL class=""&gt;&lt;LI&gt;How to capture and monitor&amp;nbsp;&lt;STRONG&gt;system-level metrics&lt;/STRONG&gt;&amp;nbsp;(CPU, memory, network, disk) in a serverless setup.&lt;UL&gt;&lt;LI&gt;In serverless you don’t have host access (no node agents, no Ganglia). Treat the &lt;STRONG&gt;workspace/platform&lt;/STRONG&gt;&lt;SPAN&gt; as your “system” and monitor via&lt;/SPAN&gt;&lt;UL class="lia-list-style-type-square"&gt;&lt;LI&gt;&lt;STRONG&gt;Databricks system tables&lt;/STRONG&gt; for platform &amp;amp; job health (enable once per workspace). These are first-party tables in system.* you can query from any workspace. Start here for account activity, jobs, and Spark events.&lt;/LI&gt;&lt;LI&gt;&lt;STRONG&gt;Audit logs&lt;/STRONG&gt; (low latency delivery to S3 and/or system table system.access.audit) to track who did what, when, from where—great for availability, security, and change correlation.&lt;/LI&gt;&lt;LI&gt;You can also use API to collect the metrices.&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;LI-CODE lang="python"&gt;import requests

def get_cluster_metrics(workspace_url, token, cluster_id):
    headers = {"Authorization": f"Bearer {token}"}
    
    # Get recent cluster events (this endpoint expects a POST with a JSON body)
    events_url = f"{workspace_url}/api/2.0/clusters/events"
    events_response = requests.post(
        events_url,
        headers=headers,
        json={"cluster_id": cluster_id}
    )
    
    # Get cluster details (configuration and lifecycle state, not live utilization)
    details_url = f"{workspace_url}/api/2.0/clusters/get"
    details_response = requests.get(
        details_url, 
        headers=headers, 
        params={"cluster_id": cluster_id}
    )
    
    return {
        "events": events_response.json(),
        "details": details_response.json()
    }&lt;/LI-CODE&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;How to configure and collect&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;application metrics&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;(e.g., using Spark listeners, StreamingQueryListener, QueryExecutionListener).&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Spark listeners&lt;/STRONG&gt; (e.g., SparkListener, QueryExecutionListener) and &lt;STRONG&gt;StreamingQueryListener&lt;/STRONG&gt; work in serverless because they’re application-level. Register them in your notebook/job and push metrics out (HTTP to a gateway, StatsD, or directly to a lakehouse table). Note that some runtime/version caveats exist by language; check your DBR version.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;For &lt;STRONG&gt;structured streaming&lt;/STRONG&gt;, attach a StreamingQueryListener to publish input rows/sec, batch duration, state ops, watermark, and last progress. Persist to Delta tables or ship to CloudWatch/Datadog via HTTPS from the driver.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;For SQL workloads, use &lt;STRONG&gt;SQL warehouse&lt;/STRONG&gt; built-in telemetry plus query history tables to compute p95 latency, scan bytes, and error rates (if you also run serverless SQL).&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;Sample code for a StreamingQueryListener implementation follows below.&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;LI-CODE lang="python"&gt;from pyspark.sql.streaming import StreamingQueryListener
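# NOTE: the Python StreamingQueryListener API requires PySpark 3.4+ / a
# recent DBR, and boto3 assumes the driver has AWS credentials allowed to
# call cloudwatch:PutMetricData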
import boto3

class CustomStreamingQueryListener(StreamingQueryListener):
    def __init__(self, cloudwatch_client):
        self.cloudwatch = cloudwatch_client
    
    def onQueryStarted(self, event):
        """Track query start events"""
        self.cloudwatch.put_metric_data(
            Namespace='Databricks/Streaming',
            MetricData=[
                {
                    'MetricName': 'StreamingQueryStarted',
                    'Value': 1,
                    'Unit': 'Count',
                    'Dimensions': [
                        {'Name': 'QueryId', 'Value': str(event.id)},
                        {'Name': 'QueryName', 'Value': event.name or 'unnamed'}
                    ]
                }
            ]
        )
    
    def onQueryProgress(self, event):
        """Track query progress metrics"""
        progress = event.progress
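        # progress is a StreamingQueryProgress; batchId, durationMs,
        # stateOperators, and sources are also available for richer telemetry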
        
        # inputRowsPerSecond / processedRowsPerSecond are NaN for the first
        # micro-batch; CloudWatch rejects NaN, so fall back to 0.0
        def safe(value):
            return 0.0 if value is None or value != value else value

        metrics = [
            {
                'MetricName': 'InputRowsPerSecond',
                'Value': safe(progress.inputRowsPerSecond),
                'Unit': 'Count/Second'
            },
            {
                'MetricName': 'ProcessedRowsPerSecond',
                'Value': safe(progress.processedRowsPerSecond),
                'Unit': 'Count/Second'
            },
            {
                'MetricName': 'BatchDuration',
                'Value': progress.batchDuration,
                'Unit': 'Milliseconds'
            }
        ]
        
        self.cloudwatch.put_metric_data(
            Namespace='Databricks/Streaming',
            MetricData=metrics
        )
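        # Consider adding Dimensions (e.g., the query name) to these metrics
        # so individual streams can be filtered in CloudWatch dashboards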
    
    def onQueryTerminated(self, event):
        """Track query termination"""
        status = "Success" if event.exception is None else "Failed"
        self.cloudwatch.put_metric_data(
            Namespace='Databricks/Streaming',
            MetricData=[
                {
                    'MetricName': 'StreamingQueryCompleted',
                    'Value': 1,
                    'Unit': 'Count',
                    'Dimensions': [
                        {'Name': 'Status', 'Value': status}
                    ]
                }
            ]
        )

# Register the listener
cloudwatch = boto3.client('cloudwatch')
listener = CustomStreamingQueryListener(cloudwatch)
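# The listener receives events for every streaming query in this session;
# detach it later with spark.streams.removeListener(listener)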
spark.streams.addListener(listener)&lt;/LI-CODE&gt;&lt;UL&gt;&lt;LI&gt;QueryExecutionListener (a JVM interface; the sketch below drives it from Python via py4j)&lt;/LI&gt;&lt;/UL&gt;&lt;LI-CODE lang="python"&gt;# NOTE: QueryExecutionListener is a JVM (Scala) interface with no PySpark
# class; this sketch implements it through py4j callbacks, which requires
# the py4j callback server to be running on the driver.
import logging

class MetricsQueryExecutionListener:
    def __init__(self, metrics_client):
        self.metrics_client = metrics_client
        self.logger = logging.getLogger(__name__)
    
    def onSuccess(self, funcName, qe, durationNs):
        """Track successful query executions"""
        duration_ms = durationNs / 1000000
        
        # qe is a JVM QueryExecution handle, so use JVM-style method calls;
        # child count of the root physical node is a rough complexity proxy
        stages = qe.executedPlan().children().size()
        
        metrics = {
            'query_duration_ms': duration_ms,
            'query_stages': stages,
            'function_name': funcName,
            'success': 1
        }
        
        self.metrics_client.send_metrics(metrics)
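        # metrics_client / send_metrics are placeholders for whatever sink
        # you use (CloudWatch, StatsD, a Delta table); not a real library API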
        
    def onFailure(self, funcName, qe, exception):
        """Track failed query executions"""
        metrics = {
            'function_name': funcName,
            'failure': 1,
            'error_type': exception.getClass().getSimpleName()
        }
        
        self.metrics_client.send_metrics(metrics)
        self.logger.error(f"Query failed: {funcName}, Error: {exception}")

    class Java:
        # py4j marker declaring which JVM interface this object implements
        implements = ["org.apache.spark.sql.util.QueryExecutionListener"]

# Register the listener
metrics_listener = MetricsQueryExecutionListener(your_metrics_client)
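# Registration must go through the JVM session handle (PySpark has no
# spark.listenerManager); your_metrics_client above is a placeholder sink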
spark._jsparkSession.listenerManager().register(metrics_listener)&lt;/LI-CODE&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;Best way to manage and forward&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;logs&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;(driver logs, executor logs, audit logs, event logs) in a serverless environment.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Audit logs&lt;/STRONG&gt; → authoritative security/ops timeline&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Recommended path&lt;/STRONG&gt;: use the &lt;STRONG&gt;audit log system table&lt;/STRONG&gt; (system.access.audit) for querying; optionally configure &lt;STRONG&gt;S3 delivery&lt;/STRONG&gt; (near-real-time JSON) for downstream tools and SIEMs.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Recommended approaches for integrating with external monitoring tools like&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;Amazon CloudWatch, Datadog, or SIEM platforms&lt;/STRONG&gt;.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;STRONG&gt;Datadog&lt;/STRONG&gt;: Datadog’s &lt;STRONG&gt;Data Jobs Monitoring&lt;/STRONG&gt; now &lt;STRONG&gt;supports serverless Databricks jobs and serverless SQL&lt;/STRONG&gt;; it correlates job health, query issues, and cost. Pair it with the Databricks integration and (for non-serverless) the Agent; for serverless, use API-based ingestion and the system tables.&lt;/LI&gt;&lt;/OL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Suggestions for building&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;dashboards, alerts, and anomaly detection&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;to ensure end-to-end observability.&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Reliability&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;Job failure rate, mean time to recovery → system tables (jobs) + audit logs.&lt;/P&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;Streaming freshness (max event age, batch duration) → StreamingQueryListener + Delta table.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Performance&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;SQL query latency &amp;amp; scan size → system tables / SQL telemetry.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Cost&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;DBUs by job/owner/workflow, $/successful run → system tables + your billing exports (or Datadog Cloud Cost Management).&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;LI&gt;&lt;P&gt;&lt;STRONG&gt;Security &amp;amp; governance&lt;/STRONG&gt;&lt;/P&gt;&lt;UL&gt;&lt;LI&gt;&lt;P&gt;Admin actions, permission changes, token/credential events, external location changes → system.access.audit.&lt;/P&gt;&lt;/LI&gt;&lt;/UL&gt;&lt;/LI&gt;&lt;/OL&gt;&lt;/LI&gt;&lt;/UL&gt;</description>
      <pubDate>Wed, 20 Aug 2025 23:30:07 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129049#M3920</guid>
      <dc:creator>nayan_wylde</dc:creator>
      <dc:date>2025-08-20T23:30:07Z</dc:date>
    </item>
    <item>
      <title>Re: Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129050#M3921</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/170854"&gt;@APJESK&lt;/a&gt;&amp;nbsp;Serverless is designed to relieve DevOps teams from monitoring these types of metrics. You should be able to track the cost and usage with system tables&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 21 Aug 2025 00:24:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129050#M3921</guid>
      <dc:creator>Sharanya13</dc:creator>
      <dc:date>2025-08-21T00:24:44Z</dc:date>
    </item>
    <item>
      <title>Re: Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129054#M3922</link>
      <description>&lt;P&gt;Thank you very much, I will go through your solution and get back to you, If I have any doubts.&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 21 Aug 2025 03:45:45 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129054#M3922</guid>
      <dc:creator>APJESK</dc:creator>
      <dc:date>2025-08-21T03:45:45Z</dc:date>
    </item>
    <item>
      <title>Re: Setting up observability for serverless Databricks</title>
      <link>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129055#M3923</link>
      <description>&lt;P&gt;Ok, Got it I will start explore it and get back to you.&lt;/P&gt;</description>
      <pubDate>Thu, 21 Aug 2025 03:48:46 GMT</pubDate>
      <guid>https://community.databricks.com/t5/administration-architecture/setting-up-observability-for-serverless-databricks/m-p/129055#M3923</guid>
      <dc:creator>APJESK</dc:creator>
      <dc:date>2025-08-21T03:48:46Z</dc:date>
    </item>
  </channel>
</rss>

