<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log) in Data Engineering</title>
    <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157705#M54605</link>
    <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102072"&gt;@aleksandra_ch&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;Unfortunately, the &lt;EM&gt;&lt;STRONG&gt;Volumes&lt;/STRONG&gt;&lt;/EM&gt; directory is only accessible through the Databricks interface. At the OS level, it is not accessible, even when running as &lt;FONT color="#000000"&gt;&lt;STRONG&gt;root&lt;/STRONG&gt;&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;(FUSE limitation).&lt;/P&gt;&lt;P&gt;I have also observed intermittent periods where, for certain job runs, logs are not updated in Volumes at all, specifically they are not copied from&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;&lt;EM&gt;/databricks/driver/logs&lt;/EM&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;to &lt;EM&gt;&lt;STRONG&gt;Volumes&lt;/STRONG&gt;&lt;/EM&gt;.&lt;BR /&gt;To mitigate this, I implemented an alternative approach that performs direct log synchronization to S3 (the same backing location used by the Volume) at a predefined interval.&lt;/P&gt;&lt;P&gt;It would be very helpful to have either:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;A &lt;STRONG&gt;shutdown_script&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;feature as an alternative to &lt;STRONG&gt;init_script&lt;/STRONG&gt;, or&lt;/LI&gt;&lt;LI&gt;A built-in log synchronization process executed automatically before cluster termination.&lt;BR /&gt;&lt;BR /&gt;Please find the script used below:&lt;/LI&gt;&lt;/OL&gt;&lt;LI-CODE lang="javascript"&gt;#!/bin/bash
set -e

# ===============================================================
# 1. BASIC CONFIGURATION (S3)
# ===============================================================
S3_BUCKET="&amp;lt;&amp;lt;bucket&amp;gt;&amp;gt;"
S3_BASE_PREFIX="data/&amp;lt;&amp;lt;catalog&amp;gt;&amp;gt;/&amp;lt;&amp;lt;schemas&amp;gt;&amp;gt;/__unitystorage/schemas/&amp;lt;&amp;lt;schema_id&amp;gt;&amp;gt;/volumes/&amp;lt;&amp;lt;volume_id&amp;gt;&amp;gt;"

# Automatically fetch cluster metadata at startup
CLUSTER_ID="${DB_CLUSTER_ID:-unknown_cluster}"
DT=$(date +%Y-%m-%d)
HH=$(date +%H)
MM=$(date +%M)

# ===============================================================
# 2. CREATE THE PYTHON UTILITY FOR DIRECT S3 SYNC
# ===============================================================
cat &amp;lt;&amp;lt; 'EOF' &amp;gt; /usr/local/bin/sync_single_run.py
import boto3
import os
import sys

bucket_name = sys.argv[1]
base_prefix = sys.argv[2]
cluster_id = sys.argv[3]
dt = sys.argv[4]
hh = sys.argv[5]
mm = sys.argv[6]

s3_target_prefix = f"{base_prefix}/{cluster_id}/driver"
local_log_dir = "/databricks/driver/logs"
log_files = ["stdout", "stderr", "log4j-active.log", "stacktrace.log"]

s3 = boto3.client('s3')

def get_s3_name(file_name):
    if file_name == "log4j-active.log": return f"log4j-{dt}-{hh}-{mm}.log"
    if file_name == "stacktrace.log": return f"{dt}-{hh}-{mm}.stacktrace.log"
    if file_name == "stdout": return f"stdout--{dt}--{hh}-{mm}.log"
    if file_name == "stderr": return f"stderr--{dt}--{hh}-{mm}.log"
    return file_name

for file_name in log_files:
    local_path = os.path.join(local_log_dir, file_name)
    if os.path.exists(local_path) and os.path.getsize(local_path) &amp;gt; 0:
        s3_key = f"{s3_target_prefix}/{get_s3_name(file_name)}"
        try:
            s3.upload_file(local_path, bucket_name, s3_key)
        except:
            pass  # Ignore temporary errors to avoid blocking script execution
EOF

chmod +x /usr/local/bin/sync_single_run.py

# ===============================================================
# 3. CREATE AND START THE BASH DAEMON (LIVE SYNC WATCHER)
# ===============================================================
cat &amp;lt;&amp;lt; EOF &amp;gt; /tmp/run_daemon.sh
#!/bin/bash
# Let the cluster finish its initial boot phase before the first sync
sleep 30

while true; do
  python3 /usr/local/bin/sync_single_run.py "$S3_BUCKET" "$S3_BASE_PREFIX" "$CLUSTER_ID" "$DT" "$HH" "$MM" &amp;gt; /dev/null 2&amp;gt;&amp;amp;1
    sleep 300  # Run every 300 seconds
done
EOF

chmod +x /tmp/run_daemon.sh

# Launch the daemon in the background as an independent process
nohup /bin/bash /tmp/run_daemon.sh &amp;gt; /dev/null 2&amp;gt;&amp;amp;1 &amp;amp;

echo "The permanent Live Sync system (300s) with Databricks pattern was installed successfully!"&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 27 May 2026 05:50:10 GMT</pubDate>
    <dc:creator>ccsalt</dc:creator>
    <dc:date>2026-05-27T05:50:10Z</dc:date>
    <item>
      <title>Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log)</title>
      <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157235#M54522</link>
      <description>&lt;P&gt;Saving logs from an all-purpose cluster to Volume or S3 is not consistent, because&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;stderr,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;stdout, and&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;log4j-active.log&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;get overwritten when the cluster is restarted between minutes&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;01&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;and&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;59.&lt;/P&gt;&lt;P&gt;Tested case:&lt;BR /&gt;A job is configured to start every 20 minutes, for example:&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;10:10 -&amp;gt; 10:30 -&amp;gt; 10:50.&lt;BR /&gt;Cluster logs (stderr,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;stdout,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;log4j-active.log) are overwritten at each restart, because the cluster does not reach the exact hour (10:00,&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;11:00) when automatic log rotation happens.&lt;/P&gt;&lt;P&gt;In the Databricks UI, the logs appear later, but with the same name (for example&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;log4j-active.log).&lt;BR /&gt;The issue is that, although they seem visible in the UI, in Volume the files are overwritten and information is lost.&lt;/P&gt;&lt;P&gt;Does anyone have an idea of how I can still preserve all logs?&lt;/P&gt;&lt;P&gt;Thanks!&lt;/P&gt;</description>
      <pubDate>Tue, 19 May 2026 09:45:52 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157235#M54522</guid>
      <dc:creator>ccsalt</dc:creator>
      <dc:date>2026-05-19T09:45:52Z</dc:date>
    </item>
    <item>
      <title>Re: Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log)</title>
      <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157421#M54553</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/230406"&gt;@ccsalt&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;
&lt;P&gt;This is a known limitation.&amp;nbsp;Log rotation&lt;SPAN&gt;&amp;nbsp;(renaming to&amp;nbsp;&lt;/SPAN&gt;&lt;CODE&gt;log4j-YYYY-MM-DD-HH.log.gz&lt;/CODE&gt;&lt;SPAN&gt;) only happens&amp;nbsp;&lt;/SPAN&gt;on the hour boundary. The active log file&amp;nbsp;&lt;STRONG&gt;&lt;CODE&gt;log4j-active.log&lt;/CODE&gt;&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;always has the same name and is overwritten if a cluster restart happens within the same hour.&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;As a workaround:&lt;/SPAN&gt;&lt;/P&gt;
&lt;UL&gt;
&lt;LI&gt;&lt;SPAN&gt;Use a cluster-scoped &lt;A href="https://docs.databricks.com/aws/en/init-scripts/cluster-scoped" target="_self"&gt;init script&lt;/A&gt; to rename active log file before the cluster starts;&lt;/SPAN&gt;&lt;/LI&gt;
&lt;LI&gt;&lt;SPAN&gt;Or switch to a job cluster, if possible. Each run will write its logs to a separate folder.&lt;/SPAN&gt;&lt;/LI&gt;
&lt;/UL&gt;
&lt;P&gt;&lt;SPAN&gt;Hope it helps.&lt;/SPAN&gt;&lt;/P&gt;
&lt;P&gt;&lt;SPAN&gt;Best regards,&lt;/SPAN&gt;&lt;/P&gt;</description>
      <pubDate>Thu, 21 May 2026 14:56:06 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157421#M54553</guid>
      <dc:creator>aleksandra_ch</dc:creator>
      <dc:date>2026-05-21T14:56:06Z</dc:date>
    </item>
    <item>
      <title>Re: Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log)</title>
      <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157485#M54571</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&amp;nbsp;&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102072"&gt;@aleksandra_ch&lt;/a&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Do you happen to have an example for the workaround “Use a cluster-scoped init script to rename the active log file before the cluster starts”?&lt;/P&gt;&lt;P&gt;I’ve tried many variants, but without success. I keep running into an “&lt;EM&gt;operation not permitted&lt;/EM&gt;” error because the &lt;STRONG&gt;Volumes&lt;/STRONG&gt; directory is empty at startup (more precisely, during the init script phase).&lt;/P&gt;&lt;P&gt;I also tried using&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;/databricks/driver/&lt;/STRONG&gt;, but the&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;&lt;STRONG&gt;logs&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;directory is not available until about 1 minute after startup. When it does become available, it only contains&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;stdout&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;and&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;stderr. By the time &lt;STRONG&gt;log4j-active.log&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;appears, it has already been reset.&lt;BR /&gt;&lt;BR /&gt;Thank you!&lt;/P&gt;</description>
      <pubDate>Fri, 22 May 2026 11:26:51 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157485#M54571</guid>
      <dc:creator>ccsalt</dc:creator>
      <dc:date>2026-05-22T11:26:51Z</dc:date>
    </item>
    <item>
      <title>Re: Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log)</title>
      <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157513#M54575</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/230406"&gt;@ccsalt&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;
&lt;P&gt;Can you try this one:&lt;/P&gt;
&lt;LI-CODE lang="markup"&gt;#!/bin/bash
# Cluster log preservation init script.
#
# Cluster log delivery overwrites stdout/stderr/log4j-active.log/stacktrace.log
# on every cluster restart. This script runs at startup and makes a timestamped
# copy of those files so the previous session's logs are kept.
#
# To use: edit LOG_BASE below to match your cluster's cluster_log_conf destination,
# then attach this script as an init script.

set -uo pipefail

# &amp;gt;&amp;gt;&amp;gt; EDIT THIS to your cluster_log_conf destination &amp;lt;&amp;lt;&amp;lt;
LOG_BASE="/Volumes/&amp;lt;catalog&amp;gt;/&amp;lt;schema&amp;gt;/&amp;lt;volume&amp;gt;/&amp;lt;subdir&amp;gt;"

TS="$(date -u +%Y%m%dT%H%M%SZ)"
DRIVER_LOG_DIR="${LOG_BASE}/${DB_CLUSTER_ID}/driver"

echo "[preserve_logs] cluster=${DB_CLUSTER_ID} ts=${TS}"
echo "[preserve_logs] checking ${DRIVER_LOG_DIR}"

if [ ! -d "${DRIVER_LOG_DIR}" ]; then
  echo "[preserve_logs] no prior driver log dir, nothing to preserve"
  exit 0
fi

for f in stdout stderr log4j-active.log stacktrace.log; do
  src="${DRIVER_LOG_DIR}/${f}"
  dst="${DRIVER_LOG_DIR}/${f}.preserved-${TS}"
  if [ -f "${src}" ]; then
    cp "${src}" "${dst}" &amp;amp;&amp;amp; echo "[preserve_logs] preserved ${f} -&amp;gt; $(basename ${dst})"
  fi
done

echo "[preserve_logs] done"&lt;/LI-CODE&gt;
&lt;P&gt;Best regards,&lt;/P&gt;</description>
      <pubDate>Fri, 22 May 2026 16:58:37 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157513#M54575</guid>
      <dc:creator>aleksandra_ch</dc:creator>
      <dc:date>2026-05-22T16:58:37Z</dc:date>
    </item>
    <item>
      <title>Re: Inconsistent Cluster Log Persistence to Volume/S3 (stderr, stdout, log4j-active.log)</title>
      <link>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157705#M54605</link>
      <description>&lt;P&gt;Hi&amp;nbsp;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/102072"&gt;@aleksandra_ch&lt;/a&gt;,&lt;/P&gt;&lt;P&gt;Unfortunately, the &lt;EM&gt;&lt;STRONG&gt;Volumes&lt;/STRONG&gt;&lt;/EM&gt; directory is only accessible through the Databricks interface. At the OS level, it is not accessible, even when running as &lt;FONT color="#000000"&gt;&lt;STRONG&gt;root&lt;/STRONG&gt;&lt;/FONT&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;(FUSE limitation).&lt;/P&gt;&lt;P&gt;I have also observed intermittent periods where, for certain job runs, logs are not updated in Volumes at all, specifically they are not copied from&lt;SPAN&gt;&amp;nbsp;&lt;STRONG&gt;&lt;EM&gt;/databricks/driver/logs&lt;/EM&gt;&lt;/STRONG&gt;&lt;/SPAN&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;to &lt;EM&gt;&lt;STRONG&gt;Volumes&lt;/STRONG&gt;&lt;/EM&gt;.&lt;BR /&gt;To mitigate this, I implemented an alternative approach that performs direct log synchronization to S3 (the same backing location used by the Volume) at a predefined interval.&lt;/P&gt;&lt;P&gt;It would be very helpful to have either:&lt;/P&gt;&lt;OL&gt;&lt;LI&gt;A &lt;STRONG&gt;shutdown_script&lt;/STRONG&gt;&lt;SPAN&gt;&amp;nbsp;&lt;/SPAN&gt;feature as an alternative to &lt;STRONG&gt;init_script&lt;/STRONG&gt;, or&lt;/LI&gt;&lt;LI&gt;A built-in log synchronization process executed automatically before cluster termination.&lt;BR /&gt;&lt;BR /&gt;Please find the script used below:&lt;/LI&gt;&lt;/OL&gt;&lt;LI-CODE lang="javascript"&gt;#!/bin/bash
set -e

# ===============================================================
# 1. BASIC CONFIGURATION (S3)
# ===============================================================
S3_BUCKET="&amp;lt;&amp;lt;bucket&amp;gt;&amp;gt;"
S3_BASE_PREFIX="data/&amp;lt;&amp;lt;catalog&amp;gt;&amp;gt;/&amp;lt;&amp;lt;schemas&amp;gt;&amp;gt;/__unitystorage/schemas/&amp;lt;&amp;lt;schema_id&amp;gt;&amp;gt;/volumes/&amp;lt;&amp;lt;volume_id&amp;gt;&amp;gt;"

# Automatically fetch cluster metadata at startup
CLUSTER_ID="${DB_CLUSTER_ID:-unknown_cluster}"
DT=$(date +%Y-%m-%d)
HH=$(date +%H)
MM=$(date +%M)

# ===============================================================
# 2. CREATE THE PYTHON UTILITY FOR DIRECT S3 SYNC
# ===============================================================
cat &amp;lt;&amp;lt; 'EOF' &amp;gt; /usr/local/bin/sync_single_run.py
import boto3
import os
import sys

bucket_name = sys.argv[1]
base_prefix = sys.argv[2]
cluster_id = sys.argv[3]
dt = sys.argv[4]
hh = sys.argv[5]
mm = sys.argv[6]

s3_target_prefix = f"{base_prefix}/{cluster_id}/driver"
local_log_dir = "/databricks/driver/logs"
log_files = ["stdout", "stderr", "log4j-active.log", "stacktrace.log"]

s3 = boto3.client('s3')

def get_s3_name(file_name):
    if file_name == "log4j-active.log": return f"log4j-{dt}-{hh}-{mm}.log"
    if file_name == "stacktrace.log": return f"{dt}-{hh}-{mm}.stacktrace.log"
    if file_name == "stdout": return f"stdout--{dt}--{hh}-{mm}.log"
    if file_name == "stderr": return f"stderr--{dt}--{hh}-{mm}.log"
    return file_name

for file_name in log_files:
    local_path = os.path.join(local_log_dir, file_name)
    if os.path.exists(local_path) and os.path.getsize(local_path) &amp;gt; 0:
        s3_key = f"{s3_target_prefix}/{get_s3_name(file_name)}"
        try:
            s3.upload_file(local_path, bucket_name, s3_key)
        except:
            pass  # Ignore temporary errors to avoid blocking script execution
EOF

chmod +x /usr/local/bin/sync_single_run.py

# ===============================================================
# 3. CREATE AND START THE BASH DAEMON (LIVE SYNC WATCHER)
# ===============================================================
cat &amp;lt;&amp;lt; EOF &amp;gt; /tmp/run_daemon.sh
#!/bin/bash
# Let the cluster finish its initial boot phase before the first sync
sleep 30

while true; do
  python3 /usr/local/bin/sync_single_run.py "$S3_BUCKET" "$S3_BASE_PREFIX" "$CLUSTER_ID" "$DT" "$HH" "$MM" &amp;gt; /dev/null 2&amp;gt;&amp;amp;1
    sleep 300  # Run every 300 seconds
done
EOF

chmod +x /tmp/run_daemon.sh

# Launch the daemon in the background as an independent process
nohup /bin/bash /tmp/run_daemon.sh &amp;gt; /dev/null 2&amp;gt;&amp;amp;1 &amp;amp;

echo "The permanent Live Sync system (300s) with Databricks pattern was installed successfully!"&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 27 May 2026 05:50:10 GMT</pubDate>
      <guid>https://community.databricks.com/t5/data-engineering/inconsistent-cluster-log-persistence-to-volume-s3-stderr-stdout/m-p/157705#M54605</guid>
      <dc:creator>ccsalt</dc:creator>
      <dc:date>2026-05-27T05:50:10Z</dc:date>
    </item>
  </channel>
</rss>

