4 weeks ago
In one of my projects, I needed to generate structured documentation for an entire directory of Databricks notebooks.
This solution uses the Databricks Workspace API together with a Serving Endpoint (LLM) to automatically create HTML documentation for each notebook, plus an index.html with links to all generated files.
Key points:
* Uses the current Databricks context token (dbutils) to call the Workspace API and Serving Endpoint.
* Exports notebook source code and sends it to an LLM that produces structured HTML.
* Stores the results in DBFS (or Workspace paths) with a common HTML template.
* Generates an index.html linking all generated documentation.
This approach helps data engineering teams keep up-to-date technical documentation of notebooks, with sections like objective, parameters, transformations, schemas, data quality notes, and even a "Solution Flow" diagram description.
# Databricks notebook source
# MAGIC %python
# --- Refactoring: document an ENTIRE DIRECTORY of notebooks ---
# Important notes:
# 1) This notebook uses the Databricks context token (dbutils) to call both the Workspace API and the LLM Serving Endpoint.
# 2) The HTML output is saved into DBFS (default: "/dbfs/tmp/doc_notebooks"). You may mount FileStore if you prefer to access via browser.
# 3) It also generates an index.html with links to all generated HTMLs.
# 4) The same HTML template is reused for all notebooks (loaded once from the Workspace).
import requests
import base64
import json
import os
import time
import re
from typing import List, Dict, Optional
# ===== Utilities =====
def _slugify_path(path: str) -> str:
"""Convert a workspace path into a safe file name."""
s = path.strip('/').replace('/','__')
s = re.sub(r"[^\w\-_.]+", "-", s)
return s
# ===== Classes =====
class DocumentNotebook:
    """Generate structured HTML documentation for one Databricks notebook.

    Exports the notebook source through the Workspace API, asks an LLM
    Serving Endpoint to produce structured HTML based on a shared template,
    and writes the result into ``dest_html_dir``.

    NOTE(review): relies on the global ``dbutils`` object, so instances can
    only be created inside a running Databricks notebook context.
    """

    def __init__(
        self,
        workspace_url: str,
        notebook_path: str,
        template_html_text: str,
        serving_endpoint: str,
        dest_html_dir: str = "/dbfs/tmp/doc_notebooks",
        max_tokens: int = 5000,
        request_timeout: float = 300.0,
    ):
        """Prepare auth headers and the output directory.

        Args:
            workspace_url: Base URL of the Databricks workspace (any trailing
                slash is stripped).
            notebook_path: Workspace path of the notebook to document.
            template_html_text: HTML template text embedded in the LLM prompt.
            serving_endpoint: Name of the LLM serving endpoint to invoke.
            dest_html_dir: Directory where the generated HTML file is written.
            max_tokens: Token budget for the LLM response.
            request_timeout: Per-request timeout in seconds for both the
                Workspace API export and the endpoint invocation. Without it,
                a stalled HTTP call would hang the job indefinitely.
        """
        self.workspace_url = workspace_url.rstrip('/')
        self.notebook_path = notebook_path
        self.template_html_text = template_html_text
        self.serving_endpoint = serving_endpoint
        self.dest_html_dir = dest_html_dir
        self.max_tokens = max_tokens
        self.request_timeout = request_timeout
        # Token of the current Databricks execution context.
        self.token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
        self.headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
        os.makedirs(self.dest_html_dir, exist_ok=True)
        # Raw notebook source; filled in by _export_source().
        self.decoded_source = ""

    # --- Workspace API ---
    def _export_source(self) -> None:
        """Fetch the notebook source via the Workspace export API.

        Stores the decoded UTF-8 source in ``self.decoded_source`` (empty
        string when the API returns no content).

        Raises:
            RuntimeError: If the export call returns a non-200 status.
        """
        resp = requests.get(
            f"{self.workspace_url}/api/2.0/workspace/export",
            headers=self.headers,
            params={"path": self.notebook_path, "format": "SOURCE"},
            timeout=self.request_timeout,  # fail fast instead of hanging
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Failed to export {self.notebook_path}: {resp.status_code} - {resp.text}")
        # The API returns the source base64-encoded in the "content" field.
        encoded = resp.json().get("content")
        self.decoded_source = base64.b64decode(encoded).decode("utf-8") if encoded else ""

    # --- HTML Generation via LLM ---
    def _call_llm_html(self) -> str:
        """Send the notebook source to the serving endpoint and return the HTML.

        Raises:
            RuntimeError: If the endpoint returns a non-200 status.
            ValueError: If the response does not contain an ``<html`` tag.
        """
        endpoint = f"{self.workspace_url}/serving-endpoints/{self.serving_endpoint}/invocations"
        prompt = {
            "messages": [
                {"role": "system", "content": "You are a technical assistant specialized in Databricks notebooks and structured HTML documentation."},
                {
                    "role": "user",
                    "content": f"""Analyze the following notebook (path: {self.notebook_path}):\n\n{self.decoded_source}\n\nExtract if available: configurations, version history, parameters, technical notes, transformations, data volume, schema, CTE query, main query, Data Quality details, objective, and a short summary.\n\nThen generate a structured technical HTML with:\n- Header\n- Sections: Objective, Parameters, Transformations, Data Volume, Technical Notes, LLM Generation Info, Schema (if available)\n- A new section called "Solution Flow" containing an HTML description inspired by a technical diagram or architecture\n- Footer with the organization (your organization / Data&AI / Data Engineering)\n\nUse the following HTML template as base:\n{self.template_html_text}\n\nReturn **only** the full HTML as response."""
                }
            ],
            "max_tokens": self.max_tokens
        }
        resp = requests.post(endpoint, headers=self.headers, json=prompt, timeout=self.request_timeout)
        if resp.status_code != 200:
            raise RuntimeError(f"Endpoint {self.serving_endpoint} invocation failed: {resp.status_code} - {resp.text}")
        rj = resp.json()
        # OpenAI-compatible response shape: choices[0].message.content.
        html = rj.get("choices", [{}])[0].get("message", {}).get("content", "")
        if not html or "<html" not in html.lower():
            raise ValueError("LLM response does not contain valid HTML.")
        return html

    def generate(self) -> str:
        """Generate the HTML documentation for this notebook.

        Returns:
            The path of the written HTML file inside ``dest_html_dir``.
        """
        self._export_source()
        html = self._call_llm_html()
        fname = _slugify_path(self.notebook_path) + "_doc.html"
        fpath = os.path.join(self.dest_html_dir, fname)
        with open(fpath, "w", encoding="utf-8") as f:
            f.write(html)
        print(f"Generated: {fpath}")
        return fpath
class DocumentDirectory:
    """Document every notebook under a workspace directory.

    Lists notebooks via the Workspace API (optionally recursing into
    subdirectories), delegates per-notebook HTML generation to
    :class:`DocumentNotebook`, and finally writes an ``index.html`` linking
    all generated files.

    NOTE(review): relies on the global ``dbutils`` object, so instances can
    only be created inside a running Databricks notebook context.
    """

    def __init__(
        self,
        workspace_url: str,
        root_dir: str,
        template_path: str,
        serving_endpoint: str,
        dest_html_dir: str = "/dbfs/tmp/doc_notebooks",
        recursive: bool = True,
        pause_seconds: float = 0.5,
        regex_filter_paths: Optional[str] = None,
        max_tokens_per_notebook: int = 5000,
        request_timeout: float = 60.0,
    ):
        """Prepare auth headers, the output directory, and the shared template.

        Args:
            workspace_url: Base URL of the Databricks workspace (any trailing
                slash is stripped).
            root_dir: Workspace directory whose notebooks are documented.
            template_path: Workspace path of the HTML template (loaded once).
            serving_endpoint: Name of the LLM serving endpoint.
            dest_html_dir: Directory where HTML files and index.html go.
            recursive: Whether to descend into subdirectories.
            pause_seconds: Pause between notebooks, to avoid hammering the
                serving endpoint.
            regex_filter_paths: Optional regex; only notebook paths matching
                it (via ``re.search``) are documented.
            max_tokens_per_notebook: Token budget per LLM call.
            request_timeout: Timeout in seconds for this class's own Workspace
                API calls (template load and directory listing); prevents a
                stalled HTTP call from hanging the job.
        """
        self.workspace_url = workspace_url.rstrip('/')
        self.root_dir = root_dir.rstrip('/')
        self.template_path = template_path
        self.serving_endpoint = serving_endpoint
        self.dest_html_dir = dest_html_dir
        self.recursive = recursive
        self.pause_seconds = pause_seconds
        # Pre-compile the filter once; None means "accept everything".
        self.regex_filter_paths = re.compile(regex_filter_paths) if regex_filter_paths else None
        self.max_tokens = max_tokens_per_notebook
        self.request_timeout = request_timeout
        # Token of the current Databricks execution context.
        self.token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()
        self.headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"}
        os.makedirs(self.dest_html_dir, exist_ok=True)
        # Load template HTML ONCE and reuse it for every notebook.
        self.template_html_text = self._load_template_html()

    # --- Workspace API ---
    def _load_template_html(self) -> str:
        """Export the template file from the workspace and return its text.

        Raises:
            RuntimeError: If the export call returns a non-200 status.
        """
        resp = requests.get(
            f"{self.workspace_url}/api/2.0/workspace/export",
            headers=self.headers,
            params={"path": self.template_path, "format": "SOURCE"},
            timeout=self.request_timeout,  # fail fast instead of hanging
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Error loading template {self.template_path}: {resp.status_code} - {resp.text}")
        encoded = resp.json().get("content")
        return base64.b64decode(encoded).decode("utf-8") if encoded else ""

    def _list_dir(self, path: str) -> List[Dict]:
        """Return the workspace objects directly under ``path``.

        Raises:
            RuntimeError: If the list call returns a non-200 status.
        """
        resp = requests.get(
            f"{self.workspace_url}/api/2.0/workspace/list",
            headers=self.headers,
            params={"path": path},
            timeout=self.request_timeout,  # fail fast instead of hanging
        )
        if resp.status_code != 200:
            raise RuntimeError(f"Error listing {path}: {resp.status_code} - {resp.text}")
        return resp.json().get("objects", [])

    def _collect_notebooks(self, path: str) -> List[str]:
        """Return the sorted paths of all notebooks under ``path``.

        Applies ``regex_filter_paths`` (if set) and recurses into
        subdirectories when ``self.recursive`` is True.
        """
        paths: List[str] = []
        items = self._list_dir(path)
        for it in items:
            otype = it.get("object_type")
            opath = it.get("path")
            if otype == "NOTEBOOK":
                # Skip notebooks that do not match the optional filter.
                if self.regex_filter_paths and not self.regex_filter_paths.search(opath):
                    continue
                paths.append(opath)
            elif otype == "DIRECTORY" and self.recursive:
                paths.extend(self._collect_notebooks(opath))
        return sorted(paths)

    def _create_index(self, records: List[Dict[str, str]]) -> str:
        """Create an index.html with links to all generated files.

        Args:
            records: Dicts with ``notebook_path`` and ``html_path`` keys.

        Returns:
            The path of the written index.html.
        """
        lines = [
            "<html><head><meta charset='utf-8'><title>Notebook Documentation</title></head><body>",
            "<h1>Notebook Documentation</h1>",
            f"<p>Root directory: <code>{self.root_dir}</code></p>",
            "<ul>"
        ]
        for r in records:
            # Links are relative (basename only) so the directory is portable.
            lines.append(f"<li><a href='{os.path.basename(r['html_path'])}'>{r['notebook_path']}</a></li>")
        lines.append("</ul></body></html>")
        content = "\n".join(lines)
        index_path = os.path.join(self.dest_html_dir, "index.html")
        with open(index_path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Index generated: {index_path}")
        return index_path

    def run(self) -> Dict[str, str]:
        """Document all notebooks under ``root_dir`` and build the index.

        Failures on individual notebooks are logged and skipped so one bad
        notebook does not abort the whole run.

        Returns:
            Dict with ``index_html`` (path) and ``count`` (number of
            successfully documented notebooks, as a string).
        """
        notebooks = self._collect_notebooks(self.root_dir)
        print(f"Found {len(notebooks)} notebooks in {self.root_dir} (recursive={self.recursive}).")
        records = []
        for i, nb_path in enumerate(notebooks, start=1):
            try:
                print(f"[{i}/{len(notebooks)}] Documenting: {nb_path}")
                generator = DocumentNotebook(
                    workspace_url=self.workspace_url,
                    notebook_path=nb_path,
                    template_html_text=self.template_html_text,
                    serving_endpoint=self.serving_endpoint,
                    dest_html_dir=self.dest_html_dir,
                    max_tokens=self.max_tokens,
                )
                html_path = generator.generate()
                records.append({"notebook_path": nb_path, "html_path": html_path})
                # Throttle between notebooks to be gentle on the endpoint.
                time.sleep(self.pause_seconds)
            except Exception as e:
                # Best-effort: report and continue with the next notebook.
                print(f"ERROR processing {nb_path}: {e}")
        index_path = self._create_index(records)
        return {"index_html": index_path, "count": str(len(records))}
# ======================
# ===== USAGE EXAMPLE =====
# Adjust parameters according to your environment:
# ======================
if __name__ == "__main__":
    # All runtime settings in one place; unpacked into the constructor below.
    settings = dict(
        workspace_url="https://your_workspace.databricks.com",
        root_dir="/your_root_dir",
        template_path="/your_template_path/template.html",
        serving_endpoint="databricks-claude-3-7-sonnet",
        dest_html_dir="/your_dest_dir",
        recursive=True,
        pause_seconds=0.6,
        regex_filter_paths=None,  # e.g. r"nb_write_.*|etl_.*"
        max_tokens_per_notebook=5000,
    )
    summary = DocumentDirectory(**settings).run()
    print("Summary:", summary)
4 weeks ago
Suggestions are always welcome — I hope this helps anyone looking to automate notebook documentation in Databricks.
4 weeks ago
Suggestions are always welcome — I hope this helps anyone looking to automate notebook documentation in Databricks.
Passionate about hosting events and connecting people? Help us grow a vibrant local community—sign up today to get started!
Sign Up Now