Machine Learning
Dive into the world of machine learning on the Databricks platform. Explore discussions on algorithms, model training, deployment, and more. Connect with ML enthusiasts and experts.

AutoGluon MLflow integration

cleversuresh
New Contributor III

I am working on a personalized price package recommendation and have implemented AutoGluon training code integrated with MLflow.

The code is written in a modular fashion so other team members can reuse it: they only need to pass the data, the target column, and an experiment name to create the experiment.

I keep running into problems when logging the model with MLflow; any help would be greatly appreciated.

This is my code:

class AutoGluonPyFuncWrapper(mlflow.pyfunc.PythonModel):
    """Wrapper for AutoGluon model to be logged as a PyFunc model in MLflow."""

    def __init__(self, model_path):
        self.model_path = model_path
        self.predictor = None  # Model will be loaded in predict method

    def load_context(self, context):
        """Loads the AutoGluon model when MLflow loads the PyFunc model."""
        self.predictor = TabularPredictor.load(self.model_path)

    def predict(self, context, model_input):
        """
        Predict probability scores for the given input.

        model_input: Pandas DataFrame
        Returns: Pandas DataFrame with probability scores
        """
        if isinstance(model_input, pd.DataFrame):
            predictions = self.predictor.predict_proba(model_input)
        else:
            predictions = self.predictor.predict_proba(pd.DataFrame(model_input))

        # Get the class label for positive class dynamically
        positive_class = predictions.columns[-1]  # Last column is usually the positive class
        return predictions[[positive_class]]  # Return only probability of positive class


class AutoGluonMLflowClassifier:
    def __init__(self, model_data: pd.DataFrame, target_col: str, experiment_name: str):
        """
        Initializes the classifier with the model DataFrame, target column, and MLflow experiment name.
        """
        self.model_data = model_data
        self.target_col = target_col
        self.experiment_name = experiment_name
        self.predictor = None
        self.train_predictions = None
        self.val_predictions = None
        self._initialize_mlflow()

    def _initialize_mlflow(self):
        """Sets up the MLflow experiment dynamically in Databricks."""

        # Define experiment path (store it under /Shared in the workspace)
        experiment_path = f"/Shared/automl_experiments/{self.experiment_name}"

        # Check if the experiment already exists
        experiment = mlflow.get_experiment_by_name(experiment_path)

        if experiment is None:
            # Create a new experiment if it does not exist
            experiment_id = mlflow.create_experiment(experiment_path)
            print(f"Created new MLflow experiment at: {experiment_path}")
        else:
            experiment_id = experiment.experiment_id
            print(f"Using existing MLflow experiment: {experiment_path}")

        # Set the experiment to use
        mlflow.set_experiment(experiment_path)

    def split_data(self):
        self.train_data, self.val_data = train_test_split(self.model_data, test_size=0.2, random_state=42)
        print(self.train_data.columns)

    def train_model(self, time_limit: int = 200):
        """Trains AutoGluon model and logs parameters, metrics, and artifacts in MLflow."""
        hyperparameters = {
            "GBM": {  # LightGBM
                "num_boost_round": 1000,   # More boosting rounds
                "learning_rate": 0.02,     # Lower learning rate for better generalization
                "num_leaves": 31,          # Leaf complexity
                "feature_fraction": 0.8,   # Feature bagging
                "bagging_fraction": 0.8,   # Sample bagging
                "bagging_freq": 5,         # Frequency of bagging
                "min_data_in_leaf": 20,    # Minimum samples per leaf
            },
            "XGB": {  # XGBoost
                "n_estimators": 1000,
                "learning_rate": 0.02,
                "max_depth": 6,            # Controls complexity
                "subsample": 0.8,          # Sample fraction per tree
                "colsample_bytree": 0.8,   # Feature bagging
                "gamma": 0.2,              # Regularization
                "lambda": 1,               # L2 regularization
            },
            "CAT": {  # CatBoost
                "iterations": 1000,
                "learning_rate": 0.02,
                "depth": 6,
                "l2_leaf_reg": 3,          # L2 regularization
                "border_count": 32,        # Number of bins for numeric features
            },
            "NN_TORCH": {  # Neural Network (PyTorch)
                "num_epochs": 100,         # Increase training epochs
                "learning_rate": 0.001,
                "dropout_prob": 0.1,       # Dropout regularization
                "weight_decay": 1e-5,      # L2 weight regularization
                "hidden_size": 256,        # Hidden layer size
            },
        }

        dbfs_model_path = "dbfs:/FileStore/automl/autogluon/"
        local_model_path = "/Shared/automl_experiments/autogluon_model/"

        with mlflow.start_run() as run:
            # Training AutoGluon model with AUC as the evaluation metric
            self.predictor = TabularPredictor(problem_type="binary",
                                              label=self.target_col,
                                              eval_metric="roc_auc",
                                              path=local_model_path) \
                .fit(self.train_data,
                     excluded_model_types=["KNN", "RF"],
                     hyperparameters=hyperparameters,
                     presets="best_quality",
                     num_bag_folds=3,
                     num_stack_levels=1,
                     time_limit=time_limit,
                     verbosity=1,   # Reduce logs
                     num_cpus=4,    # Limit CPU usage
                     num_gpus=0,
                     ag_args_fit={"num_cpus": 1, "num_gpus": 0})  # Ensure sequential training

            print(f"Model saved at: {local_model_path}")
            dbutils.fs.rm(dbfs_model_path, recurse=True)
            dbutils.fs.cp(f"file:{local_model_path}", dbfs_model_path, recurse=True)

            # Log dataset sizes
            mlflow.log_params({"training_data_size": self.train_data.shape[0],
                               "validation_data_size": self.val_data.shape[0]})

            # Making predictions on training and validation datasets
            self.train_predictions = self.predictor.predict_proba(self.train_data.drop(columns=[self.target_col])).iloc[:, -1]  # Probabilities for positive class
            self.val_predictions = self.predictor.predict_proba(self.val_data.drop(columns=[self.target_col])).iloc[:, -1]  # Probabilities for positive class
            print("Training predictions:", self.train_predictions)

            # Compute and log both training and validation metrics
            self.compute_metrics(self.train_data[self.target_col], self.train_predictions, "train")
            self.compute_metrics(self.val_data[self.target_col], self.val_predictions, "validation")

            print("Logging model to MLflow...")
            # Generate the model signature
            signature = infer_signature(model_input=self.train_data.drop(columns=[self.target_col]),
                                        model_output=self.train_predictions)

            model_wrapper = AutoGluonPyFuncWrapper(local_model_path)
            artifacts = {"predictor_path": dbfs_model_path}
            mlflow.pyfunc.log_model(artifact_path="model",
                                    python_model=model_wrapper,
                                    input_example=self.X_train[:2],
                                    signature=signature,
                                    artifacts=artifacts)

            self.run_id = run.info.run_id  # Store run ID
            print(f"Model logged successfully. Run ID: {self.run_id}")

            # Calculating classification report
            report = classification_report(self.val_data.drop(columns=[self.target_col]), self.val_predictions.round(), output_dict=True)
            mlflow.log_dict(report, "classification_report.json")

    # Define metric calculation function
    def compute_metrics(self, y_true, y_pred, prefix):
        """Computes and logs metrics with a specified prefix (train/validation)."""
        metrics = {
            f"{prefix}_auc": roc_auc_score(y_true, y_pred),
            f"{prefix}_average_precision": average_precision_score(y_true, y_pred),
            f"{prefix}_f1_score": f1_score(y_true, y_pred > 0.5),
            f"{prefix}_f2_score": fbeta_score(y_true, y_pred > 0.5, beta=2.0),
            f"{prefix}_brier_score": brier_score_loss(y_true, y_pred > 0.5),
            f"{prefix}_recall": recall_score(y_true, y_pred > 0.5),
            f"{prefix}_precision": precision_score(y_true, y_pred > 0.5),
        }
        for metric_name, value in metrics.items():
            mlflow.log_metric(metric_name, value)
        return metrics

    def evaluate_model(self):
        """Evaluate the model using AUC metric."""
        y_pred_proba = self.predictor.predict_proba(self.X_train).iloc[:, -1]
        auc_score = roc_auc_score(self.y_true, y_pred_proba)
        print(f"Model AUC: {auc_score:.4f}")
        return auc_score

    def run_pipeline(self):
        """Complete pipeline: data generation, training, evaluation, logging, and loading."""
        self.split_data()

        self.train_model()
        auc_score = self.evaluate_model()


from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

data = X.copy()
data['income'] = y['income']
data.head()
data['income'] = data['income'].replace({'<=50K.': '<=50K', '>50K.': '>50K'})
data['income'] = data['income'].replace({'<=50K': 0, '>50K': 1})
data['income'].value_counts()

# Example Usage:
classifier = AutoGluonMLflowClassifier(model_data=data,
                                       target_col="income",
                                       experiment_name="autogluon_sample_experiment")
classifier.run_pipeline()

1 ACCEPTED SOLUTION

stbjelcevic
Databricks Employee
Hi @cleversuresh 

Thanks for sharing the code and the context. Here are the core issues I see and how to fix them so MLflow logging works reliably on Databricks.

What’s breaking MLflow logging in your code

  • Your PyFunc wrapper loads the AutoGluon model from a local path rather than from the MLflow model’s packaged artifacts. In PythonModel.load_context, you must read any files from context.artifacts[...]. Otherwise, loading or serving the model will fail when that local path doesn’t exist in the target environment.

  • The input_example and signature inference are misaligned. You pass self.X_train[:2], but self.X_train is never defined; also input_example must match the schema you infer with infer_signature(model_input=..., model_output=...). Use a small slice of train_features (DataFrame with target dropped) for both signature and example.

  • classification_report arguments are incorrect. It expects y_true and y_pred (discrete labels), but you pass X as y_true and rounded probabilities as y_pred. Pass self.val_data[self.target_col] and (self.val_predictions > 0.5).astype(int) (or a tuned threshold) instead.

  • brier_score_loss expects probabilities, not thresholded predictions. Use the raw positive-class probabilities y_pred_proba (shape (n_samples,)) for Brier, not (y_pred > 0.5). If you need 0–1 range, set scale_by_half=True (binary default is usually auto).

  • evaluate_model uses undefined attributes (self.X_train, self.y_true). Use your stored train/validation splits and compute AUC with roc_auc_score(y_true, y_score) where y_score are positive-class probabilities.

  • The AutoGluon path pointing to /Shared/... is a workspace path, not a filesystem location. Use a real local/temp directory (for example via tempfile.mkdtemp()), then package it into MLflow model artifacts with artifacts={"ag_predictor": <local_dir>} and load with context.artifacts[...] in your PyFunc.

  • Make sure to set the MLflow experiment to a workspace path (like /Shared/...), which is supported on Databricks; if you want artifacts stored in UC Volumes, create the experiment with a UC volume artifact location.

  • Finally, ensure runtime dependencies (AutoGluon + its model backends, e.g., LightGBM, XGBoost, CatBoost) are present when loading/serving the model. Use conda_env or extra_pip_requirements in mlflow.pyfunc.log_model so MLflow reproduces the environment cleanly.

Here are some code patches:

1) Fix the PyFunc wrapper to read from packaged artifacts:

import mlflow
import pandas as pd
from mlflow.pyfunc import PythonModel
from autogluon.tabular import TabularPredictor

class AutoGluonPyFuncWrapper(PythonModel):
    """Wrapper for AutoGluon model to be logged as a PyFunc model in MLflow."""

    def __init__(self):
        self.predictor = None

    def load_context(self, context):
        # Load the predictor directory that was logged as an artifact
        predictor_dir = context.artifacts["ag_predictor"]
        self.predictor = TabularPredictor.load(predictor_dir)

    def predict(self, context, model_input):
        # Accept dict/list; convert to DataFrame
        if not isinstance(model_input, pd.DataFrame):
            model_input = pd.DataFrame(model_input)

        # Probability of the positive class
        proba_df = self.predictor.predict_proba(model_input)

        # Choose positive label robustly (prefer 1 if present)
        class_labels = list(proba_df.columns)
        pos_label = 1 if 1 in class_labels else class_labels[-1]
        return proba_df[pos_label]  # Pandas Series of positive-class probabilities

2) Log AutoGluon predictor directory as an MLflow artifact and align the signature

import tempfile
import mlflow
from mlflow.models.signature import infer_signature

# Choose a real local directory for AutoGluon training output
local_model_dir = tempfile.mkdtemp(prefix="ag_predictor_")

with mlflow.start_run() as run:
    # Train AutoGluon
    self.predictor = TabularPredictor(
        problem_type="binary",
        label=self.target_col,
        eval_metric="roc_auc",
        path=local_model_dir
    ).fit(
        self.train_data,
        excluded_model_types=["KNN", "RF"],
        hyperparameters=hyperparameters,
        presets="best_quality",
        num_bag_folds=3,
        num_stack_levels=1,
        time_limit=time_limit,
        verbosity=1,
        num_cpus=4,
        num_gpus=0,
        ag_args_fit={"num_cpus": 1, "num_gpus": 0}
    )

    # Compute train/val probabilities for metrics
    train_X = self.train_data.drop(columns=[self.target_col])
    val_X = self.val_data.drop(columns=[self.target_col])
    self.train_predictions = self.predictor.predict_proba(train_X).iloc[:, -1]
    self.val_predictions = self.predictor.predict_proba(val_X).iloc[:, -1]

    # Metrics (see patch 3 below)
    self.compute_metrics(self.train_data[self.target_col], self.train_predictions, "train")
    self.compute_metrics(self.val_data[self.target_col], self.val_predictions, "validation")

    # Signature and input_example must match the wrapper’s input/output
    input_example = train_X.head(2)
    signature = infer_signature(model_input=input_example, model_output=self.train_predictions.head(2))

    # Log PyFunc model and the trained predictor directory as artifact
    mlflow.pyfunc.log_model(
        artifact_path="model",
        python_model=AutoGluonPyFuncWrapper(),
        artifacts={"ag_predictor": local_model_dir},
        signature=signature,
        input_example=input_example,
        # Strongly recommended: pin pip requirements to include AutoGluon & backends
        extra_pip_requirements=[
            "mlflow>=2.8.0",  # adjust to your workspace runtime
            "autogluon.tabular>=1.1.0",  # pin your version
            "xgboost>=1.7.0",
            "lightgbm>=3.3.5",
            "catboost>=1.2"
        ],
    )

    self.run_id = run.info.run_id
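
As a quick sanity check after logging (a minimal sketch that reuses the run handle and the val_X frame from the patch above; adjust the names to your notebook), you can reload the PyFunc model and confirm that load_context resolves the packaged predictor outside the training code:

import mlflow

# Reload the model that was just logged in this run
model_uri = f"runs:/{run.info.run_id}/model"
loaded_model = mlflow.pyfunc.load_model(model_uri)

# Score a few validation rows; this exercises load_context(), so it verifies
# that the "ag_predictor" artifact was packaged with the model
sample_scores = loaded_model.predict(val_X.head(5))
print(sample_scores)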

3) Correct your metric logging

from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    fbeta_score,
    brier_score_loss,
    recall_score,
    precision_score,
    classification_report
)

def compute_metrics(self, y_true, y_pred_proba, prefix):
    # y_pred_proba: probabilities of positive class
    y_pred_bin = (y_pred_proba > 0.5).astype(int)

    metrics = {
        f"{prefix}_auc": roc_auc_score(y_true, y_pred_proba),
        f"{prefix}_average_precision": average_precision_score(y_true, y_pred_proba),
        f"{prefix}_f1_score": f1_score(y_true, y_pred_bin),
        f"{prefix}_f2_score": fbeta_score(y_true, y_pred_bin, beta=2.0),
        f"{prefix}_brier_score": brier_score_loss(y_true, y_pred_proba),
        f"{prefix}_recall": recall_score(y_true, y_pred_bin),
        f"{prefix}_precision": precision_score(y_true, y_pred_bin),
    }
    for k, v in metrics.items():
        mlflow.log_metric(k, float(v))
    return metrics

def log_classification_report(self):
    # Use validation set labels and thresholded predictions
    y_true = self.val_data[self.target_col]
    y_pred_bin = (self.val_predictions > 0.5).astype(int)
    report = classification_report(y_true, y_pred_bin, output_dict=True)
    mlflow.log_dict(report, "classification_report.json")

4) Fix evaluate_model to use your stored splits

def evaluate_model(self):
    # Use the validation set probabilities already computed
    auc_score = roc_auc_score(self.val_data[self.target_col], self.val_predictions)
    print(f"Model AUC (validation): {auc_score:.4f}")
    return auc_score

A few Databricks-specific practices to keep this robust

  • Set the workspace experiment path once (recommended): mlflow.set_experiment(f"/Shared/automl_experiments/{self.experiment_name}"). If you want to store artifacts in UC Volumes, create the experiment with an artifact location at a UC Volume path first, then set it active by path (see the sketch after this list).

  • Package all runtime deps with the model (pip/conda), especially AutoGluon and its tree learners. You can use extra_pip_requirements (shown above) or supply a conda_env dict if you prefer hard pinning Python and Conda channels (also shown in the sketch after this list).

  • Always load files via context.artifacts[...] in load_context. MLflow will download artifacts next to the model and pass you local paths at runtime; don’t assume workspace or DBFS paths exist when the model is rehydrated.

  • Align input_example with your signature and wrapper input type (DataFrame rows of features). Signature/input_example improves handoff, validation, and serving.
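
Here is a minimal sketch of the first two points above; the experiment name, the UC volume path (dbfs:/Volumes/main/default/ml_artifacts/autogluon), and the pinned versions are placeholders to replace with your own:

import mlflow

# Option A: create the experiment once with a UC Volumes artifact location,
# then activate it by workspace path as usual (placeholder names).
experiment_path = "/Shared/automl_experiments/autogluon_sample_experiment"
if mlflow.get_experiment_by_name(experiment_path) is None:
    mlflow.create_experiment(
        experiment_path,
        artifact_location="dbfs:/Volumes/main/default/ml_artifacts/autogluon",  # placeholder UC volume path
    )
mlflow.set_experiment(experiment_path)

# Option B: hard-pin the environment with a conda_env dict and pass it as
# mlflow.pyfunc.log_model(..., conda_env=conda_env) instead of extra_pip_requirements.
conda_env = {
    "name": "autogluon-serving",
    "channels": ["conda-forge"],
    "dependencies": [
        "python=3.10",  # match your cluster's Python version
        "pip",
        {"pip": [
            "mlflow>=2.8.0",
            "autogluon.tabular>=1.1.0",
            "xgboost>=1.7.0",
            "lightgbm>=3.3.5",
            "catboost>=1.2",
        ]},
    ],
}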


