Hello, Databricks Community,
I am experiencing an issue while trying to serve a quantized model in GGUF format on Databricks Model Serving using the llama-cpp-python library.
The model is registered in Unity Catalog as an MLflow pyfunc model. It loads without any issues via mlflow.pyfunc.load_model, which suggests that the registration and initial configuration are correct.
The problem arises when creating the inference endpoint. Although the model is registered and loads correctly, I am unable to bring up the endpoint needed to serve predictions: the workers keep exiting with code 132, which as far as I can tell corresponds to 128 + 4, i.e., SIGILL (illegal CPU instruction). The following logs are produced:
[4cmcn] [2025-02-03 15:02:53 +0000] [12033] [INFO] Booting worker with pid: 12033
[4cmcn] [2025-02-03 15:02:55 +0000] [9] [ERROR] Worker (pid:12018) was sent code 132!
[4cmcn] [2025-02-03 15:02:55 +0000] [12045] [INFO] Booting worker with pid: 12045
[4cmcn] [2025-02-03 15:02:55 +0000] [9] [ERROR] Worker (pid:12027) was sent code 132!
[4cmcn] [2025-02-03 15:02:55 +0000] [12049] [INFO] Booting worker with pid: 12049
[4cmcn] [2025-02-03 15:02:56 +0000] [9] [ERROR] Worker (pid:12030) was sent code 132!
[4cmcn] [2025-02-03 15:02:56 +0000] [12062] [INFO] Booting worker with pid: 12062
[4cmcn] [2025-02-03 15:02:56 +0000] [9] [ERROR] Worker (pid:12033) was sent code 132!
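For context, the endpoint is created roughly along these lines (a minimal sketch using the Databricks Python SDK; the endpoint name, entity version, and workload settings are placeholders, not necessarily my exact configuration):

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import EndpointCoreConfigInput, ServedEntityInput

w = WorkspaceClient()

# Placeholder endpoint name and version; the served entity is the UC-registered model below.
w.serving_endpoints.create(
    name="granite-gguf-endpoint",
    config=EndpointCoreConfigInput(
        served_entities=[
            ServedEntityInput(
                entity_name="ml_lab.generativo.granite",
                entity_version="1",
                workload_size="Small",
                scale_to_zero_enabled=True,
            )
        ]
    ),
)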
Model registration and local test code:
%pip install tkmacosx>=1.0.5
%pip install pynput>=1.7.7
%pip install llama-cpp-python>=0.3.6
%pip install pyperclip>=1.9.0
%pip install transformers>=4.46.2
%pip install pygments>=2.19.1
%pip install cloudpickle>=3.1.1
%pip install mlflow>=2.20.1
from mlflow.models.signature import infer_signature
import mlflow
from typing import Generator, List, Dict, Any, Union, Tuple
from llama_cpp import Llama
from collections import deque
import os
from pathlib import Path
class ChatModelWrapper(mlflow.pyfunc.PythonModel):
def __init__(self):
self.model_path = None
self.model = None
def load_context(self, context):
self.model_path = "/Volumes/ml_lab/generativo/models/granite-3.1-3b-a800m-instruct-Q6_K.gguf"
self.model = Llama(
self.model_path,
n_ctx=8192,
verbose=False,
n_threads=8
)
def create_chat_completion(
self,
messages: List[Dict[str, str]],
temperature: float = 0.4,
top_p: float = 0.9,
top_k: int = 50,
repeat_penalty: float = 1.2,
max_tokens: int = 256
) -> Generator[str, None, None]:
"""Helper method to create chat completions with standard parameters"""
if self.model is None:
            raise ValueError("The Llama model has not been loaded yet (load_context must run first).")
output = ""
for chunk in self.model.create_chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repeat_penalty=repeat_penalty,
stream=True
):
content = chunk['choices'][0]['delta'].get('content')
if content:
if content in ["<end_action>", "<|endoftext|>"]:
break
output += content
yield output
    def process(self, messages):
        """Processes the messages and generates the response."""
response = ""
for chunk in self.create_chat_completion(messages, max_tokens=2024):
response = chunk
yield chunk
    def get_answer(self, messages):
        """Returns the final answer of the conversation."""
try:
return deque(self.process(messages), maxlen=1).pop()
except IndexError:
return ""
def predict(self, context, model_input: List[Dict[str,str]]) -> Dict[str,str]:
"""Gera respostas para multiplas entradas."""
return {"answer": self.get_answer(model_input)}
# The Llama model is loaded outside the prediction function
# This avoids reloading the model on every prediction and saves resources
messages = [{'role': 'system',
             'content': 'Você é um assistente que fala português e responde perguntas do usuario baseado no conteudo fornecido.'},
{'role': 'user', 'content': 'Oi'}]
signature = infer_signature(messages, {"answer": "Olá, tudo bem?"})
mlflow.set_registry_uri("databricks-uc")
with mlflow.start_run():
model_info = mlflow.pyfunc.log_model(
python_model=ChatModelWrapper(),
artifact_path="model",
registered_model_name="ml_lab.generativo.granite",
pip_requirements=["tkmacosx>=1.0.5",
"pynput>=1.7.7",
"llama-cpp-python>=0.3.6",
"pyperclip>=1.9.0",
"transformers>=4.46.2",
"pygments>=2.19.1"],
)
model = mlflow.pyfunc.load_model(model_info.model_uri)
print(model.predict(messages))
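If it helps narrow things down, I could also run a pre-deployment check along these lines to exercise the registered pyfunc in an isolated virtualenv built from the logged requirements (a sketch only; the version in the model URI is a placeholder, and I have not confirmed whether this reproduces the serving failure):

import mlflow

# Run the registered model in a separate virtualenv rather than in the notebook
# environment; version "1" is a placeholder for the version logged above.
mlflow.models.predict(
    model_uri="models:/ml_lab.generativo.granite/1",
    input_data=[{'role': 'system',
                 'content': 'Você é um assistente que fala português e responde perguntas do usuario baseado no conteudo fornecido.'},
                {'role': 'user', 'content': 'Oi'}],
    env_manager="virtualenv",
)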
I would like to know if there are any specific guidelines or known adjustments for serving llama-cpp-python / GGUF models on Databricks Model Serving that could resolve this. Any help in diagnosing the issue would be greatly appreciated.