<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Logging spark pipeline model using mlflow spark , leads to PythonSecurityException in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9136#M421</link>
    <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I am currently using a simple pyspark pipeline to transform my training data, fit a model, and log the model using mlflow.spark. But I get the following error (with mlflow.sklearn it works perfectly fine, but due to the size of my data I need to use the pyspark ml library):&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;org.apache.spark.api.python.PythonSecurityException: Path 'mlflowdbfs:/artifacts?run_id=d2ecf91f0&amp;amp;path=/best_model/sparkml/metadata' uses an untrusted filesystem 'com.databricks.mlflowdbfs.MlflowdbfsFileSystem', but your administrator has configured Spark to only allow trusted filesystems: (com.databricks.s3a.S3AFileSystem, shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystemHadoop3, shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem, shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem, shaded.databricks.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem, com.databricks.adl.AdlFileSystem, shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystemHadoop3, shaded.databricks.V2_1_4.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem, shaded.databricks.org.apache.hadoop.fs.azure.NativeAzureFileSystem, shaded.databricks.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemHadoop3, shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Here is the code that I use:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
from mlflow import spark
&amp;nbsp;
# Start an MLflow run and set experiment
with mlflow.start_run():
    mlflow.set_experiment("/Users/my-id/experiments")
&amp;nbsp;
    # Read in data from a CSV file
    data = spark.read.csv("dbfs:/FileStore/tables/data.csv", header=True, inferSchema=True)
&amp;nbsp;
    # Preprocess data
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol="features")
    pipeline = Pipeline(stages=[labelIndexer, assembler])
    preprocessedData = pipeline.fit(data).transform(data)
&amp;nbsp;
    # Split data into training and test sets
    (trainingData, testData) = preprocessedData.randomSplit([0.7, 0.3])
&amp;nbsp;
    # Define model and hyperparameters to tune
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .build()
&amp;nbsp;
    # Evaluate model using area under ROC
    evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", metricName="areaUnderROC")
&amp;nbsp;
    # Perform cross-validation to tune hyperparameters
    cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
    cvModel = cv.fit(trainingData)
&amp;nbsp;
    # Log model and its metrics
    mlflow.spark.log_model(spark_model=cvModel.bestModel, artifact_path="best_model")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Does anyone know how to solve this issue?&lt;/P&gt;&lt;P&gt;Thanks in advance!&lt;/P&gt;</description>
    <pubDate>Fri, 17 Feb 2023 16:02:08 GMT</pubDate>
    <dc:creator>Saeid_H</dc:creator>
    <dc:date>2023-02-17T16:02:08Z</dc:date>
    <item>
      <title>Logging spark pipeline model using mlflow spark , leads to PythonSecurityException</title>
      <link>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9136#M421</link>
      <description>&lt;P&gt;Hello,&lt;/P&gt;&lt;P&gt;I am currently using a simple pyspark pipeline to transform my training data, fit a model, and log the model using mlflow.spark. But I get the following error (with mlflow.sklearn it works perfectly fine, but due to the size of my data I need to use the pyspark ml library):&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;org.apache.spark.api.python.PythonSecurityException: Path 'mlflowdbfs:/artifacts?run_id=d2ecf91f0&amp;amp;path=/best_model/sparkml/metadata' uses an untrusted filesystem 'com.databricks.mlflowdbfs.MlflowdbfsFileSystem', but your administrator has configured Spark to only allow trusted filesystems: (com.databricks.s3a.S3AFileSystem, shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystemHadoop3, shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem, shaded.databricks.v20180920_b33d810.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem, shaded.databricks.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem, com.databricks.adl.AdlFileSystem, shaded.databricks.azurebfs.org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystemHadoop3, shaded.databricks.V2_1_4.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem, shaded.databricks.org.apache.hadoop.fs.azure.NativeAzureFileSystem, shaded.databricks.com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystemHadoop3, shaded.databricks.org.apache.hadoop.fs.s3a.S3AFileSystem)&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Here is the code that I use:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
from mlflow import spark
&amp;nbsp;
# Start an MLflow run and set experiment
with mlflow.start_run():
    mlflow.set_experiment("/Users/my-id/experiments")
&amp;nbsp;
    # Read in data from a CSV file
    data = spark.read.csv("dbfs:/FileStore/tables/data.csv", header=True, inferSchema=True)
&amp;nbsp;
    # Preprocess data
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
    assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol="features")
    pipeline = Pipeline(stages=[labelIndexer, assembler])
    preprocessedData = pipeline.fit(data).transform(data)
&amp;nbsp;
    # Split data into training and test sets
    (trainingData, testData) = preprocessedData.randomSplit([0.7, 0.3])
&amp;nbsp;
    # Define model and hyperparameters to tune
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features")
    paramGrid = ParamGridBuilder() \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .build()
&amp;nbsp;
    # Evaluate model using area under ROC
    evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", metricName="areaUnderROC")
&amp;nbsp;
    # Perform cross-validation to tune hyperparameters
    cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
    cvModel = cv.fit(trainingData)
&amp;nbsp;
    # Log model and its metrics
    mlflow.spark.log_model(spark_model=cvModel.bestModel, artifact_path="best_model")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Does anyone know how to solve this issue?&lt;/P&gt;&lt;P&gt;Thanks in advance!&lt;/P&gt;</description>
      <pubDate>Fri, 17 Feb 2023 16:02:08 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9136#M421</guid>
      <dc:creator>Saeid_H</dc:creator>
      <dc:date>2023-02-17T16:02:08Z</dc:date>
    </item>
    <item>
      <title>Re: Logging spark pipeline model using mlflow spark , leads to PythonSecurityException</title>
      <link>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9137#M422</link>
      <description>&lt;P&gt;@Saeid Hedayati​&amp;nbsp;:&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;The error message indicates that the mlflow.spark.log_model function is attempting to save the model metadata to an untrusted filesystem called com.databricks.mlflowdbfs.MlflowdbfsFileSystem, but Spark has been configured to only allow trusted filesystems.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;One potential solution to this issue is to explicitly set the filesystem type used by MLflow to a trusted filesystem like S3 or Azure Blob Storage. You can do this by setting the MLFLOW_EXPERIMENT_STORAGE environment variable to the desired filesystem type.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;For example, if you are using S3 as your artifact store, you can set the MLFLOW_EXPERIMENT_STORAGE environment variable as follows:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;import os
os.environ['MLFLOW_EXPERIMENT_STORAGE'] = 's3://my-bucket/mlflow'&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Replace my-bucket with the name of your S3 bucket and mlflow with the desired path in the bucket.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Alternatively, you can try saving the model metadata to a local filesystem instead of a DBFS path by specifying a local path for the artifact_uri parameter of the mlflow.start_run function:&lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;with mlflow.start_run(artifact_uri='/path/to/local/dir'):
    # ...
    mlflow.spark.log_model(spark_model=cvModel.bestModel, artifact_path="best_model")&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;Replace /path/to/local/dir with the path to a local directory where you want to save the model metadata.&lt;/P&gt;</description>
      <pubDate>Sun, 09 Apr 2023 15:16:26 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9137#M422</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-04-09T15:16:26Z</dc:date>
    </item>
    <item>
      <title>Re: Logging spark pipeline model using mlflow spark , leads to PythonSecurityException</title>
      <link>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9138#M423</link>
      <description>&lt;P&gt;Hi @Saeid Hedayati​&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you for posting your question in our community! We are happy to assist you.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;To help us provide you with the most accurate information, could you please take a moment to review the responses and select the one that best answers your question?&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This will also help other community members who may have similar questions in the future. Thank you for your participation and let us know if you need any further assistance!&amp;nbsp;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;</description>
      <pubDate>Fri, 21 Apr 2023 09:01:44 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/logging-spark-pipeline-model-using-mlflow-spark-leads-to/m-p/9138#M423</guid>
      <dc:creator>Anonymous</dc:creator>
      <dc:date>2023-04-21T09:01:44Z</dc:date>
    </item>
  </channel>
</rss>

