<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Re: sparkxgbregressor and RandomForestRegressor not able to deploy for inferencing in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37545#M1957</link>
    <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/63081"&gt;@Kumaran&lt;/a&gt;&amp;nbsp;Thanks for the reply, Kumaran &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;BR /&gt;&lt;BR /&gt;The deployment was finally successful for the Random Forest algorithm, but it is still failing for sparkxgbregressor.&lt;BR /&gt;&lt;BR /&gt;Sharing the code snippet:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from xgboost.spark import SparkXGBRegressor
vec_assembler = VectorAssembler(inputCols=train_df.columns[1:], outputCol="features")
#rf = RandomForestRegressor(labelCol="price", maxBins=260, seed=42)
xgbr = SparkXGBRegressor(num_workers=1, label_col="price", missing=0.0)
pipeline = Pipeline(stages=[vec_assembler, xgbr])
regression_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price")
regression_evaluator2 = RegressionEvaluator(predictionCol="prediction", labelCol="price",metricName="r2")


def objective_function(params):    
    # set the hyperparameters that we want to tune
    max_depth = params["max_depth"]#rf,xgb
    #num_trees = params["num_trees"]#rf
    n_estimators = params["n_estimators"]#xgb

    with mlflow.start_run():
        #estimator = pipeline.copy({rf.maxDepth: max_depth, rf.numTrees: num_trees})#rf
        estimator = pipeline.copy({xgbr.max_depth: max_depth, xgbr.n_estimators: n_estimators})#xgbr
        model = estimator.fit(train_df)

        preds = model.transform(test_df)
        rmse = regression_evaluator.evaluate(preds)
        #r2 = regression_evaluator2.evaluate(preds)
        mlflow.log_metric("rmse", rmse)
        # mlflow.spark.log_model(model, "model",conda_env=mlflow.spark.get_default_conda_env())
        mlflow.spark.log_model(model, "model",conda_env=conda_env)

    return rmse

from hyperopt import hp
import numpy as np

search_space = {

    "max_depth" : hp.choice('max_depth', np.arange(5, 15, dtype=int)),

    "n_estimators": hp.choice('n_estimators', np.arange(70, 80, dtype=int))
}

from hyperopt import fmin, tpe, Trials
import numpy as np
import mlflow
import mlflow.spark
# mlflow.pyspark.ml.autolog(log_models=True)
mlflow.xgboost.autolog(log_models=True)
#mlflow.fastai.autolog(log_models=False)

num_evals = 1
trials = Trials()
best_hyperparam = fmin(fn=objective_function, 
                       space=search_space,
                       algo=tpe.suggest, 
                       max_evals=num_evals,
                       trials=trials,
                       rstate=np.random.default_rng(42))

# Retrain model on train &amp;amp; validation dataset and evaluate on test dataset
with mlflow.start_run():

    best_max_depth = best_hyperparam["max_depth"]#rf,xgb
    
    best_n_estimators = best_hyperparam["n_estimators"]#xgb
    estimator = pipeline.copy({xgbr.max_depth: best_max_depth, xgbr.n_estimators: best_n_estimators})#xgb


    pipeline_model = estimator.fit(train_df.limit(188123))
    pred_df = pipeline_model.transform(test_df)
    rmse = regression_evaluator.evaluate(pred_df)
    

    # Log param and metrics for the final model
    mlflow.log_param("maxDepth", best_max_depth)
    mlflow.log_param("n_estimators", best_n_estimators)
    
    mlflow.log_metric("rmse", rmse)
    &lt;/LI-CODE&gt;</description>
    <pubDate>Thu, 13 Jul 2023 07:25:05 GMT</pubDate>
    <dc:creator>raghagra</dc:creator>
    <dc:date>2023-07-13T07:25:05Z</dc:date>
    <item>
      <title>sparkxgbregressor and RandomForestRegressor not able to deploy for inferencing</title>
      <link>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37533#M1955</link>
      <description>&lt;P&gt;I have been trying to deploy Spark ML models from the experiment page via the UI, but the deployment gets aborted after a long run. Is there any particular reason why this might be happening? I have also taken care of the dependencies, but it is still failing.&lt;/P&gt;&lt;P&gt;Dependency code block:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;PRE&gt;conda_env={&lt;BR /&gt;"dependencies":&lt;BR /&gt;[&lt;BR /&gt;"python=3.10.9"&lt;BR /&gt;{&lt;BR /&gt;"pip":["xgboost","pyspark==3.4.0","pip&amp;lt;=21.2.4"],&lt;BR /&gt;},&lt;BR /&gt;],&lt;BR /&gt;}&lt;/PRE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Thu, 13 Jul 2023 04:29:02 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37533#M1955</guid>
      <dc:creator>raghagra</dc:creator>
      <dc:date>2023-07-13T04:29:02Z</dc:date>
    </item>
    <item>
      <title>Re: sparkxgbregressor and RandomForestRegressor not able to deploy for inferencing</title>
      <link>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37544#M1956</link>
      <description>&lt;P&gt;Hello &lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/84729"&gt;@raghagra&lt;/a&gt;&amp;nbsp;,&lt;/P&gt;&lt;P&gt;We appreciate your question posted in the Databricks community.&lt;/P&gt;&lt;P&gt;Without having a look at the code, it's difficult to determine the exact cause of the issue for the sparkxgbregressor. Could you kindly provide us with the code snippet that is causing this problem?&lt;/P&gt;&lt;P&gt;However, I recommend trying the Random Forest model with maxDepth set to its default value; this may mitigate the issue. According to the &lt;A href="https://api-docs.databricks.com/python/pyspark/latest/api/pyspark.ml.regression.RandomForestRegressor.html#pyspark.ml.regression.RandomForestRegressor.getMaxDepth" target="_self"&gt;documentation&lt;/A&gt;, the recommended value for maxDepth is 5.&lt;/P&gt;&lt;P&gt;Here's an example code snippet:&lt;/P&gt;&lt;P&gt;&lt;EM&gt;rfModel = RandomForestRegressor(featuresCol='features', labelCol=target, maxDepth=5, numTrees=50)&lt;/EM&gt;&lt;/P&gt;&lt;P&gt;Please give it a try and let us know if it helps resolve the issue.&lt;/P&gt;</description>
      <pubDate>Thu, 13 Jul 2023 06:54:23 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37544#M1956</guid>
      <dc:creator>Kumaran</dc:creator>
      <dc:date>2023-07-13T06:54:23Z</dc:date>
    </item>
    <item>
      <title>Re: sparkxgbregressor and RandomForestRegressor not able to deploy for inferencing</title>
      <link>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37545#M1957</link>
      <description>&lt;P&gt;&lt;a href="https://community.databricks.com/t5/user/viewprofilepage/user-id/63081"&gt;@Kumaran&lt;/a&gt;&amp;nbsp;Thanks for the reply, Kumaran &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;BR /&gt;&lt;BR /&gt;The deployment was finally successful for the Random Forest algorithm, but it is still failing for sparkxgbregressor.&lt;BR /&gt;&lt;BR /&gt;Sharing the code snippet:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from xgboost.spark import SparkXGBRegressor
vec_assembler = VectorAssembler(inputCols=train_df.columns[1:], outputCol="features")
#rf = RandomForestRegressor(labelCol="price", maxBins=260, seed=42)
xgbr = SparkXGBRegressor(num_workers=1, label_col="price", missing=0.0)
pipeline = Pipeline(stages=[vec_assembler, xgbr])
regression_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price")
regression_evaluator2 = RegressionEvaluator(predictionCol="prediction", labelCol="price",metricName="r2")


def objective_function(params):    
    # set the hyperparameters that we want to tune
    max_depth = params["max_depth"]#rf,xgb
    #num_trees = params["num_trees"]#rf
    n_estimators = params["n_estimators"]#xgb

    with mlflow.start_run():
        #estimator = pipeline.copy({rf.maxDepth: max_depth, rf.numTrees: num_trees})#rf
        estimator = pipeline.copy({xgbr.max_depth: max_depth, xgbr.n_estimators: n_estimators})#xgbr
        model = estimator.fit(train_df)

        preds = model.transform(test_df)
        rmse = regression_evaluator.evaluate(preds)
        #r2 = regression_evaluator2.evaluate(preds)
        mlflow.log_metric("rmse", rmse)
        # mlflow.spark.log_model(model, "model",conda_env=mlflow.spark.get_default_conda_env())
        mlflow.spark.log_model(model, "model",conda_env=conda_env)

    return rmse

from hyperopt import hp
import numpy as np

search_space = {

    "max_depth" : hp.choice('max_depth', np.arange(5, 15, dtype=int)),

    "n_estimators": hp.choice('n_estimators', np.arange(70, 80, dtype=int))
}

from hyperopt import fmin, tpe, Trials
import numpy as np
import mlflow
import mlflow.spark
# mlflow.pyspark.ml.autolog(log_models=True)
mlflow.xgboost.autolog(log_models=True)
#mlflow.fastai.autolog(log_models=False)

num_evals = 1
trials = Trials()
best_hyperparam = fmin(fn=objective_function, 
                       space=search_space,
                       algo=tpe.suggest, 
                       max_evals=num_evals,
                       trials=trials,
                       rstate=np.random.default_rng(42))

# Retrain model on train &amp;amp; validation dataset and evaluate on test dataset
with mlflow.start_run():

    best_max_depth = best_hyperparam["max_depth"]#rf,xgb
    
    best_n_estimators = best_hyperparam["n_estimators"]#xgb
    estimator = pipeline.copy({xgbr.max_depth: best_max_depth, xgbr.n_estimators: best_n_estimators})#xgb


    pipeline_model = estimator.fit(train_df.limit(188123))
    pred_df = pipeline_model.transform(test_df)
    rmse = regression_evaluator.evaluate(pred_df)
    

    # Log param and metrics for the final model
    mlflow.log_param("maxDepth", best_max_depth)
    mlflow.log_param("n_estimators", best_n_estimators)
    
    mlflow.log_metric("rmse", rmse)
    &lt;/LI-CODE&gt;</description>
      <pubDate>Thu, 13 Jul 2023 07:25:05 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/sparkxgbregressor-and-randomforestregressor-not-able-to-deploy/m-p/37545#M1957</guid>
      <dc:creator>raghagra</dc:creator>
      <dc:date>2023-07-13T07:25:05Z</dc:date>
    </item>
  </channel>
</rss>

