<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic How to implement early stop in SparkXGBRegressor with Pipeline? in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/how-to-implement-early-stop-in-sparkxgbregressor-with-pipeline/m-p/75852#M3390</link>
    <description>&lt;P&gt;Trying to implement an Early Stopping mechanism&amp;nbsp;in SparkXGBRegressor model with Pipeline:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline, PipelineModel
from xgboost.spark import SparkXGBRegressor
from xgboost.callback import EarlyStopping

assembler = VectorAssembler() \
    .setInputCols(relevant_model_cols) \
    .setOutputCol("features") \
    .setHandleInvalid("keep")

early_stop = EarlyStopping(
    rounds=5,
    min_delta=1e-3,
    save_best=True,
    maximize=True,
    data_name='validation_0',
    metric_name="auc",
)

xgboost_regressor = SparkXGBRegressor()
xgboost_regressor.setParams(
    gamma=0.2,
    max_depth=6,
    objective="reg:logistic",       # logistic regression, output probability
    missing=MISSING_VALUE_NUM_DEFAULT,
    num_workers=60,
    subsample=0.5,
    colsample_bytree=0.7,
    learning_rate=0.01,
    random_state=1234,
    reg_alpha=0.35,
    reg_lambda=0.3,
    n_estimators=50,
    eval_metric='auc',
    callbacks=[early_stop]
    )

pipeline = (
    Pipeline()
    .setStages([assembler,
                xgboost_regressor])
)

trained_model = pipeline.fit(train_dataset)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;But I get the error:&lt;/P&gt;&lt;P&gt;Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.&lt;/P&gt;&lt;P&gt;The same happens with a small dataset.&lt;/P&gt;&lt;P&gt;I also tried to use:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;xgboost_regressor.setParams(
    early_stopping_rounds=10,
    validation_indicator_col='validation_0')&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Wed, 26 Jun 2024 12:58:38 GMT</pubDate>
    <dc:creator>bbashuk</dc:creator>
    <dc:date>2024-06-26T12:58:38Z</dc:date>
    <item>
      <title>How to implement early stop in SparkXGBRegressor with Pipeline?</title>
      <link>https://community.databricks.com/t5/machine-learning/how-to-implement-early-stop-in-sparkxgbregressor-with-pipeline/m-p/75852#M3390</link>
      <description>&lt;P&gt;Trying to implement an Early Stopping mechanism&amp;nbsp;in SparkXGBRegressor model with Pipeline:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline, PipelineModel
from xgboost.spark import SparkXGBRegressor
from xgboost.callback import EarlyStopping

assembler = VectorAssembler() \
    .setInputCols(relevant_model_cols) \
    .setOutputCol("features") \
    .setHandleInvalid("keep")

early_stop = EarlyStopping(
    rounds=5,
    min_delta=1e-3,
    save_best=True,
    maximize=True,
    data_name='validation_0',
    metric_name="auc",
)

xgboost_regressor = SparkXGBRegressor()
xgboost_regressor.setParams(
    gamma=0.2,
    max_depth=6,
    objective="reg:logistic",       # logistic regression, output probability
    missing=MISSING_VALUE_NUM_DEFAULT,
    num_workers=60,
    subsample=0.5,
    colsample_bytree=0.7,
    learning_rate=0.01,
    random_state=1234,
    reg_alpha=0.35,
    reg_lambda=0.3,
    n_estimators=50,
    eval_metric='auc',
    callbacks=[early_stop]
    )

pipeline = (
    Pipeline()
    .setStages([assembler,
                xgboost_regressor])
)

trained_model = pipeline.fit(train_dataset)&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;But I get the error:&lt;/P&gt;&lt;P&gt;Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.&lt;/P&gt;&lt;P&gt;The same happens with a small dataset.&lt;/P&gt;&lt;P&gt;I also tried to use:&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;xgboost_regressor.setParams(
    early_stopping_rounds=10,
    validation_indicator_col='validation_0')&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Jun 2024 12:58:38 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/how-to-implement-early-stop-in-sparkxgbregressor-with-pipeline/m-p/75852#M3390</guid>
      <dc:creator>bbashuk</dc:creator>
      <dc:date>2024-06-26T12:58:38Z</dc:date>
    </item>
    <item>
      <title>Re: How to implement early stop in SparkXGBRegressor with Pipeline?</title>
      <link>https://community.databricks.com/t5/machine-learning/how-to-implement-early-stop-in-sparkxgbregressor-with-pipeline/m-p/75868#M3392</link>
      <description>&lt;P&gt;Ok, I finally solved it -&amp;nbsp;added a column to the dataset validation_indicator_col='validation_0', and did not pass it to the VectorAssembler:&lt;/P&gt;&lt;LI-CODE lang="python"&gt;xgboost_regressor = SparkXGBRegressor()
xgboost_regressor.setParams(
    gamma=0.2,
    max_depth=6,
    objective="reg:logistic",       # logistic regression, output probability
    missing=MISSING_VALUE_NUM_DEFAULT,
    num_workers=60,
    subsample=0.5,
    colsample_bytree=0.7,
    learning_rate=0.01,
    random_state=1234,
    reg_alpha=0.35,
    reg_lambda=0.3,
    n_estimators=600,
    eval_metric='auc',
    early_stopping_rounds=5,
    validation_indicator_col='validation_0',
    maximize=True,
    verbose=True,
    )&lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Wed, 26 Jun 2024 15:30:39 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/how-to-implement-early-stop-in-sparkxgbregressor-with-pipeline/m-p/75868#M3392</guid>
      <dc:creator>bbashuk</dc:creator>
      <dc:date>2024-06-26T15:30:39Z</dc:date>
    </item>
  </channel>
</rss>

