<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic PYSPARK - Can't run DecisionTreeClassifier every time in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/pyspark-can-t-run-decisiontreeclassifier-everytime/m-p/17041#M913</link>
    <description>&lt;P&gt;Hello community,&lt;/P&gt;&lt;P&gt;It's my first time here and my English is poor, so sorry for the mistakes &lt;span class="lia-unicode-emoji" title=":winking_face:"&gt;😉&lt;/span&gt;&lt;/P&gt;&lt;P&gt;I want to build a decision tree in PySpark on training data (then I would like to evaluate it on testing data). My target is a variable with 2 values ("one", "two").&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;So first, I use StringIndexer for my target.&lt;/P&gt;&lt;P&gt;Then, for each categorical column, I use StringIndexer and OneHotEncoder.&lt;/P&gt;&lt;P&gt;Then, for the quantitative columns, I just add them together with the transformed categorical columns in a VectorAssembler.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;After that, I create my pipeline (fit it on my FULL database and then transform it).&lt;/P&gt;&lt;P&gt;Just after these steps, I do a random split (training: 70%, testing: 30%), use the DecisionTreeClassifier on my training data, and evaluate my model on the testing data.&lt;/P&gt;&lt;P&gt;BUT: I don't know why, sometimes it works and sometimes it doesn't. It's very random and I can't figure it out.&lt;/P&gt;&lt;P&gt;This is my error log: &lt;/P&gt;&lt;P&gt;&lt;B&gt;&lt;I&gt;org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 6389.0 failed 4 times, most recent failure: Lost task 13.3 in stage 6389.0 (TID 85481) (10.0.3.8 executor 12): org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$1837/1490332726: (string) =&amp;gt; double)&lt;/I&gt;&lt;/B&gt;&lt;/P&gt;&lt;P&gt;It's the same thing for the tuning of my model...&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you in advance for your help!!! Have a nice day &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This is my code: &lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;&amp;nbsp;#MY VARIABLES :
categoricalColumns = ["var1","var2"]
numericCols= ["var3","var4"]
&amp;nbsp;
#MY TARGET :
stages = [] 
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
stages += [label_stringIdx]
&amp;nbsp;
#CATEGORICAL COLUMNS :
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "_Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "_Index", outputCol=categoricalCol + "_classVec")
    stages += [stringIndexer, encoder]
&amp;nbsp;
#ADD THE QUANTITATIVE COLUMNS :
assemblerInputs = [c + "_classVec" for c in categoricalColumns] + numericCols
&amp;nbsp;
#ASSEMBLER :
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
&amp;nbsp;
#PIPELINE
pipeline = Pipeline().setStages(stages)
pipeline_Fited = pipeline.fit(my_database)
my_database_transf= pipeline_Fited.transform(my_database)
cols = my_database.columns
selectedcols = ["label", "features"] + cols
final_dataset = my_database_transf.select(selectedcols)
&amp;nbsp;
#RANDOM SPLIT FOR TRAINING AND TESTING DATA
(trainingData, testingData) = final_dataset.randomSplit([0.7, 0.3], seed=100)
&amp;nbsp;
&amp;nbsp;
&amp;nbsp;
#CONSTRUCTION OF THE DECISION TREE ON THE TRAINING 
dt = DecisionTreeClassifier(labelCol="label",featuresCol="features",impurity='gini',maxDepth=4)
dtModel = dt.fit(trainingData) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
&amp;nbsp;
#EVALUATE THE MODEL ON THE TESTING DATA :
predict_test = dtModel.transform(testingData)
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") # Default metricName="areaUnderROC"
#evaluatorAUC.evaluate(dtModel), evaluatorAUC.evaluate(predict_test) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
print('Accuracy:', dtevaluator.evaluate(predict_test))
print('AUC:', BinaryClassificationMetrics(predict_test['label','prediction'].rdd).areaUnderROC)
&amp;nbsp;
&amp;nbsp;
#THE TUNING 
# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()\
             .addGrid(dt.maxDepth, [2, 3, 5])\
             .addGrid(dt.maxBins, [4,5,6,7,8])\
             .build())
&amp;nbsp;
# Evaluate model
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
&amp;nbsp;
# Create 5-fold CrossValidator
dtcv = CrossValidator(estimator = dt, 
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 5)
&amp;nbsp;
# Run cross validations
cv_dtModel = dtcv.fit(trainingData) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
&amp;nbsp;
# Prediction
predict_train = cv_dtModel.transform(trainingData)
predict_test = cv_dtModel.transform(testingData)
&amp;nbsp;
# Evaluate model
evaluatorAUC = BinaryClassificationEvaluator() # Default metricName="areaUnderROC"
evaluatorACC= MulticlassClassificationEvaluator(metricName="accuracy")
&amp;nbsp;
Best_DT_AUC = evaluatorAUC.evaluate(predict_train) #I ALWAYS HAVE THE PROBLEM HERE
Best_DT_ACC = evaluatorACC.evaluate(predict_test) #I ALWAYS HAVE THE PROBLEM HERE
&amp;nbsp;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;A&lt;/P&gt;</description>
    <pubDate>Tue, 13 Dec 2022 14:55:33 GMT</pubDate>
    <dc:creator>Alison7759</dc:creator>
    <dc:date>2022-12-13T14:55:33Z</dc:date>
    <item>
      <title>PYSPARK - Can't run DecisionTreeClassifier every time</title>
      <link>https://community.databricks.com/t5/machine-learning/pyspark-can-t-run-decisiontreeclassifier-everytime/m-p/17041#M913</link>
      <description>&lt;P&gt;Hello community,&lt;/P&gt;&lt;P&gt;It's my first time here and my English is poor, so sorry for the mistakes &lt;span class="lia-unicode-emoji" title=":winking_face:"&gt;😉&lt;/span&gt;&lt;/P&gt;&lt;P&gt;I want to build a decision tree in PySpark on training data (then I would like to evaluate it on testing data). My target is a variable with 2 values ("one", "two").&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;So first, I use StringIndexer for my target.&lt;/P&gt;&lt;P&gt;Then, for each categorical column, I use StringIndexer and OneHotEncoder.&lt;/P&gt;&lt;P&gt;Then, for the quantitative columns, I just add them together with the transformed categorical columns in a VectorAssembler.&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;After that, I create my pipeline (fit it on my FULL database and then transform it).&lt;/P&gt;&lt;P&gt;Just after these steps, I do a random split (training: 70%, testing: 30%), use the DecisionTreeClassifier on my training data, and evaluate my model on the testing data.&lt;/P&gt;&lt;P&gt;BUT: I don't know why, sometimes it works and sometimes it doesn't. It's very random and I can't figure it out.&lt;/P&gt;&lt;P&gt;This is my error log: &lt;/P&gt;&lt;P&gt;&lt;B&gt;&lt;I&gt;org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 6389.0 failed 4 times, most recent failure: Lost task 13.3 in stage 6389.0 (TID 85481) (10.0.3.8 executor 12): org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$1837/1490332726: (string) =&amp;gt; double)&lt;/I&gt;&lt;/B&gt;&lt;/P&gt;&lt;P&gt;It's the same thing for the tuning of my model...&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;Thank you in advance for your help!!! Have a nice day &lt;span class="lia-unicode-emoji" title=":slightly_smiling_face:"&gt;🙂&lt;/span&gt;&lt;/P&gt;&lt;P&gt;&lt;/P&gt;&lt;P&gt;This is my code: &lt;/P&gt;&lt;PRE&gt;&lt;CODE&gt;&amp;nbsp;#MY VARIABLES :
categoricalColumns = ["var1","var2"]
numericCols= ["var3","var4"]
&amp;nbsp;
#MY TARGET :
stages = [] 
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
stages += [label_stringIdx]
&amp;nbsp;
#CATEGORICAL COLUMNS :
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "_Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "_Index", outputCol=categoricalCol + "_classVec")
    stages += [stringIndexer, encoder]
&amp;nbsp;
#ADD THE QUANTITATIVE COLUMNS :
assemblerInputs = [c + "_classVec" for c in categoricalColumns] + numericCols
&amp;nbsp;
#ASSEMBLER :
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
&amp;nbsp;
#PIPELINE
pipeline = Pipeline().setStages(stages)
pipeline_Fited = pipeline.fit(my_database)
my_database_transf= pipeline_Fited.transform(my_database)
cols = my_database.columns
selectedcols = ["label", "features"] + cols
final_dataset = my_database_transf.select(selectedcols)
&amp;nbsp;
#RANDOM SPLIT FOR TRAINING AND TESTING DATA
(trainingData, testingData) = final_dataset.randomSplit([0.7, 0.3], seed=100)
&amp;nbsp;
&amp;nbsp;
&amp;nbsp;
#CONSTRUCTION OF THE DECISION TREE ON THE TRAINING 
dt = DecisionTreeClassifier(labelCol="label",featuresCol="features",impurity='gini',maxDepth=4)
dtModel = dt.fit(trainingData) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
&amp;nbsp;
#EVALUATE THE MODEL ON THE TESTING DATA :
predict_test = dtModel.transform(testingData)
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") # Default metricName="areaUnderROC"
#evaluatorAUC.evaluate(dtModel), evaluatorAUC.evaluate(predict_test) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
print('Accuracy:', dtevaluator.evaluate(predict_test))
print('AUC:', BinaryClassificationMetrics(predict_test['label','prediction'].rdd).areaUnderROC)
&amp;nbsp;
&amp;nbsp;
#THE TUNING 
# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()\
             .addGrid(dt.maxDepth, [2, 3, 5])\
             .addGrid(dt.maxBins, [4,5,6,7,8])\
             .build())
&amp;nbsp;
# Evaluate model
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
&amp;nbsp;
# Create 5-fold CrossValidator
dtcv = CrossValidator(estimator = dt, 
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 5)
&amp;nbsp;
# Run cross validations
cv_dtModel = dtcv.fit(trainingData) #SOMETIMES I HAVE THE PROBLEM HERE (BUT NOT EVERY TIME)
&amp;nbsp;
# Prediction
predict_train = cv_dtModel.transform(trainingData)
predict_test = cv_dtModel.transform(testingData)
&amp;nbsp;
# Evaluate model
evaluatorAUC = BinaryClassificationEvaluator() # Default metricName="areaUnderROC"
evaluatorACC= MulticlassClassificationEvaluator(metricName="accuracy")
&amp;nbsp;
Best_DT_AUC = evaluatorAUC.evaluate(predict_train) #I ALWAYS HAVE THE PROBLEM HERE
Best_DT_ACC = evaluatorACC.evaluate(predict_test) #I ALWAYS HAVE THE PROBLEM HERE
&amp;nbsp;&lt;/CODE&gt;&lt;/PRE&gt;&lt;P&gt;A&lt;/P&gt;</description>
      <pubDate>Tue, 13 Dec 2022 14:55:33 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/pyspark-can-t-run-decisiontreeclassifier-everytime/m-p/17041#M913</guid>
      <dc:creator>Alison7759</dc:creator>
      <dc:date>2022-12-13T14:55:33Z</dc:date>
    </item>
  </channel>
</rss>

