I'm not sure whether this is the right place to report it, but we've encountered a bug in datasets.py
(https://github.com/mlflow/mlflow/blob/master/mlflow/recipes/steps/ingest/datasets.py). Anyone using recipes should beware of the aforementioned bug.
def _convert_spark_df_to_pandas(self, spark_df):
    """Convert a Spark DataFrame to pandas, parsing date columns as datetimes.

    Args:
        spark_df: Object exposing ``schema.fields`` (each field having
            ``name`` and ``dataType``) and ``toPandas()``; expected to be
            a Spark DataFrame.

    Returns:
        The pandas DataFrame produced by ``spark_df.toPandas()``, with every
        date-typed column passed through ``pd.to_datetime(errors="coerce")``
        so that unparseable values become ``NaT``.
    """
    import pandas as pd

    # Detect date columns via the type's canonical name instead of
    # str(field.dataType): the string form differs between PySpark versions
    # ("DateType" vs "DateType()"), so a string comparison can silently match
    # nothing and leave date columns unconverted.
    datetime_cols = [
        field.name
        for field in spark_df.schema.fields
        if field.dataType.typeName() == "date"
    ]
    pandas_df = spark_df.toPandas()
    # Nothing to convert when the schema has no date columns.
    if datetime_cols:
        pandas_df[datetime_cols] = pandas_df[datetime_cols].apply(
            pd.to_datetime, errors="coerce"
        )
    return pandas_df