<?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" version="2.0">
  <channel>
    <title>topic Model Lineage with Feature Engineering is missing tables and notebooks in Machine Learning</title>
    <link>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63250#M3093</link>
    <description>&lt;P&gt;I am trying to track the lineage of a model and its tables using the FeatureEngineeringClient. The table lineage shows the relevant tables and notebooks, but the model lineage shows only the model, with no notebooks or tables. Here is my code:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;fe = FeatureEngineeringClient() 
def split_data():
    spark = SparkSession.builder.getOrCreate()
    catalog_name = config["catalog_name"]
    gold_layer = config["gold_layer_name"]
    silver_layer = config["silver_layer_name"]
    user_item_table_name = config["user_item_table_name"]
    ft_user_item_name = config["ft_user_item_name"]
    
    SEED = 4
    df_ratings = spark.table(f"{catalog_name}.{silver_layer}.{user_item_table_name}")

    table_name = f"{catalog_name}.{gold_layer}.{ft_user_item_name}"
    lookup_key = config["ft_user_item_pk"]
    label = config["label_col"]
    model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key)]

    # fe.create_training_set looks up features in model_feature_lookups that match the primary key from df_ratings
    fe_data = fe.create_training_set(df=df_ratings, feature_lookups=model_feature_lookups, label=label, exclude_columns=["rating_date_dayofmonth","rating_date_month"])
    df_data = fe_data.load_df()
    df_data = df_data.na.drop()
    
    (df_train, df_test) = df_data.randomSplit([0.75,0.25],SEED)
    print(f'full dataset: {df_data.count()}' ,f'Training: {df_train.count()}', f'test: {df_test.count()}\n')
    return (fe_data, df_data, df_train, df_test) 


with mlflow.start_run(run_name="ALS_final_model") as run:
        fe_full_data, df_full_data, df_train, df_test = split_data()
        als = ALS()
        als.setMaxIter(MAX_ITER)\
        .setSeed(SEED)\
        .setRegParam(best_params["REG_PARAM"])\
        .setUserCol(COL_USER)\
        .setItemCol(COL_ITEM)\
        .setRatingCol(COL_LABEL)\
        .setRank(best_params["RANK"])

        mlflow.log_param("MAX_ITER", MAX_ITER)
        mlflow.log_param("RANK", best_params["RANK"])
        mlflow.log_param("REG_PARAM", best_params["REG_PARAM"])

        model = als.fit(df_full_data)
        model.setColdStartStrategy('drop') 
        predictions = model.transform(df_full_data)

        model_info = fe.log_model(model=model, 
                    artifact_path = model_name,
                    flavor=mlflow.spark,
                    training_set=fe_full_data,
                    conda_env=mlflow.spark.get_default_conda_env(),
                    registered_model_name= f"{catalog_name}.feature_store.{model_name}"
                    )

        evaluator = RegressionEvaluator(predictionCol=COL_PRED, labelCol=COL_LABEL)
        rmse = evaluator.setMetricName("rmse").evaluate(predictions)
        mlflow.log_metric('rmse', rmse) &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Attached you can see the screenshot of my lineage graphs for the model and tables.&lt;/P&gt;&lt;P&gt;Any idea what could be the problem?&amp;nbsp;&lt;/P&gt;</description>
    <pubDate>Mon, 11 Mar 2024 14:59:47 GMT</pubDate>
    <dc:creator>MohsenJ</dc:creator>
    <dc:date>2024-03-11T14:59:47Z</dc:date>
    <item>
      <title>Model Lineage with Feature Engineering is missing tables and notebooks</title>
      <link>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63250#M3093</link>
      <description>&lt;P&gt;I am trying to track the lineage of a model and its tables using the FeatureEngineeringClient. The table lineage shows the relevant tables and notebooks, but the model lineage shows only the model, with no notebooks or tables. Here is my code:&lt;BR /&gt;&lt;BR /&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="python"&gt;fe = FeatureEngineeringClient() 
def split_data():
    spark = SparkSession.builder.getOrCreate()
    catalog_name = config["catalog_name"]
    gold_layer = config["gold_layer_name"]
    silver_layer = config["silver_layer_name"]
    user_item_table_name = config["user_item_table_name"]
    ft_user_item_name = config["ft_user_item_name"]
    
    SEED = 4
    df_ratings = spark.table(f"{catalog_name}.{silver_layer}.{user_item_table_name}")

    table_name = f"{catalog_name}.{gold_layer}.{ft_user_item_name}"
    lookup_key = config["ft_user_item_pk"]
    label = config["label_col"]
    model_feature_lookups = [FeatureLookup(table_name=table_name, lookup_key=lookup_key)]

    # fe.create_training_set looks up features in model_feature_lookups that match the primary key from df_ratings
    fe_data = fe.create_training_set(df=df_ratings, feature_lookups=model_feature_lookups, label=label, exclude_columns=["rating_date_dayofmonth","rating_date_month"])
    df_data = fe_data.load_df()
    df_data = df_data.na.drop()
    
    (df_train, df_test) = df_data.randomSplit([0.75,0.25],SEED)
    print(f'full dataset: {df_data.count()}' ,f'Training: {df_train.count()}', f'test: {df_test.count()}\n')
    return (fe_data, df_data, df_train, df_test) 


with mlflow.start_run(run_name="ALS_final_model") as run:
        fe_full_data, df_full_data, df_train, df_test = split_data()
        als = ALS()
        als.setMaxIter(MAX_ITER)\
        .setSeed(SEED)\
        .setRegParam(best_params["REG_PARAM"])\
        .setUserCol(COL_USER)\
        .setItemCol(COL_ITEM)\
        .setRatingCol(COL_LABEL)\
        .setRank(best_params["RANK"])

        mlflow.log_param("MAX_ITER", MAX_ITER)
        mlflow.log_param("RANK", best_params["RANK"])
        mlflow.log_param("REG_PARAM", best_params["REG_PARAM"])

        model = als.fit(df_full_data)
        model.setColdStartStrategy('drop') 
        predictions = model.transform(df_full_data)

        model_info = fe.log_model(model=model, 
                    artifact_path = model_name,
                    flavor=mlflow.spark,
                    training_set=fe_full_data,
                    conda_env=mlflow.spark.get_default_conda_env(),
                    registered_model_name= f"{catalog_name}.feature_store.{model_name}"
                    )

        evaluator = RegressionEvaluator(predictionCol=COL_PRED, labelCol=COL_LABEL)
        rmse = evaluator.setMetricName("rmse").evaluate(predictions)
        mlflow.log_metric('rmse', rmse) &lt;/LI-CODE&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;&lt;P&gt;Attached you can see the screenshot of my lineage graphs for the model and tables.&lt;/P&gt;&lt;P&gt;Any idea what could be the problem?&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Mon, 11 Mar 2024 14:59:47 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63250#M3093</guid>
      <dc:creator>MohsenJ</dc:creator>
      <dc:date>2024-03-11T14:59:47Z</dc:date>
    </item>
    <item>
      <title>Re: Model Lineage with Feature Engineering is missing tables and notebooks</title>
      <link>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63355#M3097</link>
      <description>&lt;P&gt;I just checked the feature_spec.yml file in the model registry and realized my feature tables are not tracked but only the final dataset.&amp;nbsp;&lt;/P&gt;&lt;LI-CODE lang="javascript"&gt;input_columns:
- user_id:
    data_type: int
    topological_ordering: 2
    source: training_data
- item_id:
    data_type: int
    topological_ordering: 0
    source: training_data
- timestamp:
    data_type: timestamp
    topological_ordering: 1
    source: training_data
workspace_id: '329425024234434367'
feature_store_client_version: 0.14.3
serialization_version:&lt;/LI-CODE&gt;</description>
      <pubDate>Tue, 12 Mar 2024 09:06:34 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63355#M3097</guid>
      <dc:creator>MohsenJ</dc:creator>
      <dc:date>2024-03-12T09:06:34Z</dc:date>
    </item>
    <item>
      <title>Re: Model Lineage with Feature Engineering is missing tables and notebooks</title>
      <link>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63402#M3100</link>
      <description>&lt;P&gt;OK, I realized something else: although I used FeatureEngineeringClient, the MLflow model artifact suggests I used&amp;nbsp;&lt;SPAN&gt;FeatureStoreClient. Please see the attachment.&amp;nbsp;&amp;nbsp;&lt;/SPAN&gt;&lt;/P&gt;&lt;P&gt;&amp;nbsp;&lt;/P&gt;</description>
      <pubDate>Tue, 12 Mar 2024 15:01:18 GMT</pubDate>
      <guid>https://community.databricks.com/t5/machine-learning/model-lineage-with-feature-engineering-is-missing-tables-and/m-p/63402#M3100</guid>
      <dc:creator>MohsenJ</dc:creator>
      <dc:date>2024-03-12T15:01:18Z</dc:date>
    </item>
  </channel>
</rss>

