Requirements¶
Python version >= 3.7.3
DataRobot API version >= 2.22.1.
For additional information, reference the Python package documentation.
This code example uses the MADELON dataset from this paper. It can also be found here.
Import libraries and connect to DataRobot¶
Read more about different options for connecting to DataRobot from the client.
import datarobot as dr
import numpy as np
import pandas as pd
# If the config file is not in the default location described in the API Quickstart guide, '~/.config/datarobot/drconfig.yaml', then you will need to call
# dr.Client(config_path='path-to-drconfig.yaml')
FIRE feature selection function¶
def feature_importance_rank_ensembling(
    project,
    n_models=5,
    metric=None,
    by_partition="validation",
    feature_list_name=None,
    ratio=0.95,
    model_search_params=None,
    use_ranks=True,
):
    """
    Run one iteration of Feature Importance Rank Ensembling (FIRE) and restart Autopilot.

    Aggregates Feature Impact across the top ``n_models`` leaderboard models,
    builds a reduced feature list that retains ``ratio`` of the total
    unnormalized impact, and re-runs Autopilot on that list.

    Parameters:
    -----------
    project: DR project object
    n_models: int, get top N best models on the leaderboard to compute feature impact on. Default 5
    metric: str, DR metric to check performance against. Default None. If None, the DR project-defined metric is used
    by_partition: str, whether to use 'validation' or 'crossValidation' partition to get the best model on. Default 'validation'
    feature_list_name: str, name of the feature list to start iterating from. Default None
    ratio: float, ratio of total feature impact that the new feature list will contain. Default 0.95
    model_search_params: dict, dictionary of parameters to search the best model. See official DR python api docs. Default None
    use_ranks: bool, True to use median rank aggregation or False to use total impact unnormalized. Default True

    Returns:
    -----------
    dr.Model object: the best model from BEFORE the new Autopilot run,
    so the caller can compare it against the post-run leaderboard.
    """
    models = get_best_models(
        project,
        metric=metric,
        by_partition=by_partition,
        start_featurelist_name=feature_list_name,
        model_search_params=model_search_params,
    )
    models = models.values[:n_models]
    all_impact = pd.DataFrame()
    print("Request Feature Impact calculations")
    # First, kick off all Feature Impact requests and let DataRobot handle parallelizing
    for model in models:
        try:
            model.request_feature_impact()
        except Exception as exc:
            # Best-effort: the request commonly fails because Feature Impact
            # was already computed for this model. Log instead of a silent
            # bare `except: pass`, which would also swallow KeyboardInterrupt.
            print(f"Feature Impact request skipped for model {model.id}: {exc}")
    for model in models:
        # Allow time for DataRobot to compute Feature Impact (up to 15 min)
        feature_impact = pd.DataFrame(
            model.get_or_request_feature_impact(max_wait=60 * 15)
        )
        # Track model name and ID for auditing purposes
        feature_impact["model_type"] = model.model_type
        feature_impact["model_id"] = model.id
        # By sorting and re-indexing, the new index becomes our 'ranking'
        feature_impact = feature_impact.sort_values(
            by="impactUnnormalized", ascending=False
        ).reset_index(drop=True)
        feature_impact["rank"] = feature_impact.index.values
        # Add to the master list of all models' feature ranks
        all_impact = pd.concat([all_impact, feature_impact], ignore_index=True)
    # You need to get a threshold number of features to select.
    # The threshold is based on the cumulative sum of impact
    all_impact_agg = (
        all_impact.groupby("featureName")[["impactNormalized", "impactUnnormalized"]]
        .sum()
        .sort_values("impactUnnormalized", ascending=False)
        .reset_index()
    )
    # Calculate cumulative Feature Impact and take the first features that possess <ratio> of total impact
    all_impact_agg["impactCumulative"] = all_impact_agg["impactUnnormalized"].cumsum()
    total_impact = all_impact_agg["impactCumulative"].max() * ratio
    tmp_fl = list(
        set(
            all_impact_agg[all_impact_agg.impactCumulative <= total_impact][
                "featureName"
            ].values.tolist()
        )
    )
    # The number of features to use
    n_feats = len(tmp_fl)
    if use_ranks:
        # Get the top features based on median rank across models.
        # Select the 'rank' column explicitly so .median() never tries to
        # aggregate the string columns (model_type/model_id), which raises
        # a TypeError on pandas >= 2.0.
        top_ranked_feats = list(
            all_impact.groupby("featureName")["rank"]
            .median()
            .sort_values()
            .head(n_feats)
            .index.values
        )
    else:
        # Otherwise, get features based just on the total unnormalized feature impact
        top_ranked_feats = list(all_impact_agg.featureName.values[:n_feats])
    # Create a new feature list
    featurelist = project.create_modeling_featurelist(
        f"Reduced FL by Median Rank, top{n_feats}", top_ranked_feats
    )
    featurelist_id = featurelist.id
    # Start Autopilot on the reduced feature list and block until it finishes
    print("Starting AutoPilot on a reduced feature list")
    project.start_autopilot(
        featurelist_id=featurelist_id,
        prepare_model_for_deployment=True,
        blend_best_models=False,
    )
    project.wait_for_autopilot()
    print("... AutoPilot is completed.")
    # Return the previous best model
    return models[0]
Get the best-performing models¶
Avoid using models trained at sample sizes above the third stage of Autopilot (80%, 100%). Blender and frozen models are ignored, so DataRobot selects models trained on 64% of the data.
def get_best_models(
    project,
    metric=None,
    by_partition="validation",
    start_featurelist_name=None,
    model_search_params=None,
):
    """
    Gets pd.Series of DR model objects sorted by performance. Excludes blenders, frozen models and those on a DR Reduced FL

    Parameters:
    -----------
    project: DR project object
    metric: str, metric to use for sorting models on lb, if None, default project metric will be used. Default None
    by_partition: str, which partition to sort by: 'validation', 'crossValidation',
        'backtesting' (datetime projects only) or 'holdout'. Default 'validation'
    start_featurelist_name: str, initial featurelist name to get models on. Default None
    model_search_params: dict to pass model search params. Default None

    Returns:
    -----------
    pd.Series of dr.Model objects, not blender, not frozen and not on DR Reduced Feature List
    """
    # A list of metrics that get better as their value increases;
    # everything else is error-like and sorted ascending.
    desc_metric_list = [
        "AUC",
        "Area Under PR Curve",
        "Gini Norm",
        "Kolmogorov-Smirnov",
        "Max MCC",
        "Rate@Top5%",
        "Rate@Top10%",
        "Rate@TopTenth%",
        "R Squared",
        "FVE Gamma",
        "FVE Poisson",
        "FVE Tweedie",
        "Accuracy",
        "Balanced Accuracy",
        "FVE Multinomial",
        "FVE Binomial",
    ]
    if not metric:
        metric = project.metric
    # Weighted variants keep the same direction as their unweighted counterparts
    if "Weighted" in metric:
        desc_metric_list = ["Weighted " + metric for metric in desc_metric_list]
    asc_flag = metric not in desc_metric_list
    if project.is_datetime_partitioned:
        assert by_partition in [
            "validation",
            "backtesting",
            "holdout",
        ], "Please specify correct partitioning, in datetime partitioned projects supported options are: 'validation', 'backtesting', 'holdout' "
        models_df = pd.DataFrame(
            [
                [
                    model.metrics[metric]["validation"],
                    model.metrics[metric]["backtesting"],
                    # .get(): the holdout score is absent until the holdout is
                    # unlocked; a 'holdout' column is required so sorting by
                    # by_partition='holdout' does not raise a KeyError.
                    model.metrics[metric].get("holdout"),
                    model.model_category,
                    model.is_frozen,
                    model.featurelist_name,
                    model,
                ]
                for model in project.get_datetime_models()
            ],
            columns=[
                "validation",
                "backtesting",
                "holdout",
                "category",
                "is_frozen",
                "featurelist_name",
                "model",
            ],
        ).sort_values([by_partition], ascending=asc_flag, na_position="last")
    else:
        assert by_partition in [
            "validation",
            "crossValidation",
            "holdout",
        ], "Please specify correct partitioning, supported options are: 'validation', 'crossValidation', 'holdout' "
        models_df = pd.DataFrame(
            [
                [
                    model.metrics[metric]["crossValidation"],
                    model.metrics[metric]["validation"],
                    # See the datetime branch: keep a 'holdout' column so the
                    # assert-allowed by_partition='holdout' is actually sortable.
                    model.metrics[metric].get("holdout"),
                    model.model_category,
                    model.is_frozen,
                    model.featurelist_name,
                    model,
                ]
                for model in project.get_models(
                    with_metric=metric, search_params=model_search_params
                )
            ],
            columns=[
                "crossValidation",
                "validation",
                "holdout",
                "category",
                "is_frozen",
                "featurelist_name",
                "model",
            ],
        ).sort_values([by_partition], ascending=asc_flag, na_position="last")
    # Keep only plain, non-frozen models
    base_mask = (models_df.category == "model") & ~models_df.is_frozen
    if start_featurelist_name:
        return models_df.loc[
            base_mask & (models_df.featurelist_name == start_featurelist_name),
            "model",
        ]
    # na=True so that models with a missing featurelist_name are excluded,
    # matching the original `== False` comparison semantics for NaN.
    return models_df.loc[
        base_mask
        & ~models_df.featurelist_name.str.contains("DR Reduced Features M", na=True),
        "model",
    ]
Primary FIRE function¶
This function automatically executes the FIRE feature selection algorithm on the top N models. Once the reduced feature list is created, DataRobot re-runs Autopilot and waits until it completes. DataRobot then automatically sorts the models based on the project metric, computes Feature Impact, and iterates over again. If the new feature list produces a model that ranks lower based on the metric, the algorithm expends one "life". It stops performing feature selection when no lives remain (the default is 2).
def main_feature_selection(
    project_id,
    start_featurelist_name=None,
    lifes=2,
    top_n_models=5,
    partition="validation",
    main_scoring_metric=None,
    initial_impact_reduction_ratio=0.95,
    best_model_search_params=None,
    use_ranks=True,
):
    """
    Main function. Meant to get the optimal shortest feature list by repeating the feature selection process until stop criteria is met.
    Currently supports Binary, Regression, Multiclass, Datetime partitioned (OTV), and AutoTS DataRobot projects.

    Example usage:
    >> import datarobot as dr
    >> dr.Client(config_path='PATH_TO_DR_CONFIG/drconfig.yaml')

    TIP: set best_model_search_params = {'sample_pct__lte': 65} to avoid using models trained on a higher sample size than the third stage of Autopilot, which is typically ~64% of the data.

    >> main_feature_selection('INSERT_PROJECT_ID',
                              start_featurelist_name=None,
                              lifes=2,
                              top_n_models=5,
                              partition='validation',
                              main_scoring_metric=None,
                              initial_impact_reduction_ratio=0.95,
                              best_model_search_params=None,
                              use_ranks=True)

    Parameters:
    -----------
    project_id: str, id of DR project
    start_featurelist_name: str, name of feature list to start iterating from. Default None
    lifes: int, stopping criteria, if no better model is produced after `lifes` iterations, stop feature reduction. Default 2
    top_n_models: int, only for 'Rank Aggregation method', get top N best models on the leaderboard. Default 5
    partition: str, whether to use 'validation', 'crossValidation' or 'backtesting' partition to get the best model on. Default 'validation'
    main_scoring_metric: str, DR metric to check performance against. If None, the DR project metric will be used
    initial_impact_reduction_ratio: float, ratio of total feature impact that the new feature list will contain. Default 0.95
    best_model_search_params: dict, dictionary of parameters to search the best model. See official DR python api docs. Default None
    use_ranks: bool, True to use median rank aggregation or False to use total impact unnormalized. Default True

    Returns:
    ----------
    dr.Model object of the best model on the leaderboard
    """
    project = dr.Project.get(project_id)
    ratio = initial_impact_reduction_ratio
    # Raise instead of assert so the check survives `python -O`
    if ratio >= 1:
        raise ValueError("Please specify initial_impact_reduction_ratio < 1")
    model_search_params = best_model_search_params
    runs = 0
    # Main function loop
    while lifes > 0:
        # Only the first iteration starts from the user-supplied feature list;
        # subsequent iterations start from the reduced lists FIRE creates.
        if runs > 0:
            start_featurelist_name = None
        try:
            best_model = feature_importance_rank_ensembling(
                project,
                n_models=top_n_models,
                metric=main_scoring_metric,
                by_partition=partition,
                feature_list_name=start_featurelist_name,
                ratio=ratio,
                model_search_params=model_search_params,
                use_ranks=use_ranks,
            )
        except dr.errors.ClientError as e:
            # Feature list creation can fail (e.g. a duplicate list);
            # decay the ratio so the next attempt selects a different
            # number of features, then retry.
            ratio *= ratio
            print(e, f"\nWill try again with a ratio decay ... New ratio={ratio:.3f}")
            continue
        ##############################
        ### GET THE NEW BEST MODEL ###
        ##############################
        new_best_model = get_best_models(
            project,
            metric=main_scoring_metric,
            by_partition=partition,
            model_search_params=model_search_params,
        ).values[0]
        #################################
        ##### PROCESS STOP CRITERIA #####
        #################################
        if best_model.id == new_best_model.id:
            # If no better model is produced with a recent run, expend 1 life
            lifes -= 1
            # If no lives left -> stop
            if lifes <= 0:
                print(
                    "New model performs worse. No lives left.\nAUTOMATIC FEATURE SELECTION PROCESS HAS BEEN STOPPED"
                )
                return new_best_model
            # Decay the ratio
            ratio *= ratio
            print(
                f"New model performs worse. One life is burnt.\nRepeat again with decaying the cumulative impact ratio. New ratio={ratio:.3f}"
            )
        runs += 1
        print("Run ", runs, " completed")
    return new_best_model
Create a project and initiate Autopilot¶
# Create the project from the public MADELON dataset and run Quick Autopilot.
# NOTE: the original snippet placed `project_name = 'FIRE'` inside set_target()
# without a comma (a SyntaxError); the project name belongs to Project.create.
project = dr.Project.create(
    'https://s3.amazonaws.com/datarobot_public_datasets/madelon_combined_80.csv',
    project_name='FIRE',
)
project.set_target(
    target='y',
    mode=dr.AUTOPILOT_MODE.QUICK,
    worker_count=-1,  # use all available modeling workers
)
# Wait for Autopilot to finish. You can set verbosity to 0 if you do not wish to see progress updates
project.wait_for_autopilot(verbosity=1)
print(project.id)
Feature selection¶
When Autopilot completes, perform feature selection. Then, start Autopilot again using a feature list based on the median rank aggregation of Feature Impact across the top 5 models trained on the "Informative Features" feature list.
# Adjust the function's parameters for your purposes
search_params = {"sample_pct__lte": 65}
best_model = main_feature_selection(
    project.id,
    partition="crossValidation",
    best_model_search_params=search_params,
)
Report the most accurate model¶
# Report the winning model's score and the size of its final feature list.
# (The original used a backslash continuation INSIDE the f-string literal,
# which embedded the next line's leading indentation into the printed text.)
best_score = best_model.metrics[project.metric]["crossValidation"]
n_features = len(best_model.get_features_used())
print(
    f"The best model has {project.metric} score = {best_score} "
    f"on the cross-validation partition on the list of {n_features} features"
)
The best model has LogLoss score = 0.264978 on the cross-validation partition on the list of 13 features