
Advanced Feature Selection using Feature Importance Rank Ensembling (FIRE)

This notebook shows the benefits of FIRE, advanced feature selection that uses median rank aggregation of feature impacts across several models created during a run of Autopilot.
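
To make the aggregation concrete, here is a minimal, self-contained sketch (not part of the original notebook) of median rank aggregation on toy data: each column holds one model's Feature Impact ranking, and features are ordered by their median rank across models.

import pandas as pd

# Toy example: rank 0 = most impactful feature for that model
ranks = pd.DataFrame({
    'model_A': [0, 1, 2, 3],
    'model_B': [1, 0, 3, 2],
    'model_C': [0, 2, 1, 3],
}, index=['feat_1', 'feat_2', 'feat_3', 'feat_4'])

# A feature's aggregate score is its median rank across the three models,
# which is robust to any single model's quirks
print(ranks.median(axis=1).sort_values())  # feat_1 ranks best, feat_4 worst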

Download this notebook from the code examples home page.

Requirements

Python version >= 3.7.3
DataRobot API version >= 2.22.1

For additional information, reference the Python package documentation.

This code example uses the MADELON dataset described in this paper. It can also be found here.
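
If you want to inspect the data before modeling, you can load it with pandas. The sketch below uses the same S3 URL as the project-creation step later in this notebook and assumes the target column is named y, as it is in that step.

import pandas as pd

# Preview the MADELON dataset used in this example
df = pd.read_csv('https://s3.amazonaws.com/datarobot_public_datasets/madelon_combined_80.csv')
print(df.shape)                # rows x columns
print(df['y'].value_counts())  # binary target distribution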

Import libraries and connect to DataRobot

import numpy as np
import pandas as pd
import datarobot as dr
# Connect to DataRobot using a config file containing your API Key and endpoint
dr.Client(config_path='PATH_TO_DR_CONFIG/drconfig.yaml')
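
If you prefer not to use a config file, the client also accepts an endpoint and API token directly; the values below are placeholders.

# Alternative: connect with explicit credentials (placeholders shown)
# dr.Client(endpoint='https://app.datarobot.com/api/v2', token='YOUR_API_TOKEN')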

FIRE feature selection function

def feature_importance_rank_ensembling(project, 
                                          n_models=5, 
                                          metric=None, 
                                          by_partition='validation',
                                          feature_list_name=None,
                                          ratio=0.95,
                                          model_search_params=None, 
                                          use_ranks=True,
                                         ):   
    """
    Implements feature selection using Feature Importance Rank Ensembling (FIRE) and restarts DataRobot Autopilot on the reduced feature list.

    Parameters:
    -----------
    project: DR project object
    n_models: int, number of top leaderboard models to compute Feature Impact on. Default 5
    metric: str, DR metric to check performance against. Default None, which uses the project's defined metric
    by_partition: str, whether to use the 'validation' or 'crossValidation' partition to select the best models. Default 'validation'
    feature_list_name: str, name of the feature list to start iterating from. Default None
    ratio: float, ratio of total Feature Impact that the new feature list will retain. Default 0.95
    model_search_params: dict, dictionary of parameters used to search for the best models. See the official DataRobot Python API docs. Default None
    use_ranks: bool, True to use median rank aggregation, False to use total unnormalized impact. Default True

    Returns:
    -----------
    dr.Model object
    """

    models = get_best_models(project,
                             metric=metric, 
                             by_partition=by_partition, 
                             start_featurelist_name=feature_list_name,
                             model_search_params=model_search_params)

    models = models.values[:n_models]

    all_impact = pd.DataFrame()

    print("Request Feature Impact calculations")
    # First, kick off all Feature Impact requests and let DataRobot handle parallelizing
    for model in models:
        try:
            model.request_feature_impact()
        except dr.errors.ClientError:
            # Feature Impact was already requested or computed for this model
            pass

    for model in models:
        # Allow time for DataRobot to compute Feature Impact
        feature_impact = pd.DataFrame(model.get_or_request_feature_impact(max_wait=60 * 15))  # wait up to 15 minutes

        # Track model name and ID for auditing purposes
        feature_impact['model_type'] = model.model_type
        feature_impact['model_id'] = model.id
        # By sorting and re-indexing, the new index becomes our 'ranking'
        feature_impact = feature_impact.sort_values(by='impactUnnormalized', ascending=False).reset_index(drop=True)
        feature_impact['rank'] = feature_impact.index.values

        # Add to the master list of all models' feature ranks
        all_impact = pd.concat([all_impact, feature_impact], ignore_index=True)

    # Determine how many features to select.
    # The threshold is based on the cumulative sum of unnormalized impact.
    all_impact_agg = all_impact\
            .groupby('featureName')[['impactNormalized','impactUnnormalized']]\
            .sum()\
            .sort_values('impactUnnormalized', ascending=False)\
            .reset_index()

    # Calculate cumulative Feature Impact and take the first features that possess <ratio> of total impact
    all_impact_agg['impactCumulative'] = all_impact_agg['impactUnnormalized'].cumsum()
    impact_threshold = all_impact_agg['impactCumulative'].max() * ratio
    tmp_fl = list(set(all_impact_agg[all_impact_agg.impactCumulative <= impact_threshold]['featureName'].values.tolist()))

    # The number of features to use
    n_feats = len(tmp_fl)

    if use_ranks:
        # Get the top features based on their median rank across models
        top_ranked_feats = list(all_impact
                                .groupby('featureName')['rank']
                                .median()
                                .sort_values()
                                .head(n_feats)
                                .index
                                .values)
    else:
        # Otherwise, get features based just on the total unnormalized feature impact
        top_ranked_feats = list(all_impact_agg.featureName.values[:n_feats])

    # Create a new feature list
    featurelist = project.create_modeling_featurelist(f'Reduced FL by Median Rank, top{n_feats}', top_ranked_feats)
    featurelist_id = featurelist.id
    # Start Autopilot on the new feature list
    print('Starting Autopilot on the reduced feature list')
    project.start_autopilot(featurelist_id=featurelist_id,
                            prepare_model_for_deployment=True,
                            blend_best_models=False,
                            )
    project.wait_for_autopilot()
    print('... Autopilot has completed.')
    # Return the previous best model
    return models[0]
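
For reference, a single FIRE iteration can also be run on its own. The call below is a sketch that assumes project is an existing dr.Project whose initial Autopilot run has finished; the 0.90 ratio is an arbitrary illustration.

# Run one FIRE iteration over the top 5 models, keeping features that
# account for 90% of total unnormalized Feature Impact, then re-run Autopilot
previous_best = feature_importance_rank_ensembling(project,
                                                   n_models=5,
                                                   ratio=0.90,
                                                   use_ranks=True)
print(previous_best.model_type)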

Get the best-performing models

Avoid using models trained at sample sizes above the third stage of Autopilot (80%, 100%). Blender and frozen models are ignored, so DataRobot selects models trained on 64% of the data.

def get_best_models(project, 
                    metric=None, 
                    by_partition='validation',
                    start_featurelist_name=None,
                    model_search_params=None
                   ):
    '''
    Gets a pd.Series of DR model objects sorted by performance. Excludes blenders, frozen models, and models built on the DR Reduced Features list.

    Parameters:
    -----------
    project: DR project object
    metric: str, metric to use for sorting models on the leaderboard; if None, the default project metric is used. Default None
    by_partition: str, whether to use the 'validation' or 'crossValidation' partition. Default 'validation'
    start_featurelist_name: str, initial feature list name to get models on. Default None
    model_search_params: dict of model search parameters. Default None

    Returns:
    -----------
    pd.Series of dr.Model objects: not blender, not frozen, and not built on the DR Reduced Features list
    '''

    # A list of metrics that get better as their value increases
    desc_metric_list = ['AUC', 'Area Under PR Curve', 'Gini Norm', 'Kolmogorov-Smirnov', 'Max MCC', 'Rate@Top5%',
                        'Rate@Top10%', 'Rate@TopTenth%', 'R Squared', 'FVE Gamma', 'FVE Poisson', 'FVE Tweedie',
                        'Accuracy', 'Balanced Accuracy', 'FVE Multinomial', 'FVE Binomial'
                       ]

    if not metric:
        metric = project.metric
    # Weighted variants (e.g., 'Weighted AUC') also improve as they increase
    if 'Weighted' in metric:
        desc_metric_list = ['Weighted ' + m for m in desc_metric_list]

    asc_flag = metric not in desc_metric_list

    if project.is_datetime_partitioned:
        assert by_partition in ['validation', 'backtesting', 'holdout'], \
            "Please specify a correct partition; in datetime-partitioned projects the supported options are: 'validation', 'backtesting', 'holdout'"
        models_df = pd.DataFrame(
            [[model.metrics[metric]['validation'],
              model.metrics[metric]['backtesting'],
              model.metrics[metric]['holdout'],
              model.model_category,
              model.is_frozen,
              model.featurelist_name,
              model,
              ] for model in project.get_datetime_models()],
            columns=['validation', 'backtesting', 'holdout', 'category', 'is_frozen', 'featurelist_name', 'model']
        ).sort_values([by_partition], ascending=asc_flag, na_position='last')

    else:
        assert by_partition in ['validation', 'crossValidation', 'holdout'], \
            "Please specify a correct partition; the supported options are: 'validation', 'crossValidation', 'holdout'"
        models_df = pd.DataFrame(
            [[model.metrics[metric]['crossValidation'],
              model.metrics[metric]['validation'],
              model.metrics[metric]['holdout'],
              model.model_category,
              model.is_frozen,
              model.featurelist_name,
              model,
              ] for model in project.get_models(with_metric=metric, search_params=model_search_params)],
            columns=['crossValidation', 'validation', 'holdout', 'category', 'is_frozen', 'featurelist_name', 'model']
        ).sort_values([by_partition], ascending=asc_flag, na_position='last')

    if start_featurelist_name:
        return models_df.loc[((models_df.category == 'model')&\
                              (models_df.is_frozen == False)&\
                              (models_df.featurelist_name == start_featurelist_name)
                             ),'model']
    else:
        return models_df.loc[((models_df.category == 'model')&\
                              (models_df.is_frozen == False)&\
                              (models_df.featurelist_name.str.contains('DR Reduced Features M') == False)
                             ),'model']
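
For example, to pull the current top models while excluding anything trained above the third Autopilot stage (as recommended above), you can pass sample_pct__lte through model_search_params. This is a sketch that assumes an existing project object.

# Top 3 eligible models by validation score, restricted to <= 65% sample size
top_models = get_best_models(project,
                             model_search_params={'sample_pct__lte': 65}).values[:3]
for m in top_models:
    print(m.model_type, m.metrics[project.metric]['validation'])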

Primary FIRE function

This function automatically executes the FIRE feature selection algorithm on the top N models. Once the reduced feature list is created, DataRobot re-runs Autopilot and waits until it completes. DataRobot then sorts the models by the project metric, computes Feature Impact, and iterates again. If a run fails to produce a better-ranked model, the algorithm expends one "life" and decays the cumulative-impact ratio; feature selection stops when no lives remain (you start with 3). The decay rule is illustrated in the sketch below.
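
As a quick illustration of the decay (assuming the default starting ratio of 0.95), each burnt life squares the ratio, so successive feature lists keep a progressively smaller share of total impact:

ratio = 0.95
for life in range(1, 4):
    ratio *= ratio  # same decay rule used in the loop below
    print(f'After decay {life}: ratio = {ratio:.4f}')
# After decay 1: ratio = 0.9025
# After decay 2: ratio = 0.8145
# After decay 3: ratio = 0.6634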

def main_feature_selection(project_id,
                           start_featurelist_name=None,
                           lifes=3,
                           top_n_models=5,
                           partition='validation',
                           main_scoring_metric=None,
                           initial_impact_reduction_ratio=0.95,
                           best_model_search_params=None,
                           use_ranks=True,
                          ):
    '''
    Main function. Gets the optimal, shortest feature list by repeating the feature selection process until the stopping criterion is met.
    Currently supports Binary, Regression, Multiclass, Datetime partitioned (OTV), and AutoTS DataRobot projects.

    Example usage:
    >> import datarobot as dr
    >> dr.Client(config_path='PATH_TO_DR_CONFIG/drconfig.yaml')
    >> main_feature_selection('INSERT_PROJECT_ID',
                              start_featurelist_name=None,
                              lifes=3,
                              top_n_models=5,
                              partition='validation',
                              main_scoring_metric=None,
                              initial_impact_reduction_ratio=0.95,
                              best_model_search_params=None,
                              use_ranks=True)

    TIP: set best_model_search_params = {'sample_pct__lte': 65} to avoid using models trained on a higher sample size than the third stage of Autopilot, which is typically ~64% of the data.

    Parameters:
    -----------
    project_id: str, id of the DR project
    start_featurelist_name: str, name of the feature list to start iterating from. Default None
    lifes: int, stopping criterion; if no better model is produced after this many iterations, feature reduction stops. Default 3
    top_n_models: int, number of top leaderboard models used for rank aggregation. Default 5
    partition: str, whether to use the 'validation', 'crossValidation', or 'backtesting' partition to select the best model. Default 'validation'
    main_scoring_metric: str, DR metric to check performance against; if None, the DR project metric is used
    initial_impact_reduction_ratio: float, ratio of total Feature Impact that the new feature list will retain. Default 0.95
    best_model_search_params: dict, dictionary of parameters used to search for the best model. See the official DataRobot Python API docs. Default None
    use_ranks: bool, True to use median rank aggregation, False to use total unnormalized impact. Default True

    Returns:
    ----------
    dr.Model object of the best model on the leaderboard
    '''
    project = dr.Project.get(project_id)

    ratio = initial_impact_reduction_ratio
    assert ratio < 1, "Please specify initial_impact_reduction_ratio < 1"

    model_search_params = best_model_search_params

    runs = 0
    # Main function loop
    while lifes > 0:
        if runs > 0:
            start_featurelist_name = None
        try:    
            best_model = feature_importance_rank_ensembling(project,
                                                               n_models=top_n_models,
                                                               metric=main_scoring_metric,
                                                               by_partition=partition,
                                                               feature_list_name=start_featurelist_name,
                                                               ratio=ratio,
                                                               model_search_params=best_model_search_params,
                                                               use_ranks=use_ranks
                                                              )
        except dr.errors.ClientError as e:
            # Decay the ratio and retry
            ratio *= ratio
            print(e, f'\nWill try again with a ratio decay ... New ratio={ratio:.3f}')
            continue

        ##############################
        ### GET THE NEW BEST MODEL ###
        ##############################

        new_best_model = get_best_models(project,
                                         metric=main_scoring_metric, 
                                         by_partition = partition, 
                                         model_search_params=model_search_params).values[0]


        #################################
        ##### PROCESS STOP CRITERIA #####
        #################################

        if best_model.id == new_best_model.id:
            # If no better model is produced with a recent run, expend 1 life
            lifes -= 1

            # If no lives left -> stop
            if lifes <= 0:
                print('New model performs worse. No lives left.\nAUTOMATIC FEATURE SELECTION PROCESS HAS BEEN STOPPED')
                return new_best_model

            # Decay the ratio
            ratio *= ratio
            print(f'New model performs worse. One life is burnt.\nRepeat again with decaying the cumulative impact ratio. New ratio={ratio:.3f}')

        runs += 1
        print(f'Run {runs} completed')

    return new_best_model

Create a project and initiate Autopilot

project = dr.Project.create('https://s3.amazonaws.com/datarobot_public_datasets/madelon_combined_80.csv')
project.set_target(target='y',
                   mode=dr.AUTOPILOT_MODE.FULL_AUTO,
                   worker_count=-1,
                  )
# Wait for Autopilot to finish. You can set verbosity to 0 if you do not wish to see progress updates
project.wait_for_autopilot(verbosity=1)
print(project.id)

Feature selection

When Autopilot completes, perform feature selection. Then, start Autopilot again using a feature list based on the median rank aggregation of Feature Impact across the top 5 models trained on the "Informative Features" feature list.

# Adjust the function's parameters for your purposes
best_model = main_feature_selection(project.id,
                                    partition='crossValidation',
                                    best_model_search_params={'sample_pct__lte': 65})

Report the most accurate model

print(f"The best model has {project.metric} score = {best_model.metrics[project.metric]['crossValidation']} on the cross-validation partition \
on the list of {len(best_model.get_features_used())} features")
The best model has LogLoss score = 0.264978 on the cross-validation partition on the list of 13 features


Updated March 28, 2022