Skip to content

Click in-app to access the full platform documentation for your version of DataRobot.

Model insights

After modeling is complete, you can generate insights for the top-performing model using the code in this notebook. After reviewing the insights, you can generate new, reduced feature lists to improve the modeling process.

Compute Feature Impact

# Name of the segment whose model should be inspected.
SEGMENT_NAME = "segment_name"

combined_models = project.get_combined_models()
current_combined_model = combined_models[0]
segments_info = current_combined_model.get_segments_info()
# Index the segment records by segment name for direct lookup.
segment_metadata = {x.segment: x for x in segments_info}

# Look the segment up through the SEGMENT_NAME variable. The quoted literal
# "SEGMENT_NAME" would only match a segment that is literally called that.
selected_segment = segment_metadata[SEGMENT_NAME]
model = dr.Model.get(selected_segment.project_id, selected_segment.model_id)
child_project = dr.Project.get(project_id=selected_segment.project_id)
feature_impacts = model.get_or_request_feature_impact()

# Colour palette shared by the plots in this notebook.
dr_dark_blue = '#08233F'
dr_blue      = '#1F77B4'
dr_orange    = '#FF7F0E'
dr_red       = '#BE3C28'
percent_tick_fmt = mtick.PercentFormatter(xmax=1.0)

# Ascending sort so the bar with the largest normalized impact is drawn last.
impact_df = pd.DataFrame(feature_impacts)
impact_df.sort_values(by='impactNormalized', ascending=True, inplace=True)

# Positive values are blue, negative are red
bar_colors = impact_df.impactNormalized.apply(lambda x: dr_red if x < 0
                                              else dr_blue)

ax = impact_df.plot.barh(x='featureName', y='impactNormalized',
                         legend=False,
                         color=bar_colors,
                         figsize=(12, 14))
ax.xaxis.set_major_formatter(percent_tick_fmt)
ax.xaxis.set_tick_params(labeltop=True)
ax.xaxis.grid(True, alpha=0.2)
ax.set_facecolor(dr_dark_blue)

plt.ylabel('')
plt.xlabel('Normalized Impact')
plt.xlim((None, 1))  # Allow for negative impact
plt.title('Feature Impact', y=1.04);

Create a histogram

def matplotlib_pair_histogram(labels, counts, target_avgs,
                              bin_count, ax1, feature):
    """Draw a count bar chart with a target-average line on a twin y-axis.

    Args:
        labels: Bin labels for the x-axis; each must be convertible to float.
        counts: Bar heights, one per label.
        target_avgs: Values for the overlaid line, one per label.
        bin_count: Number of bins; only used in the plot title.
        ax1: Matplotlib axes that receives the bars and the title.
        feature: Object exposing ``feature_type`` and ``name``.
    """
    # Tilt the x tick labels for these feature types.
    rotated_types = ('Categorical', 'Text', 'Numeric')
    if feature.feature_type in rotated_types:
        ax1.tick_params(axis='x', rotation=45)

    # Left axis: row counts per bin.
    ax1.set_ylabel(feature.name, color=dr_blue)
    ax1.bar(labels, counts, color=dr_blue)
    tick_text = []
    for label in labels:
        tick_text.append(str(round(float(label), 2)))
    ax1.set_xticklabels(tick_text)

    # Right axis shares the x-axis and carries the target averages.
    target_axis = ax1.twinx()
    target_axis.set_ylabel(TARGET, color=dr_orange)
    target_axis.plot(labels, target_avgs, marker='o', lw=1, color=dr_orange)

    ax1.set_facecolor(dr_dark_blue)
    ax1.set_title('Histogram for {} ({} bins)'.format(feature.name, bin_count))

def draw_feature_histogram(feature_name, bin_count):
    """Fetch histogram data for one feature and plot it.

    Args:
        feature_name: Name of the feature to look up on the global ``project``.
        bin_count: Number of bins requested from ``get_histogram``; it is also
            passed on to the plotting helper.
    """
    feature = dr.Feature.get(project.id, feature_name)

    # Histogram data downsampled to the requested bin count.
    histogram = feature.get_histogram(bin_count)
    frame = pd.DataFrame(histogram.plot, columns=['label', 'count', 'target'])
    # Normalise labels to integer strings (float -> int -> str).
    frame['label'] = frame['label'].astype(float).astype(int).astype(str)
    records = frame.to_dict(orient='records')

    labels, counts, target_averages = [], [], []
    for record in records:
        labels.append(record['label'])
        counts.append(record['count'])
        target_averages.append(record['target'])

    figure, axis = plt.subplots()
    figure.set_size_inches((10, 4))
    matplotlib_pair_histogram(labels, counts, target_averages,
                              bin_count, axis, feature)

Use the code below to list a project's features.

# Name of the segment whose child project should be inspected.
SEGMENT_NAME = "segment_name"

combined_models = project.get_combined_models()
current_combined_model = combined_models[0]
segments_info = current_combined_model.get_segments_info()

# Find the child project that belongs to the requested segment.
for segment in segments_info:
    if segment.segment == SEGMENT_NAME:
        child_project = dr.Project.get(project_id=segment.project_id)
        break
else:
    # Without this guard the next line would fail with a NameError, or
    # silently reuse a child_project left over from an earlier cell.
    raise ValueError(f'No segment named "{SEGMENT_NAME}" in the combined model')

child_project.get_features()

Create a feature histogram

# Plot the 'Date (Day of Month)' feature using 10 histogram bins.
draw_feature_histogram('Date (Day of Month)', 10)

Curate feature lists

Choose one of the following methods to generate a reduced feature list.

  • Percent of Top Features
  • Number of Top Features
  • Manually Specifying Features

Create a dataframe to store feature list names and IDs.

# Start empty; the cells below add one row ('Name', 'ID') per created feature list.
feature_lists_df = pd.DataFrame()

You can then retrieve the top features that together account for x percent of the cumulative feature impact.

# Fraction of the total unnormalized impact that the reduced list may cover.
THRESHOLD = 0.95
print(f'Collecting Feature Impact for M{model.model_number} in child_project "{project.project_name}"')

impact = pd.DataFrame.from_records(model.get_or_request_feature_impact())
# Order from most to least impactful so the cumulative sum is meaningful, and
# reset the index so positions from np.where match the labels used by .loc.
impact = impact.sort_values(by='impactUnnormalized', ascending=False,
                            kind='mergesort').reset_index(drop=True)
impact['cumulative_impact'] = impact['impactUnnormalized'].cumsum() / impact['impactUnnormalized'].sum()

to_keep = np.where(impact['cumulative_impact'] <= THRESHOLD)[0]
if len(to_keep) < 1:
    # Stop here with a clear message; np.max on an empty array would raise
    # an unrelated-looking ValueError a few lines later.
    raise ValueError('Applying this threshold would result in a featurelist with no features')

idx = np.max(to_keep)

# .loc slicing is label-based and inclusive, so row idx is kept.
selected_features = impact.loc[0:idx, 'featureName'].to_list()
feature_list = project.create_modeling_featurelist(f'Top {len(selected_features)} features M{model.model_number}',
                                                   selected_features)

# Record the new feature list's name and ID.
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)
# DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], sort=False).reset_index(drop=True)

feature_lists_df

You can raise the feature threshold and create another feature list with those features.

# Fraction of the total unnormalized impact that the reduced list may cover.
THRESHOLD = 0.99
print(f'Collecting Feature Impact for M{model.model_number} in child_project "{project.project_name}"')

impact = pd.DataFrame.from_records(model.get_or_request_feature_impact())
# Order from most to least impactful so the cumulative sum is meaningful, and
# reset the index so positions from np.where match the labels used by .loc.
impact = impact.sort_values(by='impactUnnormalized', ascending=False,
                            kind='mergesort').reset_index(drop=True)
impact['cumulative_impact'] = impact['impactUnnormalized'].cumsum() / impact['impactUnnormalized'].sum()

to_keep = np.where(impact['cumulative_impact'] <= THRESHOLD)[0]
if len(to_keep) < 1:
    # Stop here with a clear message; np.max on an empty array would raise
    # an unrelated-looking ValueError a few lines later.
    raise ValueError('Applying this threshold would result in a featurelist with no features')

idx = np.max(to_keep)

# .loc slicing is label-based and inclusive, so row idx is kept.
selected_features = impact.loc[0:idx, 'featureName'].to_list()
feature_list = project.create_modeling_featurelist(f'Top {len(selected_features)} features M{model.model_number}',
                                                   selected_features)

# Record the new feature list's name and ID.
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)
# DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], sort=False).reset_index(drop=True)

feature_lists_df

To retrieve the top x features:

# Upper bound on how many features go into the reduced list.
MAX_FEATURES = 12
print(f'Collecting Feature Impact for M{model.model_number} in project "{project.project_name}"')

impact = model.get_or_request_feature_impact()

# Highest normalized impact first, then keep at most MAX_FEATURES names.
impact.sort(key=lambda x: x['impactNormalized'], reverse=True)
feature_list_items = [entry['featureName'] for entry in impact[:MAX_FEATURES]]

feature_list = project.create_modeling_featurelist(f'Top {MAX_FEATURES} features M{model.model_number}',
                                                   feature_list_items)

# Record the new feature list's name and ID.
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)

# DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], sort=False).reset_index(drop=True)

feature_lists_df

To manually select features:

# Retrieve the project's modeling features so their names can be picked by hand.
project.get_modeling_features()

To list features by normalized impact:

# Feature impact records ordered from highest to lowest normalized impact.
feature_impacts = sorted(
    model.get_or_request_feature_impact(),
    key=lambda record: record['impactNormalized'],
    reverse=True,
)
feature_impacts

To manually select features and put them into a list, copy them from the output above and paste them into the following code block.

# Hand-picked feature names, copied from the feature impact output.
FEATURES = ['Date (Day of Week) (actual)', 'Date (days from previous calendar event) (actual)',
            'SalesQty (7 day mean)', 'SalesQty (naive latest value)',
            'SalesQty (14 day mean)', 'Advertised (actual)']

print(len(FEATURES))
feature_list = project.create_modeling_featurelist(f'Manual Feature Selection {len(FEATURES)}', FEATURES)

# Record the new feature list's name and ID.
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)

# DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], sort=False).reset_index(drop=True)

feature_lists_df

To create another manually selected feature list:

# Hand-picked feature names, copied from the feature impact output.
FEATURES = ['Date (Day of Week) (actual)', 'Date (days from previous calendar event) (actual)',
            'SalesQty (7 day mean)', 'SalesQty (naive latest value)',
            'SalesQty (14 day mean)', 'Advertised (actual)', 'numTransactions (28 day median)',
            'FrontPg (actual)', 'SalesQty (14 day std)']

print(len(FEATURES))
feature_list = project.create_modeling_featurelist(f'Manual Feature Selection {len(FEATURES)}', FEATURES)

# Record the new feature list's name and ID.
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)

# DataFrame.append was removed in pandas 2.0; pd.concat works on all versions.
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], sort=False).reset_index(drop=True)

feature_lists_df

To list all feature lists:

# Retrieve every modeling feature list defined on the project.
project.get_modeling_featurelists()

Run model blueprints on a new feature list

# Show the feature lists collected so far ('Name' and 'ID' columns).
feature_lists_df

To get models to run on the newly created feature lists, select the top five blueprints and put the Model IDs in a list.

# Keep only the rows for project PID, order them ascending by the
# RMSE_All_BT column, and take the Model_ID of the first five.
MODELS = (
    scores.loc[scores['Project_ID'] == PID]
    .sort_values(by=['RMSE_All_BT'])
    .head(5)['Model_ID']
)

MODELS
# Manual alternative: to choose the models yourself instead of taking the
# top five, list their model IDs here.
# MODELS = ['', '', '']

Then, run the new feature lists against the selected blueprints.

# Training duration passed to every train_datetime call below.
DURATION = dr.helpers.partitioning_methods.construct_duration_string(years=2, months=8, days=18)

# The project does not change between iterations, so fetch it once.
project = dr.Project.get(PID)

for m in MODELS:
    model = dr.Model.get(project, m)

    # Each row holds the feature list name in column 0 and its ID in column 1.
    for fl in feature_lists_df.values:
        fl_name, fl_id = fl[0], fl[1]
        try:
            model.train_datetime(featurelist_id=fl_id,
                                 training_duration=DURATION)
        except dr.errors.ClientError:
            # Report the skip only when the request was actually rejected;
            # previously this message was printed after every attempt.
            # NOTE(review): ClientError is assumed to mean "already run";
            # other client errors would be reported the same way.
            print(f"Feature List {fl_name} already run on Model {model.model_type}")
        else:
            print(f"Running Feature List {fl_name} on Model {model.model_type}")
        print(' ')

Create a feature list for the selected project

# Keep the entries of lst whose second element equals the project ID.
selected_project = [entry for entry in lst if entry[1] == PID]

selected_project

Next steps

After reviewing model insights and running the newly-created feature lists, you can generate predictions and initiate model deployment.


Updated May 2, 2022
Back to top