Model insights¶
After modeling is complete, you can generate insights for the top-performing model using the code in this notebook. After reviewing the insights, you can generate new, reduced feature lists to improve the modeling process.
Compute Feature Impact¶
# Imports used throughout this section (they may already be loaded earlier in the notebook)
import datarobot as dr
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd

SEGMENT_NAME = "segment_name"
combined_models = project.get_combined_models()
current_combined_model = combined_models[0]
segments_info = current_combined_model.get_segments_info()
segment_metadata = {x.segment: x for x in segments_info}
model = dr.Model.get(segment_metadata[SEGMENT_NAME].project_id, segment_metadata[SEGMENT_NAME].model_id)
child_project = dr.Project.get(project_id=segment_metadata[SEGMENT_NAME].project_id)
feature_impacts = model.get_or_request_feature_impact()
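# If you're unsure which segment name to use, you can list the available segments first
# (optional sketch: prints each segment with its child project and champion model IDs)
for info in segments_info:
    print(f"{info.segment}: project_id={info.project_id}, model_id={info.model_id}")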
dr_dark_blue = '#08233F'
dr_blue = '#1F77B4'
dr_orange = '#FF7F0E'
dr_red = '#BE3C28'
percent_tick_fmt = mtick.PercentFormatter(xmax=1.0)
impact_df = pd.DataFrame(feature_impacts)
impact_df.sort_values(by='impactNormalized', ascending=True, inplace=True)
# Positive values are blue, negative are red
bar_colors = impact_df.impactNormalized.apply(lambda x: dr_red if x < 0 else dr_blue)
ax = impact_df.plot.barh(x='featureName', y='impactNormalized',
                         legend=False,
                         color=bar_colors,
                         figsize=(12, 14))
ax.xaxis.set_major_formatter(percent_tick_fmt)
ax.xaxis.set_tick_params(labeltop=True)
ax.xaxis.grid(True, alpha=0.2)
ax.set_facecolor(dr_dark_blue)
plt.ylabel('')
plt.xlabel('Normalized Impact')
plt.xlim((None, 1)) # Allow for negative impact
plt.title('Feature Impact', y=1.04);
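For a quick tabular complement to the chart, you can also display the strongest features from the same Feature Impact records. A minimal sketch; the column names match the feature_impacts output above:
# Show the ten strongest features by normalized impact
top_features_df = pd.DataFrame(feature_impacts)
top_features_df = top_features_df.sort_values('impactNormalized', ascending=False).head(10)
top_features_df[['featureName', 'impactNormalized', 'impactUnnormalized']]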
Create a histogram¶
def matplotlib_pair_histogram(labels, counts, target_avgs,
                              bin_count, ax1, feature):
    # ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    # ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    # Rotate categorical labels
    if feature.feature_type in ['Categorical', 'Text', 'Numeric']:
        ax1.tick_params(axis='x', rotation=45)
    ax1.set_ylabel(feature.name, color=dr_blue)
    ax1.bar(labels, counts, color=dr_blue)
    ax1.set_xticklabels([str(round(float(label), 2)) for label in labels])
    # Instantiate a second axes that shares the same x-axis
    ax2 = ax1.twinx()
    ax2.set_ylabel(TARGET, color=dr_orange)
    ax2.plot(labels, target_avgs, marker='o', lw=1, color=dr_orange)
    ax1.set_facecolor(dr_dark_blue)
    title = 'Histogram for {} ({} bins)'.format(feature.name, bin_count)
    ax1.set_title(title)
def draw_feature_histogram(feature_name, bin_count):
    feature = dr.Feature.get(project.id, feature_name)
    # Retrieve downsampled histogram data
    # based on the desired bin count
    data = feature.get_histogram(bin_count).plot
    data = pd.DataFrame(data, columns=['label', 'count', 'target'])
    data['label'] = data['label'].astype(float).astype(int).astype(str)
    data = data.to_dict(orient='records')
    labels = [row['label'] for row in data]
    counts = [row['count'] for row in data]
    target_averages = [row['target'] for row in data]
    f, axarr = plt.subplots()
    f.set_size_inches((10, 4))
    matplotlib_pair_histogram(labels, counts, target_averages,
                              bin_count, axarr, feature)
Use the code below to list a project's features.
SEGMENT_NAME = "segment_name"
combined_models = project.get_combined_models()
current_combined_model = combined_models[0]
segments_info = current_combined_model.get_segments_info()
for segment in segments_info:
    if segment.segment == SEGMENT_NAME:
        child_project = dr.Project.get(project_id=segment.project_id)
        break
child_project.get_features()
Create a feature histogram¶
draw_feature_histogram('Date (Day of Month)', 10)
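You can call draw_feature_histogram for other features in the same way, for example another automatically derived date feature (assuming such a feature exists in your project and has numeric bin labels):
# Hypothetical example: a 12-bin histogram for a derived month feature
draw_feature_histogram('Date (Month)', 12)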
Curate feature lists¶
Choose one of the following methods to generate a reduced feature list.
- Percent of Top Features
- Number of Top Features
- Manually Specifying Features
Create a dataframe to store feature list names and IDs.
feature_lists_df = pd.DataFrame()
You can then retrieve the top x percent of features.
THRESHOLD = 0.95
print(f'Collecting Feature Impact for M{model.model_number} in child project "{child_project.project_name}"')
impact = pd.DataFrame.from_records(model.get_or_request_feature_impact())
# Cumulative share of total unnormalized impact (features arrive sorted by impact)
impact['cumulative_impact'] = impact['impactUnnormalized'].cumsum() / impact['impactUnnormalized'].sum()
to_keep = np.where(impact['cumulative_impact'] <= THRESHOLD)[0]
if len(to_keep) < 1:
    raise ValueError('Applying this threshold would result in a featurelist with no features')
idx = np.max(to_keep)
selected_features = impact.loc[0:idx, 'featureName'].to_list()
feature_list = project.create_modeling_featurelist(f'Top {len(selected_features)} features M{model.model_number}',
                                                   selected_features)
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], ignore_index=True)
feature_lists_df
You can raise the feature threshold and create another feature list with those features.
THRESHOLD = 0.99
print(f'Collecting Feature Impact for M{model.model_number} in child project "{child_project.project_name}"')
impact = pd.DataFrame.from_records(model.get_or_request_feature_impact())
impact['cumulative_impact'] = impact['impactUnnormalized'].cumsum() / impact['impactUnnormalized'].sum()
to_keep = np.where(impact['cumulative_impact'] <= THRESHOLD)[0]
if len(to_keep) < 1:
    raise ValueError('Applying this threshold would result in a featurelist with no features')
idx = np.max(to_keep)
selected_features = impact.loc[0:idx, 'featureName'].to_list()
feature_list = project.create_modeling_featurelist(f'Top {len(selected_features)} features M{model.model_number}',
                                                   selected_features)
cust_feat_list = pd.DataFrame(
    [
        {
            'Name': feature_list.name,
            'ID': feature_list.id
        }
    ]
)
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], ignore_index=True)
feature_lists_df
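Because the threshold workflow above is repeated for each cutoff, you can optionally wrap it in a small helper. The function below is a hypothetical convenience wrapper around the same calls shown above, not part of the DataRobot client:
def create_top_impact_featurelist(project, model, threshold, feature_lists_df):
    # Cumulative share of unnormalized impact (features arrive sorted by impact)
    impact = pd.DataFrame.from_records(model.get_or_request_feature_impact())
    impact['cumulative_impact'] = impact['impactUnnormalized'].cumsum() / impact['impactUnnormalized'].sum()
    to_keep = np.where(impact['cumulative_impact'] <= threshold)[0]
    if len(to_keep) < 1:
        raise ValueError('Applying this threshold would result in a featurelist with no features')
    selected_features = impact.loc[0:np.max(to_keep), 'featureName'].to_list()
    feature_list = project.create_modeling_featurelist(
        f'Top {len(selected_features)} features M{model.model_number}', selected_features)
    new_row = pd.DataFrame([{'Name': feature_list.name, 'ID': feature_list.id}])
    return pd.concat([feature_lists_df, new_row], ignore_index=True)

# Optional example: create a third, tighter feature list at a 90% cutoff
feature_lists_df = create_top_impact_featurelist(project, model, 0.90, feature_lists_df)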
To retrieve the top x features:
MAX_FEATURES = 12
print(f'Collecting Feature Impact for M{model.model_number} in child project "{child_project.project_name}"')
impact = model.get_or_request_feature_impact()
impact.sort(key=lambda x: x['impactNormalized'], reverse=True)
feature_list_items = [f['featureName'] for f in impact[:MAX_FEATURES]]
feature_list = project.create_modeling_featurelist(f'Top {MAX_FEATURES} features M{model.model_number}',
feature_list_items)
cust_feat_list = pd.DataFrame(
[
{
'Name': feature_list.name,
'ID': feature_list.id
}
]
)
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], ignore_index=True)
feature_lists_df
To manually select features:
project.get_modeling_features()
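If you only need the feature names, for example to build a manual list, you can extract them from the ModelingFeature objects:
# Print only the names of the project's modeling features
[feature.name for feature in project.get_modeling_features()]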
To list features by normalized impact:
feature_impacts = model.get_or_request_feature_impact()
feature_impacts.sort(key=lambda x: x['impactNormalized'], reverse=True)
feature_impacts
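To make the copy-and-paste step below easier, you can reduce the sorted output to feature names only:
# Feature names ordered by normalized impact, highest first
[f['featureName'] for f in feature_impacts]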
To manually select features and put them into a list, copy them from the output above and paste them into the following code block.
FEATURES = ['Date (Day of Week) (actual)', 'Date (days from previous calendar event) (actual)',
'SalesQty (7 day mean)', 'SalesQty (naive latest value)',
'SalesQty (14 day mean)', 'Advertised (actual)']
print(len(FEATURES))
feature_list = project.create_modeling_featurelist(f'Manual Feature Selection {len(FEATURES)}', FEATURES)
cust_feat_list = pd.DataFrame(
[
{
'Name': feature_list.name,
'ID': feature_list.id
}
]
)
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], ignore_index=True)
feature_lists_df
To create another manually selected feature list:
FEATURES = ['Date (Day of Week) (actual)', 'Date (days from previous calendar event) (actual)',
'SalesQty (7 day mean)', 'SalesQty (naive latest value)',
'SalesQty (14 day mean)', 'Advertised (actual)', 'numTransactions (28 day median)',
'FrontPg (actual)', 'SalesQty (14 day std)']
print(len(FEATURES))
feature_list = project.create_modeling_featurelist(f'Manual Feature Selection {len(FEATURES)}', FEATURES)
cust_feat_list = pd.DataFrame(
[
{
'Name': feature_list.name,
'ID': feature_list.id
}
]
)
feature_lists_df = pd.concat([feature_lists_df, cust_feat_list], ignore_index=True)
feature_lists_df
To list all feature lists:
project.get_modeling_featurelists()
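To view only the names and IDs, for example to confirm the lists created above:
# Name and ID of every modeling feature list on the project
[(fl.name, fl.id) for fl in project.get_modeling_featurelists()]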
Run model blueprints on a new feature list¶
feature_lists_df
To run models on the newly created feature lists, select the top five blueprints (ranked by RMSE across all backtests) and put their model IDs in a list.
MODELS = scores.loc[scores['Project_ID'] == PID]\
.sort_values(by=['RMSE_All_BT']).head(5)['Model_ID']
MODELS
# Manually select model IDs
# If you want to manually select the models to run against the new feature lists,
# fill in the model IDs below:
# MODELS = ['', '', '']
Then, run the new feature lists against the selected blueprints.
DURATION = dr.helpers.partitioning_methods.construct_duration_string(years=2, months=8, days=18)

project = dr.Project.get(PID)
for m in MODELS:
    model = dr.Model.get(project.id, m)
    for fl in feature_lists_df.values:
        fl_id = fl[1]
        try:
            # Train the model on the new feature list over the specified duration
            model.train_datetime(featurelist_id=fl_id,
                                 training_duration=DURATION)
            print(f"Running Feature List {fl[0]} on Model {model.model_type}")
        except dr.errors.ClientError:
            print(f"Feature List {fl[0]} already run on Model {model.model_type}")
    print(' ')
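If you want the notebook to block until the new models finish training, a variant of the loop above can collect the ModelJob objects returned by train_datetime and wait on them. This is a sketch; the jobs list and max_wait value are illustrative:
jobs = []
for m in MODELS:
    model = dr.Model.get(project.id, m)
    for fl in feature_lists_df.values:
        try:
            # train_datetime returns a ModelJob that can be waited on
            jobs.append(model.train_datetime(featurelist_id=fl[1], training_duration=DURATION))
        except dr.errors.ClientError:
            pass  # This feature list has already been run on the model
for job in jobs:
    job.wait_for_completion(max_wait=3600)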
Create a feature list for the selected project¶
selected_project = filter(lambda p: p[1] == PID, lst)
selected_project = list(selected_project)
selected_project
Next steps¶
After reviewing model insights and running the newly created feature lists, you can generate predictions and initiate model deployment.