Predict fraudulent medical claims¶
This notebook outlines a use case for identifying fraudulent medical claims using the DataRobot Python package.
You can download a sample dataset for this workflow here.
Requirements¶
- Python version 3.8.12
- DataRobot API version 2.27.2
For additional information, reference the Python package documentation.
from datetime import datetime
import json
import datarobot as dr
import numpy as np
import pandas as pd
import requests
import yaml
Connect to DataRobot¶
# If the config file is not in the default location described in the API Quickstart guide, '~/.config/datarobot/drconfig.yaml', then you will need to call
# dr.Client(config_path='path-to-drconfig.yaml')
Define variables¶
Use the code below to define variables used to make API calls to a deployment in later steps of this notebook.
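You must provide a YAML file that contains the following required keys. Load it into a dictionary named `creds`, which `predict_deployment_expl` reads to authenticate prediction requests:
- token: a DataRobot API key
- username: your DataRobot login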
You can read more about different options for connecting to DataRobot from the client.
# Base name of the supervised project; the later projects and the deployment label append a suffix to it
project_name = "Medical_Insurance_Fraud"
# Column of the dataset that the supervised projects predict
target = "iFraud"
# Optimization metric passed to set_target for the supervised projects
metric = "LogLoss"
# Autopilot mode passed to set_target for every project in this notebook
autopilot_mode = dr.enums.AUTOPILOT_MODE.QUICK
Define functions¶
def get_model_score(mod, metric):
    """
    Summarize a single model as a flat dictionary.

    mod: a model object exposing model_number, model_type, sample_pct and a metrics mapping
    metric: name of the metric to look up in mod.metrics

    Returns a dict holding the model number, model type, the model object itself,
    its sample percentage, and the validation / cross-validation scores of the
    metric (None for a score that is not available).
    """
    # A metric missing from mod.metrics falls back to an empty mapping,
    # so both scores come out as None.
    metric_scores = mod.metrics.get(metric, {})
    return {
        "model_number": mod.model_number,
        "model_type": mod.model_type,
        "model": mod,
        "sample_pct": mod.sample_pct,
        "metric_v": metric_scores.get("validation"),
        "metric_cv": metric_scores.get("crossValidation"),
    }
def get_model_scores(
    proj,
    metric=None,
    ascending=True,
    search_params={"sample_pct__gt": 63.0, "sample_pct__lt": 65.0},
):
    """
    Iterate through the project models and get their performance metric.

    proj: project object providing get_models() and a metric attribute
    metric: metric name to report; proj.metric is used when None
    ascending: sort direction applied to the score columns
    search_params: filter handed unchanged to proj.get_models(); the default
        dict is shared between calls but is only passed through, never mutated here

    Returns a DataFrame with one row per non-blender model (columns model_number,
    model_type, model, sample_pct, metric_v, metric_cv), sorted by the
    cross-validation score and then the validation score, with missing scores
    last. When no model qualifies, an empty DataFrame with the same columns is
    returned.
    """
    columns = ["model_number", "model_type", "model", "sample_pct", "metric_v", "metric_cv"]
    if metric is None:
        metric = proj.metric
    models = proj.get_models(search_params=search_params)
    model_scores = [get_model_score(m, metric) for m in models if m.model_category != "blend"]
    # Explicit columns keep the sort below valid even when no model matched;
    # a frame built from an empty list would otherwise have no columns at all.
    df = pd.DataFrame(model_scores, columns=columns)
    df = df.sort_values(["metric_cv", "metric_v"], ascending=ascending, na_position="last")
    df = df.reset_index(drop=True)
    return df
def get_train_preds(mod):
    """
    Request and/or retrieve training predictions for a given model.

    mod: a model object providing request_training_predictions(), project_id and id

    Training predictions for all data are requested first. If that fails, the
    training predictions already stored for the project are searched for an
    entry that belongs to this model and covers the "all" subset.

    Returns the training predictions as a DataFrame.

    Raises RuntimeError (chained to the original request error) when the request
    fails and no stored training predictions match the model.
    """
    try:
        # Request training predictions and wait for the job to finish
        pred_job = mod.request_training_predictions(dr.enums.DATA_SUBSET.ALL)
        return pred_job.get_result_when_complete().get_all_as_dataframe()
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        # Retrieve training predictions if they were already requested
        for train_pred in dr.TrainingPredictions.list(mod.project_id):
            if train_pred.model_id == mod.id and train_pred.data_subset == "all":
                return dr.TrainingPredictions.get(
                    mod.project_id, train_pred.prediction_id
                ).get_all_as_dataframe()
        # Nothing stored either: surface the original failure instead of
        # letting callers trip over a missing result.
        raise RuntimeError(
            f"Could not request or find training predictions for model {mod.id}"
        ) from err
def prep_ad_preds(mods, prediction_col="prediction"):
    """
    Combine the training predictions of several anomaly detection models.

    mods: a list of anomaly detection models
    prediction_col: name of the column that holds each model's prediction

    Returns a DataFrame indexed by row_id with an integer partition_id column
    ("Holdout" is mapped to 5) and one "<model_type>_prediction" column per model.
    """
    first_mod = mods[0]
    first_frame = get_train_preds(first_mod)
    first_frame.set_index("row_id", inplace=True)
    # The first model supplies the partition column next to its prediction.
    preds = first_frame[["partition_id", prediction_col]].rename(
        columns={prediction_col: f"{first_mod.model_type}_prediction"}
    )

    for other_mod in mods[1:]:
        other_frame = get_train_preds(other_mod)
        other_frame.set_index("row_id", inplace=True)
        # Align on row_id, then give the new prediction column a model-specific name.
        preds = preds.merge(other_frame[[prediction_col]], left_index=True, right_index=True)
        preds = preds.rename(columns={prediction_col: f"{other_mod.model_type}_prediction"})

    # Partition labels are converted to integers; the holdout partition becomes 5.
    partition = preds["partition_id"].replace("Holdout", "5.0")
    preds["partition_id"] = partition.astype(float).astype(int)
    return preds
def predict_deployment_expl(deployment, data):
    """
    Score data against a deployment and return predictions with explanations.

    deployment: deployment object; its default_prediction_server mapping supplies
        the "datarobot-key" and "url" entries, and its id selects the deployment
    data: request body, posted as-is and declared as UTF-8 encoded JSON

    Returns the decoded JSON body of the response.

    Raises requests.exceptions.HTTPError when the prediction server answers with
    an error status; the message includes the response body.

    Note: authentication reads the module-level `creds` mapping (keys "username"
    and "token"), which must be defined before this function is called.
    """
    # Get the DataRobot key, prediction server url, and deployment ID
    pred_server = deployment.default_prediction_server
    datarobot_key = pred_server["datarobot-key"]
    deployment_url = pred_server["url"]
    deployment_id = deployment.id
    # Set HTTP headers. The charset should match the contents of the file.
    headers = {"Content-Type": "application/json; charset=UTF-8", "datarobot-key": datarobot_key}
    # Prediction Explanations parameters
    params = {
        "maxCodes": 3,
        "thresholdHigh": 0.1,
        "thresholdLow": 0.01,
    }
    url = f"{deployment_url}/predApi/v1.0/deployments/{deployment_id}/predictionExplanations"
    # Make an API request for predictions
    predictions_response = requests.post(
        url, auth=(creds["username"], creds["token"]), data=data, headers=headers, params=params
    )
    # Fail loudly on an error status instead of handing an error payload
    # (or a non-JSON body) to callers that expect prediction rows.
    if not predictions_response.ok:
        raise requests.exceptions.HTTPError(
            f"Prediction request failed with status {predictions_response.status_code}: "
            f"{predictions_response.text}",
            response=predictions_response,
        )
    return predictions_response.json()
# The prediction API returns values in nested JSON objects. The function below flattens the results so you can use them in tabular format.
def flatten_json(y):
    """
    Flatten nested dicts and lists into a single-level dict.

    y: the nested structure to flatten

    Keys along the path to a value are joined with "_"; list items contribute
    their position. For example {"a": {"b": 1}, "c": [2, 3]} becomes
    {"a_b": 1, "c_0": 2, "c_1": 3}. Empty dicts and lists produce no entry, and
    a value that is neither a dict nor a list is stored under the key "".
    """
    out = {}

    def flatten(x, name=""):
        if isinstance(x, dict):
            for key, value in x.items():
                # f-string formatting also accepts non-string keys.
                flatten(value, f"{name}{key}_")
        elif isinstance(x, list):
            for position, item in enumerate(x):
                flatten(item, f"{name}{position}_")
        else:
            # Strip the trailing "_" that the enclosing level appended.
            out[name[:-1]] = x

    flatten(y)
    return out
Read the data¶
You can alternatively provide data with other methods.
# Location of the training dataset; pandas reads the CSV from this path or URL
data_path = "https://docs.datarobot.com/en/docs/api/guide/common-case/fraud-claim.csv"
df = pd.read_csv(data_path) # Add your dataset here "file.csv"
# Preview the first rows of the training data
df.head()
Create and run a project¶
# Create the supervised project from the training data, set the target, and initiate Autopilot
project_clf = dr.Project.create(sourcedata=df, project_name=project_name)
project_clf.set_target(target=target, mode=autopilot_mode, metric=metric, worker_count=-1)
# Block until Autopilot has finished
project_clf.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
# (get_model_scores sorts ascending by default, so the first row has the lowest score)
model_scores_clf = get_model_scores(project_clf)
model_clf = model_scores_clf["model"].values.tolist()[0]
# Request and get training predictions to obtain the partitioning scheme
model_clf_preds = get_train_preds(model_clf)
model_clf_preds.set_index("row_id", inplace=True)
# Keep only the partition assignment of each row
model_clf_preds = model_clf_preds[["partition_id"]].copy()
# Relabel the "Holdout" partition as 5 so the later projects can refer to it with cv_holdout_level=5
model_clf_preds["partition_id"] = np.where(
model_clf_preds["partition_id"] == "Holdout", 5, model_clf_preds["partition_id"]
)
print(model_clf_preds.shape)
(10000, 1)
Create and run an anomaly detection project¶
# Add partitioning to ensure it's the same between the projects
# NOTE(review): df_ad keeps every column of df, including the target column that the
# supervised project predicts, and the project built from it runs in unsupervised mode.
# The resulting anomaly scores are later added as features for predicting that same
# target - confirm whether the target should be dropped here to avoid leaking it.
df_ad = df.merge(model_clf_preds, left_index=True, right_index=True)
print(df_ad.shape)
(10000, 25)
# Create the anomaly detection project and initiate Autopilot in unsupervised mode (no target is passed)
project_ad = dr.Project.create(sourcedata=df_ad, project_name=f"{project_name}_ad")
# Use the partition_id column added above as the user-defined partitioning; 5 is passed as the holdout level
part_ad = dr.UserCV(user_partition_col="partition_id", cv_holdout_level=5)
project_ad.set_target(
mode=autopilot_mode, worker_count=-1, unsupervised_mode=True, partitioning_method=part_ad
)
# Block until Autopilot has finished
project_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top 3 performing models (sorted in descending order of the project metric here)
model_scores_ad = get_model_scores(project_ad, ascending=False)
models_ad = model_scores_ad["model"].values.tolist()[:3]
# Request and get training predictions
models_ad_preds = prep_ad_preds(models_ad)
print(models_ad_preds.shape)
(10000, 4)
Create and run a project with anomaly predictions as features¶
Use the code below to test anomaly detection predictions with the training dataset.
# Add anomaly predictions and partitioning
# The join is on the index: models_ad_preds is indexed by row_id
# (assumes row_id matches the row index of df - TODO confirm)
df_ad_preds = df.merge(models_ad_preds, left_index=True, right_index=True)
print(df_ad_preds.shape)
(10000, 28)
Now, you can create a project and re-run Autopilot with additional features that use the same partitioning. In the previous steps, you created projects with the exact same partitioning in order to compare results. Use the code below so that you can compare models trained with the new feature lists to the previous project.
# Create the supervised project that includes the anomaly predictions, set the target, and initiate Autopilot
project_clf_ad = dr.Project.create(sourcedata=df_ad_preds, project_name=f"{project_name}_clf_ad")
# Use the partition_id column as the user-defined partitioning, with 5 passed as the holdout level
part_clf_ad = dr.UserCV(user_partition_col="partition_id", cv_holdout_level=5)
project_clf_ad.set_target(
target=target,
mode=autopilot_mode,
metric=metric,
worker_count=-1,
partitioning_method=part_clf_ad,
)
# Block until Autopilot has finished
project_clf_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
# (get_model_scores sorts ascending by default, so the first row has the lowest score)
model_scores_clf_ad = get_model_scores(project_clf_ad)
model_clf_ad = model_scores_clf_ad["model"].values.tolist()[0]
Compare model performance¶
Use the snippets below to check whether including the anomaly score as a feature will improve supervised model results.
# View the results of the top performing model from each supervised run of Autopilot.
# Both models report cross-validation scores; AUC and LogLoss are read from each model's metrics mapping.
print("AUC supervised :", model_clf.metrics["AUC"]["crossValidation"])
print("AUC supervised with anomaly features:", model_clf_ad.metrics["AUC"]["crossValidation"])
print("LogLoss supervised :", model_clf.metrics["LogLoss"]["crossValidation"])
print(
"LogLoss supervised with anomaly features:", model_clf_ad.metrics["LogLoss"]["crossValidation"]
)
AUC supervised : 0.8860379999999999 AUC supervised with anomaly features: 0.9677899999999999 LogLoss supervised : 0.256484 LogLoss supervised with anomaly features: 0.15887199999999999
# Fetch the recommended model of the first supervised project (project_clf); this is the model deployed below
model_clf_depl = dr.ModelRecommendation.get(project_clf.id).get_model()
Deploy a model and make predictions¶
After selecting the model you want to use in production, you can deploy it to a production environment and make predictions using the helper functions defined earlier in this notebook.
Deploy a model¶
# Use the first prediction server in the returned list as the deployment's default server
pred_serv_id = dr.PredictionServer.list()[0].id
# Create a deployment from the selected model
deployment = dr.Deployment.create_from_learning_model(
model_id=model_clf_depl.id,
label=f"{project_name}_clf_depl",
default_prediction_server_id=pred_serv_id,
)
# To compute Feature Impact:
feature_impacts = model_clf_depl.get_or_request_feature_impact()
# To initialize Prediction Explanations:
pei_job = dr.PredictionExplanationsInitialization.create(project_clf.id, model_clf_depl.id)
# Wait for the initialization job to finish
pei_job.wait_for_completion()
Make predictions with a deployment¶
Once deployed, prepare a dataset to use for scoring. You can download a sample scoring dataset here.
# Location of the scoring dataset; pandas reads the CSV from this URL
scoring_path = "https://docs.datarobot.com/en/docs/api/guide/common-case/fraud-scoring.csv"
df_scoring = pd.read_csv(scoring_path)
# Serialize the rows as a JSON array of records, the body sent to the deployment
data_to_pred = json.dumps(df_scoring.to_dict(orient="records"))
print(df_scoring.shape)
# Get and process predictions with explanations
# The timestamps printed before and after show how long scoring and flattening took
print(str(datetime.now()))
preds_raw = predict_deployment_expl(deployment, data_to_pred)
# Each entry of the response's "data" list is flattened into one row
preds_lst = [flatten_json(row) for row in preds_raw["data"]]
df_preds = pd.DataFrame(preds_lst)
print(str(datetime.now()))
# Map the flattened response keys (path segments joined with "_", list positions as numbers)
# to readable column names for the prediction and the first three explanations
cols_to_rename = {
"predictionValues_0_value": "Prediction",
"predictionExplanations_0_feature": "Primary Feature",
"predictionExplanations_0_featureValue": "Primary Feature Value",
"predictionExplanations_0_qualitativeStrength": "Primary Feature Strength",
"predictionExplanations_1_feature": "Secondary Feature",
"predictionExplanations_1_featureValue": "Secondary Feature Value",
"predictionExplanations_1_qualitativeStrength": "Secondary Feature Strength",
"predictionExplanations_2_feature": "Tertiary Feature",
"predictionExplanations_2_featureValue": "Tertiary Feature Value",
"predictionExplanations_2_qualitativeStrength": "Tertiary Feature Strength",
}
df_preds.rename(columns=cols_to_rename, inplace=True)
print(df_scoring.shape, df_preds.shape)
# Columns of the prediction frame that are attached to the scoring data
cols_to_add = [
"rowId",
"Prediction",
"Primary Feature Value",
"Primary Feature Strength",
"Primary Feature",
"Secondary Feature Value",
"Secondary Feature Strength",
"Secondary Feature",
"Tertiary Feature Value",
"Tertiary Feature Strength",
"Tertiary Feature",
]
# NOTE(review): the merge key "rowId" has to exist in both frames - confirm the scoring CSV
# provides a rowId column that matches the rowId values returned by the prediction API
df_scoring = df_scoring.merge(df_preds[cols_to_add], on="rowId")
# Bucket the prediction value: "High" from 0.2, "Medium" from 0.1, otherwise "Low"
df_scoring["Prediction Category"] = df_scoring.Prediction.apply(
lambda x: "High" if x >= 0.2 else "Medium" if x >= 0.1 else "Low"
)
print(df_scoring.shape)
Use the code below to save prediction results.
# Write the scored rows to "fraud-scoring.csv" in the working directory, without the index column
df_scoring.to_csv("fraud-scoring.csv", index=False)