Predict fraudulent medical claims¶
This notebook outlines a use case for identifying fraudulent medical claims using version 3.0 of the DataRobot Python package.
You can download a sample dataset for this workflow here.
Requirements¶
- Python version 3.7+.
- DataRobot API version 2.27.2.
For additional information, reference the Python package documentation.
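If the client is not yet installed in your environment, the following minimal sketch installs and verifies it (the version pin is an assumption based on the requirements above).
# Install the DataRobot client if needed (the version pin is an assumption)
# pip install "datarobot>=3.0"
import datarobot
print(datarobot.__version__)  # confirm the installed client version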
from datetime import datetime
import json
import datarobot as dr
import numpy as np
import pandas as pd
import requests
import yaml
Connect to DataRobot¶
# If the config file is not in the default location described in the API Quickstart guide, '~/.config/datarobot/drconfig.yaml', then you will need to call
# dr.Client(config_path='path-to-drconfig.yaml')
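Alternatively, a minimal sketch of connecting by passing an endpoint and API token directly (both values below are placeholders for your own installation and credentials):
# Connect by passing the endpoint and API token explicitly (placeholder values)
dr.Client(endpoint="https://app.datarobot.com/api/v2", token="<your-API-token>")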
Define variables¶
Use the code below to define variables used throughout this notebook, including those needed to create projects and, later, to make API calls against a deployment.
You must also provide a YAML credentials file containing the following keys; it is loaded into the creds dictionary used when making predictions against a deployment (see the sketch below):
- token: your DataRobot API key
- username: your DataRobot username
You can read more about the different options for connecting to DataRobot from the client.
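As a minimal sketch, the credentials file might be loaded as follows (the file name credentials.yaml is an assumption; adjust it to your own path):
# Load prediction credentials from a YAML file (the file name is an assumption)
with open("credentials.yaml") as f:
    creds = yaml.safe_load(f)  # expects 'token' and 'username' keys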
project_name = "Medical_Insurance_Fraud"
target = "iFraud"
metric = "LogLoss"
autopilot_mode = dr.enums.AUTOPILOT_MODE.QUICK
Define functions¶
def get_model_score(mod, metric):
res = {}
res["model_number"] = mod.model_number
res["model_type"] = mod.model_type
res["model"] = mod
res["sample_pct"] = mod.sample_pct
res["metric_v"] = mod.metrics.get(metric, {}).get("validation")
res["metric_cv"] = mod.metrics.get(metric, {}).get("crossValidation")
return res
def get_model_scores(
proj,
metric=None,
ascending=True,
search_params={"sample_pct__gt": 63.0, "sample_pct__lt": 65.0},
):
"""
    Iterate through the project's models and collect their performance metric.
"""
if metric is None:
metric = proj.metric
models = proj.get_models(search_params=search_params)
model_scores = [get_model_score(m, metric) for m in models if m.model_category != "blend"]
df = pd.DataFrame(model_scores)
df = df.sort_values(["metric_cv", "metric_v"], ascending=ascending, na_position="last")
df = df.reset_index(drop=True)
return df
def get_train_preds(mod):
"""
request and/or retrieve training predictions for a given model
"""
try:
        # Request training predictions and wait for the job to complete
pred_job = mod.request_training_predictions(dr.enums.DATA_SUBSET.ALL)
preds = pred_job.get_result_when_complete().get_all_as_dataframe()
return preds
    except Exception:
# Retrieve training predictions if they were already requested
train_preds = dr.TrainingPredictions.list(mod.project_id)
for train_pred in train_preds:
if train_pred.model_id == mod.id and train_pred.data_subset == "all":
preds = dr.TrainingPredictions.get(
mod.project_id, train_pred.prediction_id
).get_all_as_dataframe()
return preds
def prep_ad_preds(mods, prediction_col="prediction"):
"""
Preprocess training predictions from anomaly detection models
mods: a list of anomaly detection models
"""
preds = get_train_preds(mods[0])
preds.set_index("row_id", inplace=True)
preds = preds[["partition_id", prediction_col]].copy()
preds.rename(columns={prediction_col: f"{mods[0].model_type}_prediction"}, inplace=True)
for mod in mods[1:]:
preds_tmp = get_train_preds(mod)
preds_tmp.set_index("row_id", inplace=True)
preds = preds.merge(preds_tmp[[prediction_col]], left_index=True, right_index=True)
preds.rename(columns={prediction_col: f"{mod.model_type}_prediction"}, inplace=True)
preds["partition_id"] = preds.partition_id.replace("Holdout", "5.0").astype(float).astype(int)
return preds
def predict_deployment_expl(deployment, data):
# Get the DataRobot key, prediction server url, and deployment ID
pred_server = deployment.default_prediction_server
datarobot_key = pred_server["datarobot-key"]
deployment_url = pred_server["url"]
deployment_id = deployment.id
    # Set HTTP headers. The charset should match the encoding of the request payload.
headers = {"Content-Type": "application/json; charset=UTF-8", "datarobot-key": datarobot_key}
    # Prediction Explanations parameters: return up to 3 explanations per row,
    # computed only for predictions above thresholdHigh or below thresholdLow
params = {
"maxCodes": 3,
"thresholdHigh": 0.1,
"thresholdLow": 0.01,
}
url = f"{deployment_url}/predApi/v1.0/deployments/{deployment_id}/predictionExplanations"
# Make an API request for predictions
predictions_response = requests.post(
url, auth=(creds["username"], creds["token"]), data=data, headers=headers, params=params
)
return predictions_response.json()
# The prediction API returns values in nested JSON objects. The function below flattens the results so you can use them in tabular format.
def flatten_json(y):
out = {}
def flatten(x, name=""):
if type(x) is dict:
for a in x:
flatten(x[a], name + a + "_")
elif type(x) is list:
i = 0
for a in x:
flatten(a, name + str(i) + "_")
i += 1
else:
out[name[:-1]] = x
flatten(y)
return out
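For example, a nested prediction record flattens into the column names used later when renaming the prediction results (the sample record and values below are illustrative only).
# Illustrative example: flatten a nested prediction record (values are made up)
sample = {"rowId": 0, "predictionValues": [{"value": 0.12, "label": 1}]}
print(flatten_json(sample))
# {'rowId': 0, 'predictionValues_0_value': 0.12, 'predictionValues_0_label': 1}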
Read the data¶
The code below reads the sample dataset from a URL; alternatively, you can provide data from a local file or any other source that pandas can read.
data_path = "https://docs.datarobot.com/en/docs/api/guide/common-case/fraud-claim.csv"
df = pd.read_csv(data_path)  # Alternatively, provide a path to your own dataset, e.g. "file.csv"
df.head()
Create and run a project¶
# Set the target and initiate Autopilot
project_clf = dr.Project.create(sourcedata=df, project_name=project_name)
project_clf.analyze_and_model(target=target, mode=autopilot_mode, metric=metric, worker_count=-1)
project_clf.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
model_scores_clf = get_model_scores(project_clf)
model_clf = model_scores_clf["model"].values.tolist()[0]
# Request and get training predictions to obtain the partitioning scheme
model_clf_preds = get_train_preds(model_clf)
model_clf_preds.set_index("row_id", inplace=True)
model_clf_preds = model_clf_preds[["partition_id"]].copy()
model_clf_preds["partition_id"] = np.where(
model_clf_preds["partition_id"] == "Holdout", 5, model_clf_preds["partition_id"]
)
print(model_clf_preds.shape)
(10000, 1)
Create and run an anomaly detection project¶
# Add partitioning to ensure it's the same between the projects
df_ad = df.merge(model_clf_preds, left_index=True, right_index=True)
print(df_ad.shape)
(10000, 25)
# Initiate Autopilot in unsupervised (anomaly detection) mode; no target is set
project_ad = dr.Project.create(sourcedata=df_ad, project_name=f"{project_name}_ad")
part_ad = dr.UserCV(user_partition_col="partition_id", cv_holdout_level=5)
project_ad.analyze_and_model(
mode=autopilot_mode, worker_count=-1, unsupervised_mode=True, partitioning_method=part_ad
)
project_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top 3 performing models
model_scores_ad = get_model_scores(project_ad, ascending=False)
models_ad = model_scores_ad["model"].values.tolist()[:3]
# Request and get training predictions
models_ad_preds = prep_ad_preds(models_ad)
print(models_ad_preds.shape)
(10000, 4)
Create and run a project with anomaly predictions as features¶
Use the code below to add the anomaly detection predictions to the training dataset as new features.
# Add anomaly predictions and partitioning
df_ad_preds = df.merge(models_ad_preds, left_index=True, right_index=True)
print(df_ad_preds.shape)
(10000, 28)
Now, create a new project and re-run Autopilot with the anomaly predictions included as additional features, keeping the same partitioning. Because every project in this notebook uses exactly the same partitioning, the models trained on the extended feature list can be compared directly with the previous project's models.
# Set the target and initiate Autopilot
project_clf_ad = dr.Project.create(sourcedata=df_ad_preds, project_name=f"{project_name}_clf_ad")
part_clf_ad = dr.UserCV(user_partition_col="partition_id", cv_holdout_level=5)
project_clf_ad.analyze_and_model(
target=target,
mode=autopilot_mode,
metric=metric,
worker_count=-1,
partitioning_method=part_clf_ad,
)
project_clf_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
model_scores_clf_ad = get_model_scores(project_clf_ad)
model_clf_ad = model_scores_clf_ad["model"].values.tolist()[0]
Compare model performance¶
Use the snippets below to check whether including the anomaly score as a feature will improve supervised model results.
# View results of the top-performing models across each run of Autopilot
print("AUC supervised :", model_clf.metrics["AUC"]["crossValidation"])
print("AUC supervised with anomaly features:", model_clf_ad.metrics["AUC"]["crossValidation"])
print("LogLoss supervised :", model_clf.metrics["LogLoss"]["crossValidation"])
print(
"LogLoss supervised with anomaly features:", model_clf_ad.metrics["LogLoss"]["crossValidation"]
)
AUC supervised : 0.8860379999999999
AUC supervised with anomaly features: 0.9677899999999999
LogLoss supervised : 0.256484
LogLoss supervised with anomaly features: 0.15887199999999999
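# Retrieve the model recommended for deployment from the supervised project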
model_clf_depl = dr.ModelRecommendation.get(project_clf.id).get_model()
Deploy a model and make predictions¶
After selecting the model you want to use in production, you can deploy it to a prediction server and make predictions using the functions defined earlier in this notebook.
Deploy a model¶
pred_serv_id = dr.PredictionServer.list()[0].id
deployment = dr.Deployment.create_from_learning_model(
model_id=model_clf_depl.id,
label=f"{project_name}_clf_depl",
default_prediction_server_id=pred_serv_id,
)
# Compute Feature Impact (required before Prediction Explanations can be initialized)
feature_impacts = model_clf_depl.get_or_request_feature_impact()
# Initialize Prediction Explanations so the deployment can return explanations with each prediction
pei_job = dr.PredictionExplanationsInitialization.create(project_clf.id, model_clf_depl.id)
pei_job.wait_for_completion()
Make predictions with a deployment¶
Once deployed, prepare a dataset to use for scoring. You can download a sample scoring dataset here.
scoring_path = "https://docs.datarobot.com/en/docs/api/guide/common-case/fraud-scoring.csv"
df_scoring = pd.read_csv(scoring_path)
data_to_pred = json.dumps(df_scoring.to_dict(orient="records"))
print(df_scoring.shape)
# Get and process predictions with explanations
print(str(datetime.now()))
preds_raw = predict_deployment_expl(deployment, data_to_pred)
preds_lst = [flatten_json(row) for row in preds_raw["data"]]
df_preds = pd.DataFrame(preds_lst)
print(str(datetime.now()))
cols_to_rename = {
"predictionValues_0_value": "Prediction",
"predictionExplanations_0_feature": "Primary Feature",
"predictionExplanations_0_featureValue": "Primary Feature Value",
"predictionExplanations_0_qualitativeStrength": "Primary Feature Strength",
"predictionExplanations_1_feature": "Secondary Feature",
"predictionExplanations_1_featureValue": "Secondary Feature Value",
"predictionExplanations_1_qualitativeStrength": "Secondary Feature Strength",
"predictionExplanations_2_feature": "Tertiary Feature",
"predictionExplanations_2_featureValue": "Tertiary Feature Value",
"predictionExplanations_2_qualitativeStrength": "Tertiary Feature Strength",
}
df_preds.rename(columns=cols_to_rename, inplace=True)
print(df_scoring.shape, df_preds.shape)
cols_to_add = [
"rowId",
"Prediction",
"Primary Feature Value",
"Primary Feature Strength",
"Primary Feature",
"Secondary Feature Value",
"Secondary Feature Strength",
"Secondary Feature",
"Tertiary Feature Value",
"Tertiary Feature Strength",
"Tertiary Feature",
]
df_scoring = df_scoring.merge(df_preds[cols_to_add], on="rowId")
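# Bucket predictions into High/Medium/Low fraud-risk categories (the 0.2 and 0.1 thresholds are example values)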
df_scoring["Prediction Category"] = df_scoring.Prediction.apply(
lambda x: "High" if x >= 0.2 else "Medium" if x >= 0.1 else "Low"
)
print(df_scoring.shape)
Use the code below to save prediction results.
df_scoring.to_csv("fraud-scoring.csv", index=False)