Predict fraudulent medical claims

This notebook outlines a use case for identifying fraudulent medical claims using the DataRobot Python package.

You can download a sample dataset for this workflow here.

Requirements

  • Python version 3.8.12
  • DataRobot API version 2.27.2

For additional information, reference the Python package documentation.
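If the packages are not already installed, you can pin them from a notebook cell. This is a minimal sketch; the DataRobot version matches the requirement above, and the supporting libraries are assumed from the imports used in this notebook:

# Install the DataRobot client pinned to the API version above,
# plus the supporting libraries imported below
%pip install datarobot==2.27.2 pandas numpy pyyaml requests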

Setup

Import libraries

import pandas as pd
import numpy as np
import datarobot as dr
import yaml
import json
import requests

from datetime import datetime

Define variables

Use the code below to define variables used to make API calls to a deployment in later steps of this notebook.

You must provide a YAML credentials file that includes the required fields:

  • token: a DataRobot API key
  • username: your DataRobot login

You can read more about different options for connecting to DataRobot from the client.
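A minimal drconfig.yaml might look like the following sketch. The values are placeholders, and the endpoint key is an assumption shown here with the default DataRobot API endpoint:

# drconfig.yaml -- placeholder values only
token: '<your-api-key>'
username: '<your-datarobot-login>'
endpoint: 'https://app.datarobot.com/api/v2'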

with open('<file-path-to-drconfig.yaml>', 'r') as stream:
    creds = yaml.safe_load(stream)

project_name = 'Medical_Insurance_Fraud'
target = 'fraud'
metric = 'LogLoss'
autopilot_mode = dr.enums.AUTOPILOT_MODE.QUICK

Connect to DataRobot

# To connect to a Zepl notebook:
# dr.Client(token=z.getDatasource("datarobot_api")['token'], endpoint='https://app.datarobot.com/api/v2')

# To connect to a Jupyter notebook:
dr.Client(config_path='<file-path-to-drconfig.yaml>')

Define functions

def get_model_score(mod, metric):
    """
    Collect a model's details and its validation and cross-validation scores for the given metric
    """
    res = {}
    res['model_number'] = mod.model_number
    res['model_type'] = mod.model_type
    res['model'] = mod
    res['sample_pct'] = mod.sample_pct

    res['metric_v'] = mod.metrics.get(metric, {}).get('validation')
    res['metric_cv'] = mod.metrics.get(metric, {}).get('crossValidation')

    return res

def get_model_scores(proj, metric=None, ascending=True, 
                     search_params={'sample_pct__gt': 63.0, 'sample_pct__lt': 65.0}):
    """
    iterate trough the project models and get their performance metric
    """
    if metric is None:
        metric = proj.metric
    models = proj.get_models(search_params=search_params)
    model_scores = [get_model_score(m, metric) for m in models if m.model_category != 'blend']
    df = pd.DataFrame(model_scores)
    df = df.sort_values(['metric_cv', 'metric_v'], ascending=ascending, na_position='last')
    df = df.reset_index(drop=True)
    return df

def get_train_preds(mod):
    """
    request and/or retrieve training predictions for a given model
    """
    try:
        # Request training predictions and get job IDs
        pred_job = mod.request_training_predictions(dr.enums.DATA_SUBSET.ALL)
        preds = pred_job.get_result_when_complete().get_all_as_dataframe()
        return preds
    except dr.errors.ClientError:
        # Retrieve training predictions if they were already requested
        train_preds = dr.TrainingPredictions.list(mod.project_id)
        for train_pred in train_preds:
            if train_pred.model_id == mod.id and train_pred.data_subset == 'all':
                preds = dr.TrainingPredictions.get(mod.project_id, train_pred.prediction_id).get_all_as_dataframe()
                return preds

def prep_ad_preds(mods, prediction_col='prediction'):
    """
    Preprocess training predictions from anomaly detection models
    mods: a list of anomaly detection models
    """
    preds = get_train_preds(mods[0])
    preds.set_index('row_id', inplace=True)
    preds = preds[['partition_id', prediction_col]].copy()
    preds.rename(columns={prediction_col: f'{mods[0].model_type}_prediction'}, inplace=True)
    for mod in mods[1:]:
        preds_tmp = get_train_preds(mod)
        preds_tmp.set_index('row_id', inplace=True)
        preds = preds.merge(preds_tmp[[prediction_col]], left_index=True, right_index=True)
        preds.rename(columns={prediction_col: f'{mod.model_type}_prediction'}, inplace=True)

    # Encode the Holdout partition as fold 5 so partition_id is fully numeric
    preds['partition_id'] = preds.partition_id.replace('Holdout', '5.0').astype(float).astype(int)
    return preds

def predict_deployment_expl(deployment, data):
    """
    Score a dataset against a deployment and return predictions with Prediction Explanations.
    Uses the credentials loaded from drconfig.yaml above.
    """
    # Get the DataRobot key, prediction server URL, and deployment ID
    pred_server = deployment.default_prediction_server
    datarobot_key = pred_server['datarobot-key']
    deployment_url = pred_server['url']
    deployment_id = deployment.id

    # Set HTTP headers. The charset should match the contents of the file.
    headers = {'Content-Type': 'application/json; charset=UTF-8', 'datarobot-key': datarobot_key}

    # Prediction Explanations parameters
    params = {
            'maxCodes': 3,
            'thresholdHigh': 0.1,
            'thresholdLow': 0.01,
        }

    url = f'{deployment_url}/predApi/v1.0/deployments/{deployment_id}/predictionExplanations'

    # Make an API request for predictions
    predictions_response = requests.post(
        url,
        auth=(creds['username'], creds['token']),
        data=data,
        headers=headers,
        params=params
    )

    return predictions_response.json()

# The prediction API returns values in nested JSON objects. The function below flattens the results so you can use them in tabular format.
def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out
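For example, a single nested prediction row flattens into underscore-delimited keys. The input below is hypothetical, for illustration only:

# Hypothetical nested prediction row
row = {'rowId': 0, 'predictionValues': [{'value': 0.12, 'label': 1.0}]}
flatten_json(row)
# Returns: {'rowId': 0, 'predictionValues_0_value': 0.12, 'predictionValues_0_label': 1.0}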

Read the data

This example reads the training data from a local CSV file; you can alternatively provide data using other supported methods.

df = pd.read_csv('data/DR_Demo_Medical_Fraud.csv')
df_ad = df.drop(columns=['fraud'])
print(df.shape, df_ad.shape)

Create and run a project

# Set the target and initiate Autopilot
project_clf = dr.Project.create(sourcedata=df, project_name=project_name)

project_clf.set_target(target=target,
                       mode=autopilot_mode,
                       metric=metric,
                       worker_count=-1)

project_clf.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
model_scores_clf = get_model_scores(project_clf)
model_clf = model_scores_clf['model'].values.tolist()[0]
# Request and get training predictions to obtain the partitioning scheme
model_clf_preds = get_train_preds(model_clf)
model_clf_preds.set_index('row_id', inplace=True)
model_clf_preds = model_clf_preds[['partition_id']].copy()
model_clf_preds['partition_id'] = np.where(model_clf_preds['partition_id'] == 'Holdout', 5, model_clf_preds['partition_id'])
print(model_clf_preds.shape)

Create and run an anomaly detection project

# Add partitioning to ensure it's the same between the projects 
df_ad = df_ad.merge(model_clf_preds, left_index=True, right_index=True)
print(df_ad.shape)
# Set the target and initiate Autopilot
project_ad = dr.Project.create(sourcedata=df_ad, project_name=f'{project_name}_ad')

part_ad = dr.UserCV(user_partition_col='partition_id', cv_holdout_level=5)

project_ad.set_target(mode=autopilot_mode,
                      worker_count=-1,
                      unsupervised_mode=True,
                      partitioning_method=part_ad)
project_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top 3 performing models
model_scores_ad = get_model_scores(project_ad, ascending=False)
models_ad = model_scores_ad['model'].values.tolist()[:3]
# Request and get training predictions
models_ad_preds = prep_ad_preds(models_ad)
print(models_ad_preds.shape)

Create and run a project with anomaly predictions as features

Use the code below to test anomaly detection predictions with the training dataset.

# Add anomaly predictions and partitioning
df_ad_preds = df.merge(models_ad_preds, left_index=True, right_index=True)
print(df_ad_preds.shape)

Now you can create a new project and re-run Autopilot with the anomaly predictions included as additional features, using the same partitioning as before. Because the projects share an identical partitioning scheme, you can compare models trained with the new feature list directly against those from the previous project.

# Set the target and initiate Autopilot
project_clf_ad = dr.Project.create(sourcedata=df_ad_preds, project_name=f'{project_name}_clf_ad')

part_clf_ad = dr.UserCV(user_partition_col='partition_id',
                        cv_holdout_level=5)

project_clf_ad.set_target(target=target,
                          mode=autopilot_mode,
                          metric=metric,
                          worker_count=-1, 
                          partitioning_method=part_clf_ad)

project_clf_ad.wait_for_autopilot(verbosity=0)
# Get leaderboard and the top performing model
model_scores_clf_ad = get_model_scores(project_clf_ad)
model_clf_ad = model_scores_clf_ad['model'].values.tolist()[0]

Compare model performance

Use the snippets below to check whether including the anomaly scores as features improves the supervised model's results.

# View results of the top-performing models across each run of Autopilot
print('AUC supervised                      :', model_clf.metrics['AUC']['crossValidation'])
print('AUC supervised with anomaly features:', model_clf_ad.metrics['AUC']['crossValidation'])

print('LogLoss supervised                      :', model_clf.metrics['LogLoss']['crossValidation'])
print('LogLoss supervised with anomaly features:', model_clf_ad.metrics['LogLoss']['crossValidation'])
# Get the model that DataRobot recommends for deployment
model_clf_depl = dr.ModelRecommendation.get(project_clf.id).get_model()

Deploy a model and make predictions

After selecting the model you want to use in production, you can deploy it to a production environment and make predictions using the functions defined below.

Deploy a model

pred_serv_id = dr.PredictionServer.list()[0].id
deployment = dr.Deployment.create_from_learning_model(model_id=model_clf_depl.id, 
                                                      label=f'{project_name}_clf_depl',
                                                      default_prediction_server_id=pred_serv_id)
# To compute Feature Impact:
feature_impacts = model_clf_depl.get_or_request_feature_impact()

# To initialize Prediction Explanations:
pei_job = dr.PredictionExplanationsInitialization.create(project_clf.id, model_clf_depl.id)
pei_job.wait_for_completion()

Make predictions with a deployment

Once the model is deployed, prepare a dataset to use for scoring.

df_scoring = pd.read_csv('data/DR_Demo_Medical_Fraud_scoring.csv')
data_to_pred = json.dumps(df_scoring.to_dict(orient='records'))
print(df_scoring.shape)
# Get and process predictions with explanations
print(str(datetime.now()))
preds_raw = predict_deployment_expl(deployment, data_to_pred)

preds_lst = [flatten_json(row) for row in preds_raw['data']]
df_preds = pd.DataFrame(preds_lst)
print(str(datetime.now()))
cols_to_rename = {'predictionValues_0_value': 'Prediction', 
                  'predictionExplanations_0_feature': 'Primary Feature',
                  'predictionExplanations_0_featureValue': 'Primary Feature Value',
                  'predictionExplanations_0_qualitativeStrength':'Primary Feature Strength',

                  'predictionExplanations_1_feature': 'Secondary Feature',
                  'predictionExplanations_1_featureValue': 'Secondary Feature Value',
                  'predictionExplanations_1_qualitativeStrength':'Secondary Feature Strength',

                  'predictionExplanations_2_feature': 'Tertiary Feature',
                  'predictionExplanations_2_featureValue': 'Tertiary Feature Value',
                  'predictionExplanations_2_qualitativeStrength':'Tertiary Feature Strength'}

df_preds.rename(columns=cols_to_rename, inplace=True)
print(df_scoring.shape, df_preds.shape)
cols_to_add = ['rowId', 'Prediction', 
               'Primary Feature Value', 'Primary Feature Strength', 'Primary Feature', 
               'Secondary Feature Value', 'Secondary Feature Strength', 'Secondary Feature',
               'Tertiary Feature Value', 'Tertiary Feature Strength', 'Tertiary Feature',
              ]
df_scoring = df_scoring.merge(df_preds[cols_to_add], on='rowId')
df_scoring['Prediction Category'] = df_scoring.Prediction.apply(lambda x:
                                                                'High' if x >= 0.2 else
                                                                'Medium' if x >= 0.1 else 'Low')
print(df_scoring.shape)

Use the code below to save prediction results.

df_scoring.to_csv('data/DR_Demo_Medical_Fraud_scoring_predictions.csv', index=False)
