source = DataRobotSource(
    deployment_id=DEPLOYMENT_ID,
    start=datetime.utcnow() - timedelta(hours=3),
    end=datetime.utcnow(),
)
metric = LogLossFromSklearn()
me = MetricEvaluator(metric=metric, source=source, time_bucket=TimeBucket.HOUR)

aggregated_metric_per_time_bucket = me.score()
print(aggregated_metric_per_time_bucket.to_string())
                          timestamp  samples  log_loss
0  2023-09-14 13:29:48.065000+00:00      499  0.539315
1  2023-09-14 14:01:51.484000+00:00      499  0.539397

# we can see the evaluator's statistics
stats = me.stats()
print(stats)
total rows: 998, score calls: 2, reduce calls: 2
test_df = gen_dataframe_for_accuracy_metric(
    nr_rows=10,
    rows_per_time_bucket=5,
    prediction_value=1,
    time_bucket=TimeBucket.HOUR,
)
test_df["actuals"].loc[2] = None
test_df["actuals"].loc[5] = None
print(test_df)
                    timestamp  predictions  actuals
0  01/06/2005 13:00:00.000000            1    0.999
1  01/06/2005 13:00:00.000000            1    0.999
2  01/06/2005 13:00:00.000000            1      NaN
3  01/06/2005 13:00:00.000000            1    0.999
4  01/06/2005 13:00:00.000000            1    0.999
5  01/06/2005 14:00:00.000000            1      NaN
6  01/06/2005 14:00:00.000000            1    0.999
7  01/06/2005 14:00:00.000000            1    0.999
8  01/06/2005 14:00:00.000000            1    0.999
9  01/06/2005 14:00:00.000000            1    0.999

source = DataFrameSource(df=test_df)
metric = MedianAbsoluteError()
me = MetricEvaluator(metric=metric, source=source, time_bucket=TimeBucket.HOUR)
aggregated_metric_per_time_bucket = me.score()
"ValueError: Could not apply metric median_absolute_error, make sure you are passing the right data (see the sklearn docs). The error message was: Input contains NaN."
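The ValueError comes from the underlying scikit-learn implementation, which rejects inputs containing NaN. A minimal sketch that reproduces the same failure with scikit-learn directly, outside of the evaluator:

import numpy as np
from sklearn.metrics import median_absolute_error

actuals = np.array([0.999, np.nan])
predictions = np.array([1.0, 1.0])

# raises ValueError: Input contains NaN
median_absolute_error(actuals, predictions)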
Compare the previous result with the result produced when the filter_actuals flag is enabled.
me = MetricEvaluator(
    metric=metric, source=source, time_bucket=TimeBucket.HOUR, filter_actuals=True
)
aggregated_metric_per_time_bucket = me.score()
"removed 1 rows out of 5 in the data chunk before scoring, due to missing values in ['actuals'] data"
"removed 1 rows out of 5 in the data chunk before scoring, due to missing values in ['actuals'] data"

print(aggregated_metric_per_time_bucket.to_string())
                    timestamp  samples  median_absolute_error
0  01/06/2005 13:00:00.000000        4                  0.001
1  01/06/2005 14:00:00.000000        4                  0.001
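The filter_actuals flag drops rows with missing actuals from each data chunk before scoring. If you prefer to clean the data yourself, a roughly equivalent approach is to drop those rows before constructing the source; a minimal sketch reusing the test_df and metric from the example above, with plain pandas:

# drop rows with missing actuals up front, then score without filter_actuals
clean_df = test_df.dropna(subset=["actuals"]).reset_index(drop=True)
source = DataFrameSource(df=clean_df)
me = MetricEvaluator(metric=metric, source=source, time_bucket=TimeBucket.HOUR)
aggregated_metric_per_time_bucket = me.score()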
test_df = gen_dataframe_for_accuracy_metric(
    nr_rows=4,
    rows_per_time_bucket=2,
    prediction_value=1,
    time_bucket=TimeBucket.HOUR,
)
test_df["actuals"].loc[0] = None
test_df["actuals"].loc[1] = None
print(test_df)
                    timestamp  predictions  actuals
0  01/06/2005 13:00:00.000000            1      NaN
1  01/06/2005 13:00:00.000000            1      NaN
2  01/06/2005 14:00:00.000000            1    0.999
3  01/06/2005 14:00:00.000000            1    0.999

source = DataFrameSource(df=test_df)
metric = MedianAbsoluteError()
me = MetricEvaluator(
    metric=metric, source=source, time_bucket=TimeBucket.HOUR, filter_actuals=True
)
aggregated_metric_per_time_bucket = me.score()
"removed 2 rows out of 2 in the data chunk before scoring, due to missing values in ['actuals'] data"
"data chunk is empty, skipping scoring..."

print(aggregated_metric_per_time_bucket.to_string())
                    timestamp  samples  median_absolute_error
1  01/06/2005 14:00:00.000000        2                  0.001
metric = LogLossFromSklearn()
me = MetricEvaluator(
    metric=metric,
    source=source,
    time_bucket=TimeBucket.HOUR,
    segment_attribute="insulin",
    segment_value="Down",
)
aggregated_metric_per_time_bucket = me.score()
print(aggregated_metric_per_time_bucket.to_string())
                          timestamp  samples  log_loss [Down]
0  2023-09-14 13:29:49.737000+00:00       49         0.594483
1  2023-09-14 14:01:52.437000+00:00       49         0.594483

# passing more than one segment value
me = MetricEvaluator(
    metric=metric,
    source=source,
    time_bucket=TimeBucket.HOUR,
    segment_attribute="insulin",
    segment_value=["Down", "Steady"],
)
aggregated_metric_per_time_bucket = me.score()
print(aggregated_metric_per_time_bucket.to_string())
                          timestamp  samples  log_loss [Down]  log_loss [Steady]
0  2023-09-14 13:29:48.502000+00:00      199         0.594483           0.515811
1  2023-09-14 14:01:51.758000+00:00      199         0.594483           0.515811

# passing more than one segment value and more than one metric
me = MetricEvaluator(
    metric=[LogLossFromSklearn(), RocAuc()],
    source=source,
    time_bucket=TimeBucket.HOUR,
    segment_attribute="insulin",
    segment_value=["Down", "Steady"],
)
aggregated_metric_per_time_bucket = me.score()
print(aggregated_metric_per_time_bucket.to_string())
                          timestamp  samples  log_loss [Down]  log_loss [Steady]  roc_auc_score [Down]  roc_auc_score [Steady]
0  2023-09-14 13:29:48.502000+00:00      199         0.594483           0.515811              0.783333                0.826632
1  2023-09-14 14:01:51.758000+00:00      199         0.594483           0.515811              0.783333                0.826632
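Conceptually, each segmented column restricts the data chunk to rows where the segment attribute equals the requested segment value and applies the metric to that subset. A minimal sketch of the same computation with pandas and scikit-learn, assuming a DataFrame df with "actuals", "predictions", and "insulin" columns like the exported scoring data (the segmented_log_loss helper is illustrative, not part of dmm):

import pandas as pd
from sklearn.metrics import log_loss


def segmented_log_loss(df: pd.DataFrame, segment_attribute: str, segment_values: list) -> dict:
    """Compute log loss separately for each requested segment value."""
    results = {}
    for value in segment_values:
        segment = df[df[segment_attribute] == value]
        results[f"log_loss [{value}]"] = log_loss(
            segment["actuals"], segment["predictions"], labels=[0, 1]
        )
    return results


# e.g. segmented_log_loss(df, "insulin", ["Down", "Steady"])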
from itertools import zip_longest
from typing import List
from datetime import datetime
from datetime import timedelta

from dmm import CustomMetric
from dmm import DataRobotSource
from dmm import SingleMetricResult
from dmm.individual_metric_evaluator import IndividualMetricEvaluator
from dmm.metric import LLMMetricBase
from nltk import sent_tokenize
import numpy as np
import pandas as pd

source = DataRobotSource(
    deployment_id=DEPLOYMENT_ID,
    start=datetime.utcnow() - timedelta(weeks=1),
    end=datetime.utcnow(),
)

custom_metric = CustomMetric.from_id()


class SentenceCount(LLMMetricBase):
    """
    Calculates the total number of sentences created while working with the LLM model.

    Returns the sum of the number of sentences from prompts and completions.
    """

    def __init__(self):
        super().__init__(
            name=custom_metric.name,
            description="Calculates the total number of sentences created while working with the LLM model.",
            need_training_data=False,
        )
        self.prompt_column = "promptColumn"

    def score(
        self,
        scoring_data: pd.DataFrame,
        predictions: np.ndarray,
        timestamps: np.ndarray,
        association_ids: np.ndarray,
        **kwargs,
    ) -> List[SingleMetricResult]:
        if self.prompt_column not in scoring_data.columns:
            raise ValueError(
                f"Prompt column {self.prompt_column} not found in the exported data, "
                f"modify 'PROMPT_COLUMN' runtime parameter"
            )
        prompts = scoring_data[self.prompt_column].to_numpy()
        sentence_count = []
        for prompt, completion, ts, a_id in zip_longest(
            prompts, predictions, timestamps, association_ids
        ):
            if not isinstance(prompt, str) or not isinstance(completion, str):
                continue
            value = len(sent_tokenize(prompt)) + len(sent_tokenize(completion))
            sentence_count.append(
                SingleMetricResult(value=value, timestamp=ts, association_id=a_id)
            )
        return sentence_count


sentence_count_evaluator = IndividualMetricEvaluator(
    metric=SentenceCount(),
    source=source,
)
metric_results = sentence_count_evaluator.score()
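The individual metric evaluator returns the SingleMetricResult entries themselves rather than one aggregated value per time bucket. A minimal sketch of one way to inspect them locally, under the assumption that each result exposes the value, timestamp, and association_id fields it was constructed with:

import pandas as pd

# assumes SingleMetricResult exposes the fields passed to its constructor
results_df = pd.DataFrame(
    {
        "timestamp": [r.timestamp for r in metric_results],
        "association_id": [r.association_id for r in metric_results],
        "sentence_count": [r.value for r in metric_results],
    }
)
print(results_df.head())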