From 02a20ef573a573a022541a1f4cf39af7155cb788 Mon Sep 17 00:00:00 2001
From: Charles Teague
Date: Thu, 31 Oct 2024 11:11:37 -0400
Subject: [PATCH] Better support string values for scores
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a user produces a score whose value is a string, that value is
coerced to a float when it is ‘reduced’ using the default mean reducer.
For strings, this means that by the time the Score arrives at a custom
metric, it carries a reduced value that has been coerced to a float.

This fix is minimal: it adds support for string values to the mean
reducer, returning the most common string value (or the first string
value if none is most common).

Fixes #775
---
 CHANGELOG.md                              |  1 +
 src/inspect_ai/_view/www/.gitignore       |  3 +-
 src/inspect_ai/scorer/_reducer/reducer.py |  2 ++
 tests/scorer/test_metric.py               | 44 +++++++++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c5cac426..1471502c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
 - Use user_data_dir rather than user_runtime_dir for view notifications.
 - Implement `read_eval_log_sample()` for JSON log files.
 - Log the list of dataset sample IDs.
+- Fix an issue which coerced all values passed to a custom metric to a float (https://github.com/UKGovernmentBEIS/inspect_ai/issues/775)
 
 ## v0.3.42 (23 October 2024)
 
diff --git a/src/inspect_ai/_view/www/.gitignore b/src/inspect_ai/_view/www/.gitignore
index 42679e49b..dace292b2 100644
--- a/src/inspect_ai/_view/www/.gitignore
+++ b/src/inspect_ai/_view/www/.gitignore
@@ -1,4 +1,5 @@
 node_modules/
 .env
 __pycache__/
-dist/assets/*.js.map
\ No newline at end of file
+dist/assets/*.js.map
+logs/
diff --git a/src/inspect_ai/scorer/_reducer/reducer.py b/src/inspect_ai/scorer/_reducer/reducer.py
index 430b1ba78..65df267bb 100644
--- a/src/inspect_ai/scorer/_reducer/reducer.py
+++ b/src/inspect_ai/scorer/_reducer/reducer.py
@@ -46,6 +46,8 @@ def reduce(scores: list[Score]) -> Score:
             return _compute_dict_stat(scores, value_to_float, statistics.mean)
         elif isinstance(scores[0].value, list):
             return _compute_list_stat(scores, value_to_float, statistics.mean)
+        elif isinstance(scores[0].value, str):
+            return mode_score()(scores)
         else:
             return _compute_scalar_stat(scores, value_to_float, statistics.mean)
 
diff --git a/tests/scorer/test_metric.py b/tests/scorer/test_metric.py
index 0272ca61e..aa1614446 100644
--- a/tests/scorer/test_metric.py
+++ b/tests/scorer/test_metric.py
@@ -211,6 +211,50 @@ def check_log(log):
     check_log(log)
 
 
+@metric
+def is_string() -> Metric:
+    """Metric that reports the fraction of scores whose value is a string."""
+
+    def metric(scores: list[Score]) -> float:
+        string_count = 0
+        for s in scores:
+            if isinstance(s.value, str):
+                string_count += 1
+        return string_count / len(scores)
+
+    return metric
+
+
+@scorer(metrics=[is_string()])
+def string_scorer() -> Scorer:
+    async def score(state: TaskState, target: Target) -> Score:
+        return Score(value="e")
+
+    return score
+
+
+def test_string_score_metric() -> None:
+    def check_log(log):
+        assert (
+            log.results
+            and (list(log.results.scores[0].metrics.keys()) == ["is_string"])
+            and (log.results.scores[0].metrics["is_string"].value == 1.0)
+        )
+
+    task = Task(
+        dataset=[
+            Sample(
+                input="What is the fifth letter of the English alphabet?",
+                target=["e", "E"],
+            )
+        ],
+        scorer=string_scorer(),
+    )
+
+    # normal eval
+    log = eval(tasks=task, model="mockllm/model")[0]
+    check_log(log)
+
+
 def registry_assert(metric: Metric | Callable[..., Metric], name: str) -> None:
     info = registry_info(metric)
     assert info.name == name
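
For readers unfamiliar with mode-style reduction, the sketch below
illustrates the behavior described in the commit message on plain
strings. It is a simplified stand-in, not the real mode_score() reducer
(which operates on Score objects and whose tie-breaking internals are an
assumption here): Counter.most_common orders equal counts by first
encounter, which yields the "first string value if none is most common"
fallback.

    # Minimal sketch of mode-style reduction for string values; the
    # actual mode_score() reducer in inspect_ai may differ in detail.
    from collections import Counter

    def reduce_strings(values: list[str]) -> str:
        # Counter orders equal counts by first encounter (Python 3.7+),
        # so a tie falls back to the first string value seen.
        return Counter(values).most_common(1)[0][0]

    assert reduce_strings(["e", "E", "e"]) == "e"  # unique mode wins
    assert reduce_strings(["a", "b"]) == "a"       # tie -> first seen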