From 02a20ef573a573a022541a1f4cf39af7155cb788 Mon Sep 17 00:00:00 2001
From: Charles Teague
Date: Thu, 31 Oct 2024 11:11:37 -0400
Subject: [PATCH] Better support string values for scores
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a user produces a score whose value is a string, that value is
coerced to a float when it is ‘reduced’ using the default mean reducer.
For strings, this means that by the time the Score arrives at a custom
metric, it carries a reduced value that has been coerced to a float.

This fix is minimal: it adds support for string values to the mean
reducer, returning the most common string value (or the first string
value if none is most common).

Fixes #775
---
 CHANGELOG.md                              |  1 +
 src/inspect_ai/_view/www/.gitignore       |  3 +-
 src/inspect_ai/scorer/_reducer/reducer.py |  2 ++
 tests/scorer/test_metric.py               | 44 +++++++++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5c5cac426..1471502c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,7 @@
 - Use user_data_dir rather than user_runtime_dir for view notifications.
 - Implement `read_eval_log_sample()` for JSON log files.
 - Log the list of dataset sample IDs.
+- Fix an issue which coerced all values passed to a custom metric to a float (https://github.com/UKGovernmentBEIS/inspect_ai/issues/775)
 
 ## v0.3.42 (23 October 2024)
 
diff --git a/src/inspect_ai/_view/www/.gitignore b/src/inspect_ai/_view/www/.gitignore
index 42679e49b..dace292b2 100644
--- a/src/inspect_ai/_view/www/.gitignore
+++ b/src/inspect_ai/_view/www/.gitignore
@@ -1,4 +1,5 @@
 node_modules/
 .env
 __pycache__/
-dist/assets/*.js.map
\ No newline at end of file
+dist/assets/*.js.map
+logs/
diff --git a/src/inspect_ai/scorer/_reducer/reducer.py b/src/inspect_ai/scorer/_reducer/reducer.py
index 430b1ba78..65df267bb 100644
--- a/src/inspect_ai/scorer/_reducer/reducer.py
+++ b/src/inspect_ai/scorer/_reducer/reducer.py
@@ -46,6 +46,8 @@ def reduce(scores: list[Score]) -> Score:
             return _compute_dict_stat(scores, value_to_float, statistics.mean)
         elif isinstance(scores[0].value, list):
             return _compute_list_stat(scores, value_to_float, statistics.mean)
+        elif isinstance(scores[0].value, str):
+            return mode_score()(scores)
         else:
             return _compute_scalar_stat(scores, value_to_float, statistics.mean)
 
diff --git a/tests/scorer/test_metric.py b/tests/scorer/test_metric.py
index 0272ca61e..aa1614446 100644
--- a/tests/scorer/test_metric.py
+++ b/tests/scorer/test_metric.py
@@ -211,6 +211,50 @@ def check_log(log):
     check_log(log)
 
 
+@metric
+def is_string() -> Metric:
+    """Metric that reports the fraction of scores whose value is a string."""
+
+    def metric(scores: list[Score]) -> float:
+        string_count = 0
+        for s in scores:
+            if isinstance(s.value, str):
+                string_count += 1
+        return string_count / len(scores)
+
+    return metric
+
+
+@scorer(metrics=[is_string()])
+def string_scorer() -> Scorer:
+    async def score(state: TaskState, target: Target) -> Score:
+        return Score(value="e")
+
+    return score
+
+
+def test_string_score_metric() -> None:
+    def check_log(log):
+        assert (
+            log.results
+            and (list(log.results.scores[0].metrics.keys()) == ["is_string"])
+            and (log.results.scores[0].metrics["is_string"].value == 1.0)
+        )
+
+    task = Task(
+        dataset=[
+            Sample(
+                input="What is the fifth letter of the English alphabet?",
+                target=["e", "E"],
+            )
+        ],
+        scorer=string_scorer(),
+    )
+
+    # normal eval
+    log = eval(tasks=task, model="mockllm/model")[0]
+    check_log(log)
+
+
 def registry_assert(metric: Metric | Callable[..., Metric], name: str) -> None:
     info = registry_info(metric)
     assert info.name == name
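
For readers unfamiliar with mode-style reduction, the sketch below
illustrates the behavior described in the commit message on plain
strings. It is a simplified stand-in, not the real mode_score() reducer
(which operates on Score objects and whose tie-breaking internals are an
assumption here): Counter.most_common orders equal counts by first
encounter, which yields the "first string value if none is most common"
fallback.

    # Minimal sketch of mode-style reduction for string values; the
    # actual mode_score() reducer in inspect_ai may differ in detail.
    from collections import Counter

    def reduce_strings(values: list[str]) -> str:
        # Counter orders equal counts by first encounter (Python 3.7+),
        # so a tie falls back to the first string value seen.
        return Counter(values).most_common(1)[0][0]

    assert reduce_strings(["e", "E", "e"]) == "e"  # unique mode wins
    assert reduce_strings(["a", "b"]) == "a"       # tie -> first seen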