UKGovernmentBEIS · jjallaire · Oct 31, 2024 · Oct 31, 2024 · Oct 31, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@
 - Use user_data_dir rather than user_runtime_dir for view notifications.
 - Implement `read_eval_log_sample()` for JSON log files.
 - Log the list of dataset sample IDs.
+- Fix an issue that forced all values passed to a custom metric to be converted to float values (https://github.com/UKGovernmentBEIS/inspect_ai/issues/775)
 
 ## v0.3.42 (23 October 2024)
 

diff --git a/src/inspect_ai/_view/www/.gitignore b/src/inspect_ai/_view/www/.gitignore
@@ -1,4 +1,5 @@
 node_modules/
 .env
 __pycache__/
-dist/assets/*.js.map
+dist/assets/*.js.map
+logs/
diff --git a/src/inspect_ai/scorer/_reducer/reducer.py b/src/inspect_ai/scorer/_reducer/reducer.py
@@ -46,6 +46,8 @@ def reduce(scores: list[Score]) -> Score:
             return _compute_dict_stat(scores, value_to_float, statistics.mean)
         elif isinstance(scores[0].value, list):
             return _compute_list_stat(scores, value_to_float, statistics.mean)
+        elif isinstance(scores[0].value, str):
+            return mode_score()(scores)
         else:
             return _compute_scalar_stat(scores, value_to_float, statistics.mean)
 

diff --git a/tests/scorer/test_metric.py b/tests/scorer/test_metric.py
@@ -211,6 +211,50 @@ def check_log(log):
     check_log(log)
 
 
+@metric
+def is_string() -> Metric:
+    """Metric giving the fraction of scores whose value is a string."""
+
+    def metric(scores: list[Score]) -> float:
+        string_count = 0
+        for s in scores:
+            if isinstance(s.value, str):
+                string_count = string_count + 1
+        return string_count / len(scores)  # ZeroDivisionError if scores is empty
+
+    return metric
+
+
+@scorer(metrics=[is_string()])
+def string_scorer() -> Scorer:
+    async def score(state: TaskState, target: Target) -> Score:
+        return Score(value="e")  # constant str value; state and target are unused
+
+    return score
+
+
+def test_string_score_metric() -> None:
+    def check_log(log):
+        assert (
+            log.results
+            and (list(log.results.scores[0].metrics.keys()) == ["is_string"])
+            and (log.results.scores[0].metrics["is_string"].value == 1.0)
+        )
+
+    task = Task(
+        dataset=[
+            Sample(
+                input="What is the fifth letter of the US alphabet?", target=["e", "E"]
+            )
+        ],
+        scorer=string_scorer(),
+    )
+
+    # is_string == 1.0 means every score value reached the metric as a str
+    log = eval(tasks=task, model="mockllm/model")[0]
+    check_log(log)
+
+
 def registry_assert(metric: Metric | Callable[..., Metric], name: str) -> None:
     info = registry_info(metric)
     assert info.name == name