Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix-up event_fields in filters #15607

Merged
merged 15 commits into from
May 22, 2023
1 change: 1 addition & 0 deletions changelog.d/15607.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug where filters with multiple backslashes were rejected.
15 changes: 1 addition & 14 deletions synapse/api/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,20 +128,7 @@
"account_data": {"$ref": "#/definitions/filter"},
"room": {"$ref": "#/definitions/room_filter"},
"event_format": {"type": "string", "enum": ["client", "federation"]},
"event_fields": {
"type": "array",
"items": {
"type": "string",
# Don't allow '\\' in event field filters. This makes matching
# events a lot easier as we can then use a negative lookbehind
# assertion to split '\.' If we allowed \\ then it would
# incorrectly split '\\.' See synapse.events.utils.serialize_event
#
# Note that because this is a regular expression, we have to escape
# each backslash in the pattern.
"pattern": r"^((?!\\\\).)*$",
},
},
"event_fields": {"type": "array", "items": {"type": "string"}},
},
"additionalProperties": True, # Allow new fields for forward compatibility
}
Expand Down
72 changes: 58 additions & 14 deletions synapse/events/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
Iterable,
List,
Mapping,
Match,
MutableMapping,
Optional,
Union,
Expand All @@ -46,12 +47,10 @@
from synapse.handlers.relations import BundledAggregations


# Split strings on "." but not "\." This uses a negative lookbehind assertion for '\'
# (?<!stuff) matches if the current position in the string is not preceded
# by a match for 'stuff'.
# TODO: This is fast, but fails to handle "foo\\.bar" which should be treated as
# the literal fields "foo\" and "bar" but will instead be treated as "foo\\.bar"
SPLIT_FIELD_REGEX = re.compile(r"(?<!\\)\.")
# Split strings on "." but not "\." (or "\\\.").
SPLIT_FIELD_REGEX = re.compile(r"\\*\.")
# Find escaped characters, e.g. those with a \ in front of them.
ESCAPE_SEQUENCE_PATTERN = re.compile(r"\\(.)")

CANONICALJSON_MAX_INT = (2**53) - 1
CANONICALJSON_MIN_INT = -CANONICALJSON_MAX_INT
Expand Down Expand Up @@ -253,14 +252,65 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None:
sub_out_dict[key_to_move] = sub_dict[key_to_move]


def _escape_slash(m: Match[str]) -> str:
"""
Replacement function; replace a backslash-backslash or backslash-dot with the
second character. Leaves any other string alone.
"""
if m.group(1) in ("\\", "."):
return m.group(1)
return m.group(0)


def _split_field(field: str) -> List[str]:
"""
Splits strings on unescaped dots and removes escaping.

Args:
field: A string representing a path to a field.

Returns:
A list of nested fields to traverse.
"""

# Convert the field and remove escaping:
#
# 1. "content.body.thing\.with\.dots"
# 2. ["content", "body", "thing\.with\.dots"]
# 3. ["content", "body", "thing.with.dots"]

# Find all dots (and their preceding backslashes). If the dot is unescaped
# then emit a new field part.
result = []
prev_start = 0
for match in SPLIT_FIELD_REGEX.finditer(field):
# If the match is an *even* number of characters than the dot was escaped.
if len(match.group()) % 2 == 0:
continue

# Add a new part (up to the dot, exclusive) after escaping.
result.append(
ESCAPE_SEQUENCE_PATTERN.sub(
_escape_slash, field[prev_start : match.end() - 1]
)
)
prev_start = match.end()

# Add any part of the field after the last unescaped dot. (Note that if the
# character is a dot this correctly adds a blank string.)
result.append(re.sub(r"\\(.)", _escape_slash, field[prev_start:]))

return result


def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict:
"""Return a new dict with only the fields in 'dictionary' which are present
in 'fields'.

If there are no event fields specified then all fields are included.
The entries may include '.' characters to indicate sub-fields.
So ['content.body'] will include the 'body' field of the 'content' object.
A literal '.' character in a field name may be escaped using a '\'.
A literal '.' or '\' character in a field name may be escaped using a '\'.

Args:
dictionary: The dictionary to read from.
Expand All @@ -275,13 +325,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict:

# for each field, convert it:
# ["content.body.thing\.with\.dots"] => [["content", "body", "thing\.with\.dots"]]
split_fields = [SPLIT_FIELD_REGEX.split(f) for f in fields]

# for each element of the output array of arrays:
# remove escaping so we can use the right key names.
split_fields[:] = [
[f.replace(r"\.", r".") for f in field_array] for field_array in split_fields
]
split_fields = [_split_field(f) for f in fields]

output: JsonDict = {}
for field_array in split_fields:
Expand Down
6 changes: 0 additions & 6 deletions tests/api/test_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@ def test_errors_on_invalid_filters(self) -> None:
invalid_filters: List[JsonDict] = [
# `account_data` must be a dictionary
{"account_data": "Hello World"},
# `event_fields` entries must not contain backslashes
{"event_fields": [r"\\foo"]},
# `event_format` must be "client" or "federation"
{"event_format": "other"},
# `not_rooms` must contain valid room IDs
Expand Down Expand Up @@ -114,10 +112,6 @@ def test_valid_filters(self) -> None:
"event_format": "client",
"event_fields": ["type", "content", "sender"],
},
# a single backslash should be permitted (though it is debatable whether
# it should be permitted before anything other than `.`, and what that
# actually means)
#
# (note that event_fields is implemented in
# synapse.events.utils.serialize_event, and so whether this actually works
# is tested elsewhere. We just want to check that it is allowed through the
Expand Down
39 changes: 39 additions & 0 deletions tests/events/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@
from typing import Any, List, Mapping, Optional

import attr
from parameterized import parameterized

from synapse.api.constants import EventContentFields
from synapse.api.room_versions import RoomVersions
from synapse.events import EventBase, make_event_from_dict
from synapse.events.utils import (
PowerLevelsContent,
SerializeEventConfig,
_split_field,
copy_and_fixup_power_levels_contents,
maybe_upsert_event_field,
prune_event,
Expand Down Expand Up @@ -794,3 +796,40 @@ def test_invalid_types_raise_type_error(self) -> None:
def test_invalid_nesting_raises_type_error(self) -> None:
with self.assertRaises(TypeError):
copy_and_fixup_power_levels_contents({"a": {"b": {"c": 1}}}) # type: ignore[dict-item]


class SplitFieldTestCase(stdlib_unittest.TestCase):
@parameterized.expand(
[
# A field with no dots.
["m", ["m"]],
# Simple dotted fields.
["m.foo", ["m", "foo"]],
["m.foo.bar", ["m", "foo", "bar"]],
# Backslash is used as an escape character.
[r"m\.foo", ["m.foo"]],
[r"m\\.foo", ["m\\", "foo"]],
[r"m\\\.foo", [r"m\.foo"]],
[r"m\\\\.foo", ["m\\\\", "foo"]],
[r"m\foo", [r"m\foo"]],
[r"m\\foo", [r"m\foo"]],
[r"m\\\foo", [r"m\\foo"]],
[r"m\\\\foo", [r"m\\foo"]],
# Ensure that escapes at the end don't cause issues.
["m.foo\\", ["m", "foo\\"]],
["m.foo\\", ["m", "foo\\"]],
[r"m.foo\.", ["m", "foo."]],
[r"m.foo\\.", ["m", "foo\\", ""]],
[r"m.foo\\\.", ["m", r"foo\."]],
# Empty parts (corresponding to properties which are an empty string) are allowed.
[".m", ["", "m"]],
["..m", ["", "", "m"]],
["m.", ["m", ""]],
["m..", ["m", "", ""]],
["m..foo", ["m", "", "foo"]],
# Invalid escape sequences.
[r"\m", [r"\m"]],
]
)
def test_split_field(self, input: str, expected: str) -> None:
self.assertEqual(_split_field(input), expected)