fix(similarity): use get_primary_hash in backfill (#72022)
- previously we were getting a random hash associated with the group.
- now, we simply use `event.get_primary_hash()` (a minimal sketch of the change follows below).
- removes passing these hashes down through the various functions, as it's no longer needed.
- deletes a few tests that exercised edge cases where we weren't finding a group_id associated with the hash.
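
For context, here is a minimal sketch of the before/after hash lookup. It assumes the Sentry models referenced in this diff; the two helper functions and their names are illustrative, not code from the actual backfill task:

```python
from sentry.models.grouphash import GroupHash


def hash_for_group_before(project, group_id: int) -> str | None:
    # Old behavior: look up GroupHash rows for the group. When a group has more
    # than one hash, this effectively returns an arbitrary one, so the backfill
    # could send a hash that differs from the one the event actually grouped under.
    group_hash = GroupHash.objects.filter(
        project_id=project.id, group_id=group_id
    ).first()
    return group_hash.hash if group_hash else None


def hash_for_event_after(event) -> str | None:
    # New behavior: use the primary hash computed for the event itself, and let
    # callers treat events without one as invalid instead of falling back to a
    # random GroupHash row.
    return event.get_primary_hash() or None
```

In the diff below, the bulk and single lookup paths call `event.get_primary_hash()` directly and skip events for which it returns nothing, which is why the `hashes` parameter disappears from their signatures.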
JoshFerge authored Jun 4, 2024
1 parent 96d41ce commit 27a2f5e
Showing 2 changed files with 42 additions and 138 deletions.
52 changes: 32 additions & 20 deletions src/sentry/tasks/backfill_seer_grouping_records.py
@@ -19,7 +19,6 @@
from sentry.issues.grouptype import ErrorGroupType
from sentry.issues.occurrence_consumer import EventLookupError
from sentry.models.group import Group, GroupStatus
from sentry.models.grouphash import GroupHash
from sentry.models.project import Project
from sentry.seer.similarity.backfill import (
CreateGroupingRecordData,
@@ -221,12 +220,8 @@ def backfill_seer_grouping_records(
group_id_batch.remove(group_id)
del group_id_message_batch_filtered[group_id]

group_hashes = GroupHash.objects.filter(
project_id=project.id, group_id__in=group_id_batch
).distinct("group_id")
group_hashes_dict = {group_hash.group_id: group_hash.hash for group_hash in group_hashes}
data = lookup_group_data_stacktrace_bulk_with_fallback(
project, rows, group_id_message_batch_filtered, group_hashes_dict
project, rows, group_id_message_batch_filtered
)

# If nodestore returns no data
@@ -245,6 +240,11 @@
)
return

group_hashes_dict = {
group_stacktrace_data["group_id"]: group_stacktrace_data["hash"]
for group_stacktrace_data in data["data"]
}

with metrics.timer(f"{BACKFILL_NAME}.post_bulk_grouping_records", sample_rate=1.0):
response = post_bulk_grouping_records(
CreateGroupingRecordsRequest(
@@ -258,7 +258,7 @@
groups_with_neighbor = response["groups_with_neighbor"]
groups = Group.objects.filter(project_id=project.id, id__in=group_id_batch)
for group in groups:
seer_similarity = {
seer_similarity: dict[str, Any] = {
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"request_hash": group_hashes_dict[group.id],
}
@@ -332,19 +332,19 @@ def backfill_seer_grouping_records(


def lookup_group_data_stacktrace_bulk_with_fallback(
project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
project: Project, rows: list[GroupEventRow], messages: dict[int, str]
) -> GroupStacktraceData:
(
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(project, rows, messages)
for row in rows:
event_id, group_id = row["event_id"], row["group_id"]
if event_id not in bulk_event_ids and event_id not in invalid_event_ids:
try:
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
project, event_id, int(group_id), messages[group_id], hashes[group_id]
project, event_id, int(group_id), messages[group_id]
)
if group_data and stacktrace_string:
bulk_group_data_stacktraces["data"].append(group_data)
@@ -374,7 +374,7 @@ def lookup_group_data_stacktrace_bulk_with_fallback(

@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_bulk", sample_rate=1.0)
def lookup_group_data_stacktrace_bulk(
project: Project, rows: list[GroupEventRow], messages: dict[int, str], hashes: dict[int, str]
project: Project, rows: list[GroupEventRow], messages: dict[int, str]
) -> tuple[set[str], set[str], GroupStacktraceData]:
project_id = project.id
node_id_to_group_data = {
@@ -421,18 +421,23 @@ def lookup_group_data_stacktrace_bulk(
)
event = Event(event_id=event_id, project_id=project_id, group_id=group_id)
event.data = data
if event and event.data and event.data.get("exception") and hashes.get(group_id):
if event and event.data and event.data.get("exception"):
grouping_info = get_grouping_info(None, project=project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
if stacktrace_string == "":
invalid_event_ids.add(event_id)
continue
primary_hash = event.get_primary_hash()
if not primary_hash:
invalid_event_ids.add(event_id)
continue

group_data.append(
CreateGroupingRecordData(
group_id=group_id,
project_id=project_id,
message=messages[group_id],
hash=hashes[group_id],
hash=primary_hash,
)
)
stacktrace_strings.append(stacktrace_string)
@@ -455,7 +460,7 @@

@metrics.wraps(f"{BACKFILL_NAME}.lookup_event_single")
def lookup_group_data_stacktrace_single(
project: Project, event_id: str, group_id: int, message: str, hash: str
project: Project, event_id: str, group_id: int, message: str
) -> tuple[CreateGroupingRecordData | None, str]:
project_id = project.id
try:
@@ -485,13 +490,20 @@ def lookup_group_data_stacktrace_single(
with sentry_sdk.start_transaction(op="embeddings_grouping.get_latest_event"):
grouping_info = get_grouping_info(None, project=project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
group_data = (
CreateGroupingRecordData(
group_id=group_id, hash=hash, project_id=project_id, message=message
primary_hash = event.get_primary_hash()
if not primary_hash:
group_data = None
else:
group_data = (
CreateGroupingRecordData(
group_id=group_id,
hash=primary_hash,
project_id=project_id,
message=message,
)
if stacktrace_string != ""
else None
)
if stacktrace_string != ""
else None
)

return (group_data, stacktrace_string)

128 changes: 10 additions & 118 deletions tests/sentry/tasks/test_backfill_seer_grouping_records.py
@@ -165,7 +165,7 @@ def test_lookup_group_data_stacktrace_single_success(self):
event = self.event
hash = self.group_hashes[event.group.id]
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
expected_group_data = CreateGroupingRecordData(
group_id=event.group.id,
@@ -200,7 +200,6 @@ def test_lookup_group_data_stacktrace_single_exceptions(
event.event_id,
event.group_id,
event.group.message,
self.group_hashes[event.group.id],
)
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.event_lookup_exception",
@@ -221,18 +220,16 @@ def test_lookup_group_data_stacktrace_single_not_stacktrace_grouping(self):
project_id=self.project.id,
assert_no_errors=False,
)
hash = GroupHash.objects.get(group_id=event.group.id)
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
assert (group_data, stacktrace_string) == (None, "")

def test_lookup_group_data_stacktrace_single_no_stacktrace(self):
"""Test that no data is returned if the event has no stacktrace"""
event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
hash = GroupHash.objects.get(group_id=event.group.id)
group_data, stacktrace_string = lookup_group_data_stacktrace_single(
self.project, event.event_id, event.group_id, event.group.message, hash
self.project, event.event_id, event.group_id, event.group.message
)
assert (group_data, stacktrace_string) == (None, "")

@@ -244,7 +241,7 @@ def test_lookup_group_data_stacktrace_bulk_success(self, mock_metrics):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)

expected_event_ids = {event.event_id for event in events}
expected_group_data = [
@@ -287,7 +284,7 @@ def test_lookup_group_data_stacktrace_bulk_exceptions(
for exception in exceptions:
mock_get_multi.side_effect = exception
with pytest.raises(Exception):
lookup_group_data_stacktrace_bulk(self.project, rows, messages, self.group_hashes)
lookup_group_data_stacktrace_bulk(self.project, rows, messages)
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.bulk_event_lookup_exception",
extra={
@@ -323,7 +320,7 @@ def test_lookup_group_data_stacktrace_bulk_not_stacktrace_grouping(self):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
@@ -363,46 +360,7 @@ def test_lookup_group_data_stacktrace_bulk_no_stacktrace_exception(self):
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
hash=hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
for event in events
]
expected_stacktraces = [
f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
for i in range(2)
]
assert bulk_event_ids == {event.event_id for event in events}
assert invalid_event_ids == {event.event_id}
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces

def test_lookup_group_data_stacktrace_bulk_no_hash(self):
"""
Test that if a group does not have a hash (for whatever reason), its data is not included
in the bulk lookup result
"""
# Use 2 events
rows, events, messages, hashes = self.bulk_rows[:2], self.bulk_events[:2], {}, {}
group_ids = [row["group_id"] for row in rows]
for group_id in group_ids:
messages.update({group_id: self.bulk_messages[group_id]})
hashes.update({group_id: self.group_hashes[group_id]})
# Create one event with no hash
event = self.store_event(data={}, project_id=self.project.id, assert_no_errors=False)
rows.append({"event_id": event.event_id, "group_id": event.group_id})
messages.update({event.group_id: event.group.message})

(
bulk_event_ids,
invalid_event_ids,
bulk_group_data_stacktraces,
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages, hashes)
) = lookup_group_data_stacktrace_bulk(self.project, rows, messages)
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
@@ -430,7 +388,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_success(self):
self.group_hashes,
)
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes
self.project, rows, messages
)

expected_group_data = [
@@ -480,7 +438,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(

rows, messages, hashes = self.bulk_rows, self.bulk_messages, self.group_hashes
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes=hashes
self.project, rows, messages
)

events = self.bulk_events
@@ -500,72 +458,6 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_use_single_fallback(
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces

@patch("sentry.tasks.backfill_seer_grouping_records.logger")
@patch("sentry.tasks.backfill_seer_grouping_records.lookup_group_data_stacktrace_bulk")
def test_lookup_group_data_stacktrace_bulk_with_fallback_no_hash(
self, mock_lookup_group_data_stacktrace_bulk, mock_logger
):
"""
Test that if a group does not have a hash (for whatever reason), we do not attempt the
fallback and we log it
"""
# Purposely exclude one event from being included in the bulk lookup response, so that the fallback is used
events_missing = self.bulk_events[:-1]
group_data, stacktrace_strings = [], []
for event in events_missing:
grouping_info = get_grouping_info(None, project=self.project, event=event)
stacktrace_string = get_stacktrace_string(grouping_info)
group_data.append(
CreateGroupingRecordData(
group_id=event.group.id,
hash=self.group_hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
)
stacktrace_strings.append(stacktrace_string)
mock_lookup_group_data_stacktrace_bulk.return_value = (
{event.event_id for event in events_missing},
set(),
GroupStacktraceData(data=group_data, stacktrace_list=stacktrace_strings),
)

# Purposely remove the hash for the missing event
hashes = copy.deepcopy(self.group_hashes)
del hashes[self.bulk_events[-1].group.id]

rows, messages = self.bulk_rows, self.bulk_messages
bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes=hashes
)

events = self.bulk_events[:-1]
expected_group_data = [
CreateGroupingRecordData(
group_id=event.group.id,
hash=hashes[event.group.id],
project_id=self.project.id,
message=event.group.message,
)
for event in events
]
expected_stacktraces = [
f'Error{i}: error with value\n File "function_{i}.py", function function_{i}'
for i in range(4)
]
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
assert bulk_group_data_stacktraces["data"] == expected_group_data
assert bulk_group_data_stacktraces["stacktrace_list"] == expected_stacktraces
mock_logger.exception.assert_called_with(
"tasks.backfill_seer_grouping_records.no_group_hash",
extra={
"organization_id": self.project.organization.id,
"project_id": self.project.id,
"group_id": self.bulk_events[-1].group_id,
},
)

@patch("sentry.tasks.backfill_seer_grouping_records.logger")
def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self, mock_logger):
"""
Expand All @@ -581,7 +473,7 @@ def test_lookup_group_data_stacktrace_bulk_with_fallback_event_lookup_error(self
rows[-1]["event_id"] = 10000

bulk_group_data_stacktraces = lookup_group_data_stacktrace_bulk_with_fallback(
self.project, rows, messages, hashes
self.project, rows, messages
)

events = self.bulk_events[:-1]
