From 41a724038467c275129fd016bd9a541fe07883b6 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 13:04:50 +0100 Subject: [PATCH 1/3] Report a metric for the size of gappy state blocks --- sync2/poller.go | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sync2/poller.go b/sync2/poller.go index aa52baa7..27a97bf1 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -69,6 +69,7 @@ type PollerMap struct { executorRunning bool processHistogramVec *prometheus.HistogramVec timelineSizeHistogramVec *prometheus.HistogramVec + gappyStateSizeVec *prometheus.HistogramVec numOutstandingSyncReqsGauge prometheus.Gauge totalNumPollsCounter prometheus.Counter } @@ -121,6 +122,14 @@ func NewPollerMap(v2Client Client, enablePrometheus bool) *PollerMap { Buckets: []float64{0.0, 1.0, 2.0, 5.0, 10.0, 20.0, 50.0}, }, []string{"limited"}) prometheus.MustRegister(pm.timelineSizeHistogramVec) + pm.gappyStateSizeVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "sliding_sync", + Subsystem: "poller", + Name: "gappy_state_size", + Help: "Number of events in a state block during a sync v2 gappy sync", + Buckets: []float64{1.0, 10.0, 100.0, 1000.0, 10000.0}, + }, nil) + prometheus.MustRegister(pm.gappyStateSizeVec) pm.totalNumPollsCounter = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "sliding_sync", Subsystem: "poller", @@ -156,6 +165,9 @@ func (h *PollerMap) Terminate() { if h.timelineSizeHistogramVec != nil { prometheus.Unregister(h.timelineSizeHistogramVec) } + if h.gappyStateSizeVec != nil { + prometheus.Unregister(h.gappyStateSizeVec) + } if h.totalNumPollsCounter != nil { prometheus.Unregister(h.totalNumPollsCounter) } @@ -221,6 +233,7 @@ func (h *PollerMap) EnsurePolling(pid PollerID, accessToken, v2since string, isS poller = newPoller(pid, accessToken, h.v2Client, h, logger, !needToWait && !isStartup) poller.processHistogramVec = h.processHistogramVec poller.timelineSizeVec = h.timelineSizeHistogramVec + 
poller.gappyStateSizeVec = h.gappyStateSizeVec poller.numOutstandingSyncReqs = h.numOutstandingSyncReqsGauge poller.totalNumPolls = h.totalNumPollsCounter go poller.Poll(v2since) @@ -377,6 +390,7 @@ type poller struct { pollHistogramVec *prometheus.HistogramVec processHistogramVec *prometheus.HistogramVec timelineSizeVec *prometheus.HistogramVec + gappyStateSizeVec *prometheus.HistogramVec numOutstandingSyncReqs prometheus.Gauge totalNumPolls prometheus.Counter } @@ -741,3 +755,10 @@ func (p *poller) trackTimelineSize(size int, limited bool) { } p.timelineSizeVec.WithLabelValues(label).Observe(float64(size)) } + +func (p *poller) trackGappyStateSize(size int) { + if p.gappyStateSizeVec == nil { + return + } + p.gappyStateSizeVec.WithLabelValues().Observe(float64(size)) +} From 5064f64b351e60bba1f31ce8f584a84e53a1d988 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 14:23:48 +0100 Subject: [PATCH 2/3] Log error message to stdout if poller panics; otherwise we only see the error message if we're using Sentry. --- sync2/poller.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sync2/poller.go b/sync2/poller.go index 27a97bf1..44a370f2 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -447,7 +447,7 @@ func (p *poller) Poll(since string) { defer func() { panicErr := recover() if panicErr != nil { - logger.Error().Str("user", p.userID).Str("device", p.deviceID).Msg(string(debug.Stack())) + logger.Error().Str("user", p.userID).Str("device", p.deviceID).Msgf("%s. 
Traceback:\n%s", panicErr, debug.Stack()) internal.GetSentryHubFromContextOrDefault(ctx).RecoverWithContext(ctx, panicErr) } p.receiver.OnTerminated(ctx, p.userID, p.deviceID) From dcf8db347236d8e61eacaf2aa3c571adb902ff67 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Mon, 10 Jul 2023 16:19:05 +0100 Subject: [PATCH 3/3] Actually observe the new metric --- sync2/poller.go | 1 + 1 file changed, 1 insertion(+) diff --git a/sync2/poller.go b/sync2/poller.go index 44a370f2..9da833d3 100644 --- a/sync2/poller.go +++ b/sync2/poller.go @@ -666,6 +666,7 @@ func (p *poller) parseRoomsResponse(ctx context.Context, res *SyncResponse) { }) hub.CaptureMessage(warnMsg) }) + p.trackGappyStateSize(len(prependStateEvents)) roomData.Timeline.Events = append(prependStateEvents, roomData.Timeline.Events...) } }