Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve reactor_tick_time metric #11724

Merged
merged 9 commits into from
Jan 17, 2022
70 changes: 1 addition & 69 deletions synapse/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
import itertools
import logging
import os
import platform
import threading
import time
from typing import (
Any,
Callable,
Dict,
Generic,
Expand All @@ -33,7 +30,6 @@
Type,
TypeVar,
Union,
cast,
)

import attr
Expand All @@ -44,11 +40,9 @@
GaugeMetricFamily,
)

from twisted.internet import reactor
from twisted.internet.base import ReactorBase
from twisted.python.threadpool import ThreadPool

import synapse
import synapse.metrics._reactor_metrics
from synapse.metrics._exposition import (
MetricsResource,
generate_latest,
Expand Down Expand Up @@ -368,16 +362,6 @@ def collect(self) -> Iterable[Metric]:
REGISTRY.register(CPUMetrics())


#
# Twisted reactor metrics
#

tick_time = Histogram(
"python_twisted_reactor_tick_time",
"Tick time of the Twisted reactor (sec)",
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
)

#
# Federation Metrics
#
Expand Down Expand Up @@ -429,8 +413,6 @@ def collect(self) -> Iterable[Metric]:
" ".join([platform.system(), platform.release()]),
).set(1)

last_ticked = time.time()

# 3PID send info
threepid_send_requests = Histogram(
"synapse_threepid_send_requests_with_tries",
Expand Down Expand Up @@ -478,56 +460,6 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None:
)


class ReactorLastSeenMetric:
def collect(self) -> Iterable[Metric]:
cm = GaugeMetricFamily(
"python_twisted_reactor_last_seen",
"Seconds since the Twisted reactor was last seen",
)
cm.add_metric([], time.time() - last_ticked)
yield cm


REGISTRY.register(ReactorLastSeenMetric())

F = TypeVar("F", bound=Callable[..., Any])


def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F:
@functools.wraps(func)
def f(*args: Any, **kwargs: Any) -> Any:
start = time.time()
ret = func(*args, **kwargs)
end = time.time()

# record the amount of wallclock time spent running pending calls.
# This is a proxy for the actual amount of time between reactor polls,
# since about 25% of time is actually spent running things triggered by
# I/O events, but that is harder to capture without rewriting half the
# reactor.
tick_time.observe(end - start)

# Update the time we last ticked, for the metric to test whether
# Synapse's reactor has frozen
global last_ticked
last_ticked = end

return ret

return cast(F, f)


try:
# Ensure the reactor has all the attributes we expect
reactor.runUntilCurrent # type: ignore

# runUntilCurrent is called when we have pending calls. It is called once
# per iteratation after fd polling.
reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore
except AttributeError:
pass


__all__ = [
"MetricsResource",
"generate_latest",
Expand Down
83 changes: 83 additions & 0 deletions synapse/metrics/_reactor_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright 2022 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import select
import time
from typing import Any, Iterable, List, Tuple

from prometheus_client import Histogram, Metric
from prometheus_client.core import REGISTRY, GaugeMetricFamily

from twisted.internet import reactor

#
# Twisted reactor metrics
#

tick_time = Histogram(
"python_twisted_reactor_tick_time",
"Tick time of the Twisted reactor (sec)",
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
)


class EpollWrapper:
"""a wrapper for an epoll object which records the time between polls"""

def __init__(self, poller: "select.epoll"):
self.last_polled = time.time()
self._poller = poller

def poll(self, *args, **kwargs) -> List[Tuple[int, int]]: # type: ignore[no-untyped-def]
squahtx marked this conversation as resolved.
Show resolved Hide resolved
# record the time since poll() was last called. This gives a good proxy for
# how long it takes to run everything in the reactor - ie, how long anything
# waiting for the next tick will have to wait.
tick_time.observe(time.time() - self.last_polled)

ret = self._poller.poll(*args, **kwargs)

self.last_polled = time.time()
return ret

def __getattr__(self, item: str) -> Any:
return getattr(self._poller, item)


class ReactorLastSeenMetric:
def __init__(self, epoll_wrapper: EpollWrapper):
self._epoll_wrapper = epoll_wrapper

def collect(self) -> Iterable[Metric]:
cm = GaugeMetricFamily(
"python_twisted_reactor_last_seen",
"Seconds since the Twisted reactor was last seen",
)
cm.add_metric([], time.time() - self._epoll_wrapper.last_polled)
yield cm


try:
# if the reactor has a `_poller` attribute, which is an `epoll` object
# (ie, it's an EPollReactor), we wrap the `epoll` with a thing that will
# measure the time between ticks
from select import epoll

poller = reactor._poller # type: ignore[attr-defined]
except (AttributeError, ImportError):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When would the reactor not be epoll-based?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whenever select.epoll doesn't exist, which basically means we're not running on Linux: https://docs.python.org/3/library/select.html#select.epoll

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can see how the reactor is chosen at: https://github.com/twisted/twisted/blob/twisted-20.3.0/src/twisted/internet/default.py#L41-L53 (there's a handful of others you can install manually too...)

Would it make more sense to check if the reactor is. Both PollReactor and EPollReactor have a _poller attribute, but obviously only one is of type epoll.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an isinstance(poller, epoll) check below which I think does what you're asking?

Copy link
Member

@clokep clokep Jan 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an isinstance(poller, epoll) check below which I think does what you're asking?

Errr, yes the code is mostly doing what I thought.

I was wondering if it makes sense to check that the reactor is an EPollReactor instead, in case there's some other reactor that someone is using with a _poller attribute. I think it still wouldn't be an epoll though, and if it is the wrapper should work fine. So LGTM!

pass
else:
if isinstance(poller, epoll):
poller = EpollWrapper(poller)
reactor._poller = poller # type: ignore[attr-defined]
REGISTRY.register(ReactorLastSeenMetric(poller))