This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Retry some http replication failures #12182

Merged
Changes from 3 commits
1 change: 1 addition & 0 deletions changelog.d/12182.misc
@@ -0,0 +1 @@
Retry HTTP replication failures; this should prevent 502s when restarting stateful workers (main, event persisters, stream writers). Contributed by Nick @ Beeper.
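The retry schedule introduced in the diff below sleeps for 2**attempts seconds after each failed connection attempt, for attempts 0 through 5, so a restarting worker has roughly a minute to come back before a replication request is finally allowed to fail. A quick back-of-envelope check of that window (plain Python, only illustrating the arithmetic used below):

    # Backoff sleeps used by the new retry loop: 2**attempts seconds for
    # attempts 0..5, after which the request finally gives up.
    sleeps = [2 ** attempt for attempt in range(6)]  # [1, 2, 4, 8, 16, 32]
    assert sum(sleeps) == 2 ** 6 - 1 == 63           # ~63 seconds in total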
39 changes: 28 additions & 11 deletions synapse/replication/http/_base.py
@@ -21,6 +21,7 @@

from prometheus_client import Counter, Gauge

from twisted.internet.error import ConnectError, DNSLookupError
from twisted.web.server import Request

from synapse.api.errors import HttpResponseException, SynapseError
@@ -94,6 +95,8 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
    METHOD = "POST"
    CACHE = True
    RETRY_ON_TIMEOUT = True
    RETRY_ON_CONNECT_ERROR = True
    RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5  # =63s (2^6-1)

    def __init__(self, hs: "HomeServer"):
        if self.CACHE:
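Like CACHE and RETRY_ON_TIMEOUT, the two new knobs are plain class attributes, so an individual endpoint can override them. A minimal sketch of how that might look (ReplicationExampleRestServlet is hypothetical, not one of Synapse's real replication endpoints, and a real subclass would also implement the usual _serialize_payload/_handle_request methods):

    class ReplicationExampleRestServlet(ReplicationEndpoint):
        NAME = "example"
        PATH_ARGS = ()

        # Fail straight away if the target worker is unreachable, rather than
        # backing off for up to ~63s, e.g. for requests the caller can cheaply
        # re-issue itself.
        RETRY_ON_CONNECT_ERROR = False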
@@ -236,30 +239,44 @@ async def send_request(*, instance_name: str = "master", **kwargs: Any) -> Any:
                "/".join(url_args),
            )

            try:
                # Keep track of attempts made so we can bail if we don't manage to
                # connect to the target after N tries.
                attempts = 0
                # We keep retrying the same request for timeouts. This is so that we
                # have a good idea that the request has either succeeded or failed
                # on the master, and so whether we should clean up or not.
                while True:
                    headers: Dict[bytes, List[bytes]] = {}
                    # Add an authorization header, if configured.
                    if replication_secret:
                        headers[b"Authorization"] = [
                            b"Bearer " + replication_secret
                        ]
                    opentracing.inject_header_dict(headers, check_destination=False)
                    try:
                        result = await request_func(uri, data, headers=headers)
                        break
                    except RequestTimedOutError:
                        if not cls.RETRY_ON_TIMEOUT:
                            raise

                        logger.warning("%s request timed out; retrying", cls.NAME)

                        # If we timed out we probably don't need to worry about backing
                        # off too much, but lets just wait a little anyway.
                        await clock.sleep(1)
                    except (ConnectError, DNSLookupError):
                        if not cls.RETRY_ON_CONNECT_ERROR:
                            raise
                        if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
                            raise

                        logger.warning(
                            "%s request connection failed; retrying", cls.NAME
                        )

                        await clock.sleep(2 ** attempts)
                        attempts += 1
            except HttpResponseException as e:
                # We convert to SynapseError as we know that it was a SynapseError
                # on the main process that we should send to the client. (And
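Taken out of its Synapse context, what this diff adds to send_request is a bounded retry with exponential backoff around a coroutine that may fail to connect. A self-contained sketch of the same pattern (the helper name, plain asyncio, and the builtin ConnectionError are illustrative assumptions; the code above uses Twisted's ConnectError/DNSLookupError and Synapse's clock):

    import asyncio
    from typing import Awaitable, Callable, TypeVar

    T = TypeVar("T")

    async def call_with_reconnect_retries(
        make_request: Callable[[], Awaitable[T]],
        max_attempts: int = 5,
    ) -> T:
        """Retry `make_request` while the peer refuses connections.

        Mirrors the diff above: back off for 2**attempts seconds after each
        failed connection and give up once max_attempts is exceeded.
        """
        attempts = 0
        while True:
            try:
                return await make_request()
            except ConnectionError:
                if attempts > max_attempts:
                    raise
                await asyncio.sleep(2 ** attempts)
                attempts += 1

A caller would wrap whatever issues the HTTP request, e.g. result = await call_with_reconnect_retries(lambda: client.post_json(uri, data)), where client and post_json stand in for the transport actually in use.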