Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: implement exponential random retry strategy #225

Merged
merged 13 commits into from
Apr 29, 2021
16 changes: 11 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -256,17 +256,23 @@ The batching is configurable by ``write_options``\ :
- the number of milliseconds to increase the batch flush interval by a random amount
- ``0``
* - **retry_interval**
bednar marked this conversation as resolved.
Show resolved Hide resolved
- the number of milliseconds to retry unsuccessful write. The retry interval is used when the InfluxDB server does not specify "Retry-After" header.
- the number of milliseconds to retry the first unsuccessful write. The next retry delay is computed using the Full Jitter formula. The retry interval is used when the InfluxDB server does not specify a "Retry-After" header.
- ``5000``
* - **max_retry_time**
- maximum total retry timeout in milliseconds.
- ``180_000``
* - **max_retries**
- the number of max retries when write fails
- ``3``
- ``10``
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this correct?

* - **max_retry_delay**
- the maximum delay between each retry attempt in milliseconds
- ``180_000``
- ``150_000``
* - **min_retry_delay**
- the minimum delay between each retry attempt in milliseconds
- ``1_000``
* - **exponential_base**
- the base for the exponential retry delay, the next delay is computed as ``retry_interval * exponential_base^(attempts-1) + random(jitter_interval)``
- ``5``
- the base for the exponential retry delay, the next delay is computed using the Full Jitter formula ``retry_interval * exponential_base^(attempts-1) * random()``
- ``2``


.. code-block:: python
Expand Down
64 changes: 45 additions & 19 deletions influxdb_client/client/write/retry.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""Implementation for Retry strategy during HTTP requests."""

import logging
from datetime import datetime, timedelta
from itertools import takewhile
from random import random

from urllib3 import Retry
from urllib3.exceptions import MaxRetryError, ResponseError

from influxdb_client.client.exceptions import InfluxDBError

Expand All @@ -15,28 +17,44 @@ class WritesRetry(Retry):
"""
Writes retry configuration.

:param int jitter_interval: random milliseconds when retrying writes
:param int max_retry_delay: maximum delay when retrying write
:param int max_retry_time: maximum total retry timeout in seconds, an attempt after this timeout throws MaxRetryError
:param int total: maximum number of retries
:param num backoff_factor: initial first retry delay range in seconds
:param num max_retry_delay: maximum delay when retrying write in seconds
:param num min_retry_delay: minimum delay when retrying write in seconds
:param int exponential_base: base for the exponential retry delay, the next delay is computed as
`backoff_factor * exponential_base^(attempts-1) + random(jitter_interval)`
`backoff_factor * exponential_base^(attempts-1) * random()`
"""

def __init__(self, jitter_interval=0, max_retry_delay=180, exponential_base=5, **kw):
def __init__(self, max_retry_time=180, total=10, backoff_factor=5, max_retry_delay=125, min_retry_delay=1,
exponential_base=2, **kw):
"""Initialize defaults."""
super().__init__(**kw)
self.jitter_interval = jitter_interval
self.total = total
self.backoff_factor = backoff_factor
self.max_retry_delay = max_retry_delay
self.min_retry_delay = min_retry_delay
self.max_retry_time = max_retry_time
self.exponential_base = exponential_base
self.retry_timeout = datetime.now() + timedelta(seconds=max_retry_time)

def new(self, **kw):
"""Initialize defaults."""
if 'jitter_interval' not in kw:
kw['jitter_interval'] = self.jitter_interval
if 'max_retry_delay' not in kw:
kw['max_retry_delay'] = self.max_retry_delay

if 'min_retry_delay' not in kw:
kw['min_retry_delay'] = self.min_retry_delay

if 'max_retry_time' not in kw:
kw['max_retry_time'] = self.max_retry_time

if 'exponential_base' not in kw:
kw['exponential_base'] = self.exponential_base
return super().new(**kw)

new = super().new(**kw)
new.retry_timeout = self.retry_timeout
return new

def is_retry(self, method, status_code, has_retry_after=False):
"""is_retry doesn't require retry_after header. If there is no Retry-After header, we will use backoff."""
Expand All @@ -58,18 +76,26 @@ def get_backoff_time(self):
if consecutive_errors_len < 0:
return 0

backoff_value = self.backoff_factor * (self.exponential_base ** consecutive_errors_len) + self._jitter_delay()
return min(self.max_retry_delay, backoff_value)

def get_retry_after(self, response):
"""Get the value of Retry-After header and append random jitter delay."""
retry_after = super().get_retry_after(response)
if retry_after:
retry_after += self._jitter_delay()
return retry_after
delay_range = self.backoff_factor
i = 1
while i <= consecutive_errors_len:
i += 1
delay_range = delay_range * self.exponential_base
if delay_range > self.max_retry_delay:
Copy link

@sranka sranka Apr 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alespour had a good point that the delays should be increasing (on average), this condition makes it hard to happen since the delay range is the same after a fixed count of attempts, the delays are then oscillating randomly around self.max_retry_delay/2. This can be fixed by restricting the delay range to a large number:

Suggested change
if delay_range > self.max_retry_delay:
if delay_range > 100_000_000:

break

delay = self.min_retry_delay + (delay_range - self.min_retry_delay) * self._random()
# at least min_retry_delay
delay = max(self.min_retry_delay, delay)
# at most max_retry_delay
delay = min(self.max_retry_delay, delay)
return delay

def increment(self, method=None, url=None, response=None, error=None, _pool=None, _stacktrace=None):
"""Return a new Retry object with incremented retry counters."""
if self.retry_timeout < datetime.now():
Copy link

@sranka sranka Apr 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it also react the same way when retry is disabled? (max_retry_time is 0)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, max_retry_time=0 means retry is disabled, here is the test:

def test_retry_disabled_max_retry_time(self):

raise MaxRetryError(_pool, url, error or ResponseError("max_retry_time exceeded"))

new_retry = super().increment(method, url, response, error, _pool, _stacktrace)

if response is not None:
Expand All @@ -87,5 +113,5 @@ def increment(self, method=None, url=None, response=None, error=None, _pool=None

return new_retry

def _jitter_delay(self):
return self.jitter_interval * random()
# Return the value of random.random(); get_backoff_time multiplies the delay range by it.
# NOTE(review): presumably kept as a separate method so it can be overridden in tests - confirm.
def _random(self):
return random()
26 changes: 14 additions & 12 deletions influxdb_client/client/write_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@ def __init__(self, write_type: WriteType = WriteType.batching,
batch_size=1_000, flush_interval=1_000,
jitter_interval=0,
retry_interval=5_000,
max_retries=3,
max_retry_delay=180_000,
exponential_base=5,
max_retries=10,
max_retry_delay=150_000,
min_retry_delay=1_000,
max_retry_time=180_000,
exponential_base=2,
write_scheduler=ThreadPoolScheduler(max_workers=1)) -> None:
"""
Create write api configuration.
Expand All @@ -51,8 +53,10 @@ def __init__(self, write_type: WriteType = WriteType.batching,
:param jitter_interval: this is primarily to avoid large write spikes for users running a large number of
client instances ie, a jitter of 5s and flush duration 10s means flushes will happen every 10-15s.
:param retry_interval: the time to wait before retry unsuccessful write
:param max_retries: the number of max retries when write fails
:param max_retries: the number of max retries when write fails, 0 means retry is disabled
:param max_retry_delay: the maximum delay between each retry attempt in milliseconds
:param min_retry_delay: the minimum delay between each retry attempt in milliseconds
:param max_retry_time: total timeout for all retry attempts in milliseconds, if 0 retry is disabled
:param exponential_base: base for the exponential retry delay, the next delay is computed as
`retry_interval * exponential_base^(attempts-1) * random()`
:param write_scheduler:
Expand All @@ -64,6 +68,8 @@ def __init__(self, write_type: WriteType = WriteType.batching,
self.retry_interval = retry_interval
self.max_retries = max_retries
self.max_retry_delay = max_retry_delay
self.min_retry_delay = min_retry_delay
self.max_retry_time = max_retry_time
self.exponential_base = exponential_base
self.write_scheduler = write_scheduler

Expand All @@ -72,10 +78,11 @@ def to_retry_strategy(self):
return WritesRetry(
total=self.max_retries,
backoff_factor=self.retry_interval / 1_000,
jitter_interval=self.jitter_interval / 1_000,
max_retry_delay=self.max_retry_delay / 1_000,
min_retry_delay=self.min_retry_delay / 1_000,
max_retry_time=self.max_retry_time / 1000,
exponential_base=self.exponential_base,
method_whitelist=["POST"])
allowed_methods=["POST"])

def __getstate__(self):
"""Return a dict of attributes that you want to pickle."""
Expand Down Expand Up @@ -362,12 +369,7 @@ def _http(self, batch_item: _BatchItem):

logger.debug("Write time series data into InfluxDB: %s", batch_item)

retry = WritesRetry(
total=self._write_options.max_retries,
backoff_factor=self._write_options.retry_interval / 1_000,
jitter_interval=self._write_options.jitter_interval / 1_000,
max_retry_delay=self._write_options.max_retry_delay / 1_000,
method_whitelist=["POST"])
retry = self._write_options.to_retry_strategy()

self._post_write(False, batch_item.key.bucket, batch_item.key.org, batch_item.data,
batch_item.key.precision, urlopen_kw={'retries': retry})
Expand Down
32 changes: 32 additions & 0 deletions tests/test_WriteApiBatching.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,38 @@ def test_retry_interval_max_retries(self):

self.assertEqual(6, len(httpretty.httpretty.latest_requests))

# Check that max_retries=0 disables retrying: a write answered with 429 results in a single request.
def test_retry_disabled_max_retries(self):
# Register the write endpoint to answer POST with status 429 and a "Retry-After: 1" header.
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=429,
adding_headers={'Retry-After': '1'})

# Close the current write client and create a new one with max_retries=0.
self._write_client.close()
self._write_client = WriteApi(influxdb_client=self.influxdb_client,
write_options=WriteOptions(max_retries=0,batch_size=2, flush_interval=1_000))

# Write two records, which matches the configured batch_size of 2.
self._write_client.write("my-bucket", "my-org",
["h2o_feet,location=coyote_creek level\\ water_level=1 1",
"h2o_feet,location=coyote_creek level\\ water_level=2 2"])

# Wait before inspecting the recorded requests
# (presumably to let the background batch writer send the data - confirm).
time.sleep(2)

# Exactly one recorded request means the 429 response was not retried.
self.assertEqual(1, len(httpretty.httpretty.latest_requests))

# Check that max_retry_time=0 disables retrying: a write answered with 429 results in a single request.
def test_retry_disabled_max_retry_time(self):
# Register the write endpoint to answer POST with status 429 and a "Retry-After: 1" header.
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=429,
adding_headers={'Retry-After': '1'})

# Close the current write client and create a new one with max_retry_time=0.
self._write_client.close()
self._write_client = WriteApi(influxdb_client=self.influxdb_client,
write_options=WriteOptions(max_retry_time=0,batch_size=2, flush_interval=1_000))

# Write two records, which matches the configured batch_size of 2.
self._write_client.write("my-bucket", "my-org",
["h2o_feet,location=coyote_creek level\\ water_level=1 1",
"h2o_feet,location=coyote_creek level\\ water_level=2 2"])

# Wait before inspecting the recorded requests
# (presumably to let the background batch writer send the data - confirm).
time.sleep(2)

# Exactly one recorded request means the 429 response was not retried.
self.assertEqual(1, len(httpretty.httpretty.latest_requests))

def test_recover_from_error(self):
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=204)
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=400)
Expand Down
9 changes: 4 additions & 5 deletions tests/test_WriteOptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ class TestWriteOptions(unittest.TestCase):
def test_default(self):
retry = WriteOptions().to_retry_strategy()

self.assertEqual(retry.total, 3)
self.assertEqual(retry.total, 10)
self.assertEqual(retry.backoff_factor, 5)
self.assertEqual(retry.jitter_interval, 0)
self.assertEqual(retry.max_retry_delay, 180)
self.assertEqual(retry.exponential_base, 5)
self.assertEqual(retry.max_retry_time, 180)
self.assertEqual(retry.max_retry_delay, 150)
self.assertEqual(retry.exponential_base, 2)
self.assertEqual(retry.method_whitelist, ["POST"])

def test_custom(self):
Expand All @@ -22,7 +22,6 @@ def test_custom(self):

self.assertEqual(retry.total, 5)
self.assertEqual(retry.backoff_factor, 0.5)
self.assertEqual(retry.jitter_interval, 2)
self.assertEqual(retry.max_retry_delay, 7.5)
self.assertEqual(retry.exponential_base, 2)
self.assertEqual(retry.method_whitelist, ["POST"])
Loading