
feat: implement exponential random retry strategy #225

Merged (13 commits) on Apr 29, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@

### Features
1. [#203](https://github.com/influxdata/influxdb-client-python/issues/219): Bind query parameters
1. [#225](https://github.com/influxdata/influxdb-client-python/pull/225): Exponential random backoff retry strategy

### Bug Fixes
1. [#222](https://github.com/influxdata/influxdb-client-python/pull/222): Pass configured timeout to HTTP client
13 changes: 8 additions & 5 deletions README.rst
@@ -256,17 +256,20 @@ The batching is configurable by ``write_options``\ :
- the number of milliseconds to increase the batch flush interval by a random amount
- ``0``
* - **retry_interval**
- the number of milliseconds to retry unsuccessful write. The retry interval is used when the InfluxDB server does not specify "Retry-After" header.
- the number of milliseconds to wait before retrying the first unsuccessful write. The next retry delay is computed using exponential random backoff. The retry interval is used when the InfluxDB server does not specify a "Retry-After" header.
- ``5000``
* - **max_retry_time**
- maximum total retry timeout in milliseconds.
- ``180_000``
* - **max_retries**
- the number of max retries when write fails
- ``3``
- ``10``
Contributor: Is this correct?

* - **max_retry_delay**
- the maximum delay between each retry attempt in milliseconds
- ``180_000``
- ``125_000``
* - **exponential_base**
- the base for the exponential retry delay, the next delay is computed as ``retry_interval * exponential_base^(attempts-1) + random(jitter_interval)``
- ``5``
- the base for the exponential retry delay; the next delay is computed using random exponential backoff. For example, with ``retry_interval=5_000, exponential_base=2, max_retry_delay=125_000, total=5`` the retry delays are randomly distributed values within the ranges ``[5_000-10_000, 10_000-20_000, 20_000-40_000, 40_000-80_000, 80_000-125_000]``
- ``2``

Contributor: Please add a note showing the formula used to compute the delay.


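For orientation, here is a minimal usage sketch (an illustration, not text taken from the README) showing how the retry options from the table might be configured; the URL, token, org, and bucket names are placeholders. Roughly, for the n-th retry without a ``Retry-After`` header the delay is drawn uniformly between ``retry_interval * exponential_base^(n-1)`` and ``retry_interval * exponential_base^n``, capped at ``max_retry_delay``.

```python
from influxdb_client import InfluxDBClient
from influxdb_client.client.write_api import WriteOptions

# Placeholder connection settings -- adjust for your own InfluxDB instance.
client = InfluxDBClient(url="http://localhost:8086", token="my-token", org="my-org")

# The retry-related values below are the new defaults documented in the table above.
write_api = client.write_api(write_options=WriteOptions(
    batch_size=500,
    flush_interval=1_000,
    retry_interval=5_000,     # first retry delay (ms) when no Retry-After header is sent
    max_retries=10,           # 0 disables retries
    max_retry_delay=125_000,  # cap for a single retry delay (ms)
    max_retry_time=180_000,   # cap for the total time spent retrying (ms); 0 disables retries
    exponential_base=2))

write_api.write("my-bucket", "my-org", "h2o_feet,location=coyote_creek water_level=1.0 1")

write_api.close()
client.close()
```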
66 changes: 47 additions & 19 deletions influxdb_client/client/write/retry.py
@@ -1,10 +1,12 @@
"""Implementation for Retry strategy during HTTP requests."""

import logging
from datetime import datetime, timedelta
from itertools import takewhile
from random import random

from urllib3 import Retry
from urllib3.exceptions import MaxRetryError, ResponseError

from influxdb_client.client.exceptions import InfluxDBError

@@ -15,28 +17,45 @@ class WritesRetry(Retry):
"""
Writes retry configuration.

:param int jitter_interval: random milliseconds when retrying writes
:param int max_retry_delay: maximum delay when retrying write
:param int exponential_base: base for the exponential retry delay, the next delay is computed as
`backoff_factor * exponential_base^(attempts-1) + random(jitter_interval)`
:param int max_retry_time: maximum total retry timeout in seconds; an attempt made after this timeout raises MaxRetryError
:param int total: maximum number of retries
:param num backoff_factor: the first retry delay range in seconds
:param num max_retry_delay: maximum delay between retries, in seconds
:param int exponential_base: base for the exponential retry delay,

The next delay is computed as a random value between
`backoff_factor * exponential_base^(attempts-1)` and `backoff_factor * exponential_base^(attempts)`,
capped at `max_retry_delay`.

Example: for backoff_factor=5, exponential_base=2, max_retry_delay=125, total=5
the retry delays are randomly distributed values within the ranges of
[5-10, 10-20, 20-40, 40-80, 80-125]

"""

def __init__(self, jitter_interval=0, max_retry_delay=180, exponential_base=5, **kw):
def __init__(self, max_retry_time=180, total=10, backoff_factor=5, max_retry_delay=125, exponential_base=2, **kw):
"""Initialize defaults."""
super().__init__(**kw)
self.jitter_interval = jitter_interval
self.total = total
self.backoff_factor = backoff_factor
self.max_retry_delay = max_retry_delay
self.max_retry_time = max_retry_time
self.exponential_base = exponential_base
self.retry_timeout = datetime.now() + timedelta(seconds=max_retry_time)

def new(self, **kw):
"""Initialize defaults."""
if 'jitter_interval' not in kw:
kw['jitter_interval'] = self.jitter_interval
if 'max_retry_delay' not in kw:
kw['max_retry_delay'] = self.max_retry_delay

if 'max_retry_time' not in kw:
kw['max_retry_time'] = self.max_retry_time

if 'exponential_base' not in kw:
kw['exponential_base'] = self.exponential_base
return super().new(**kw)

new = super().new(**kw)
new.retry_timeout = self.retry_timeout
return new

def is_retry(self, method, status_code, has_retry_after=False):
"""is_retry doesn't require retry_after header. If there is not Retry-After we will use backoff."""
@@ -58,18 +77,27 @@ def get_backoff_time(self):
if consecutive_errors_len < 0:
return 0

backoff_value = self.backoff_factor * (self.exponential_base ** consecutive_errors_len) + self._jitter_delay()
return min(self.max_retry_delay, backoff_value)
range_start = self.backoff_factor
range_stop = self.backoff_factor * self.exponential_base

i = 1
while i <= consecutive_errors_len:
i += 1
range_start = range_stop
range_stop = range_stop * self.exponential_base
if range_stop > self.max_retry_delay:
break

def get_retry_after(self, response):
"""Get the value of Retry-After header and append random jitter delay."""
retry_after = super().get_retry_after(response)
if retry_after:
retry_after += self._jitter_delay()
return retry_after
if range_stop > self.max_retry_delay:
range_stop = self.max_retry_delay

return range_start + (range_stop - range_start) * self._random()

def increment(self, method=None, url=None, response=None, error=None, _pool=None, _stacktrace=None):
"""Return a new Retry object with incremented retry counters."""
if self.retry_timeout < datetime.now():
raise MaxRetryError(_pool, url, error or ResponseError("max_retry_time exceeded"))

@sranka (Apr 20, 2021): Does it also react the same way when retry is disabled? (max_retry_time is 0)

Contributor Author: Yes, max_retry_time=0 means retry is disabled; see test_retry_disabled_max_retry_time below.

new_retry = super().increment(method, url, response, error, _pool, _stacktrace)

if response is not None:
Expand All @@ -87,5 +115,5 @@ def increment(self, method=None, url=None, response=None, error=None, _pool=None

return new_retry

def _jitter_delay(self):
return self.jitter_interval * random()
def _random(self):
return random()
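To make the new backoff behaviour easier to follow, here is a small standalone sketch (not part of the diff) that mirrors the range computation in ``get_backoff_time`` above; ``attempt`` is 1-based and all values are in seconds, as in ``WritesRetry``.

```python
import random


def backoff_window(attempt, backoff_factor=5, exponential_base=2, max_retry_delay=125):
    """Return the (start, stop) delay window in seconds for the given 1-based attempt,
    mirroring the range computation in WritesRetry.get_backoff_time."""
    range_start = backoff_factor
    range_stop = backoff_factor * exponential_base
    i = 1
    while i < attempt:
        i += 1
        range_start = range_stop
        range_stop = range_stop * exponential_base
        if range_stop > max_retry_delay:
            break
    return range_start, min(range_stop, max_retry_delay)


def backoff_delay(attempt, **kwargs):
    """Pick a uniformly random delay inside the window, as get_backoff_time does."""
    start, stop = backoff_window(attempt, **kwargs)
    return start + (stop - start) * random.random()


# With the defaults (backoff_factor=5, exponential_base=2, max_retry_delay=125)
# the windows reproduce the docstring example: (5, 10), (10, 20), (20, 40), (40, 80), (80, 125).
for attempt in range(1, 6):
    print(attempt, backoff_window(attempt))
```

Only the upper bound is capped at ``max_retry_delay``, so the final window before the cap can be narrower than a full doubling (80-125 with the defaults).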
25 changes: 11 additions & 14 deletions influxdb_client/client/write_api.py
@@ -38,9 +38,10 @@ def __init__(self, write_type: WriteType = WriteType.batching,
batch_size=1_000, flush_interval=1_000,
jitter_interval=0,
retry_interval=5_000,
max_retries=3,
max_retry_delay=180_000,
exponential_base=5,
max_retries=10,
max_retry_delay=125_000,
max_retry_time=180_000,
exponential_base=2,
write_scheduler=ThreadPoolScheduler(max_workers=1)) -> None:
"""
Create write api configuration.
@@ -51,10 +52,10 @@ def __init__(self, write_type: WriteType = WriteType.batching,
:param jitter_interval: this is primarily to avoid large write spikes for users running a large number of
client instances ie, a jitter of 5s and flush duration 10s means flushes will happen every 10-15s.
:param retry_interval: the time to wait before retry unsuccessful write
:param max_retries: the number of max retries when write fails
:param max_retries: the maximum number of retries when a write fails; 0 means retry is disabled
:param max_retry_delay: the maximum delay between each retry attempt in milliseconds
:param exponential_base: base for the exponential retry delay, the next delay is computed as
`retry_interval * exponential_base^(attempts-1) + random(jitter_interval)`
:param max_retry_time: total timeout for all retry attempts in milliseconds; if 0, retry is disabled
:param exponential_base: base for the exponential retry delay
:param write_scheduler:
"""
self.write_type = write_type
@@ -64,6 +65,7 @@ def __init__(self, write_type: WriteType = WriteType.batching,
self.retry_interval = retry_interval
self.max_retries = max_retries
self.max_retry_delay = max_retry_delay
self.max_retry_time = max_retry_time
self.exponential_base = exponential_base
self.write_scheduler = write_scheduler

@@ -72,10 +74,10 @@ def to_retry_strategy(self):
return WritesRetry(
total=self.max_retries,
backoff_factor=self.retry_interval / 1_000,
jitter_interval=self.jitter_interval / 1_000,
max_retry_delay=self.max_retry_delay / 1_000,
max_retry_time=self.max_retry_time / 1_000,
exponential_base=self.exponential_base,
method_whitelist=["POST"])
allowed_methods=["POST"])

def __getstate__(self):
"""Return a dict of attributes that you want to pickle."""
@@ -362,12 +364,7 @@ def _http(self, batch_item: _BatchItem):

logger.debug("Write time series data into InfluxDB: %s", batch_item)

retry = WritesRetry(
total=self._write_options.max_retries,
backoff_factor=self._write_options.retry_interval / 1_000,
jitter_interval=self._write_options.jitter_interval / 1_000,
max_retry_delay=self._write_options.max_retry_delay / 1_000,
method_whitelist=["POST"])
retry = self._write_options.to_retry_strategy()

self._post_write(False, batch_item.key.bucket, batch_item.key.org, batch_item.data,
batch_item.key.precision, urlopen_kw={'retries': retry})
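A detail worth noting in ``to_retry_strategy`` is the unit conversion: ``WriteOptions`` is configured in milliseconds while ``WritesRetry`` works in seconds, hence the division by ``1_000``. The sketch below inspects the resulting defaults; the expected values match ``tests/test_WriteOptions.py`` at the end of this diff.

```python
from influxdb_client.client.write_api import WriteOptions

# Build the default retry strategy and see how the millisecond options map to seconds.
retry = WriteOptions().to_retry_strategy()

print(retry.total)             # 10    (max_retries)
print(retry.backoff_factor)    # 5.0   (retry_interval 5_000 ms / 1_000)
print(retry.max_retry_delay)   # 125.0 (max_retry_delay 125_000 ms / 1_000)
print(retry.max_retry_time)    # 180.0 (max_retry_time 180_000 ms / 1_000)
print(retry.exponential_base)  # 2
```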
34 changes: 33 additions & 1 deletion tests/test_WriteApiBatching.py
@@ -198,7 +198,7 @@ def test_retry_interval(self):
time.sleep(1)
self.assertEqual(1, len(httpretty.httpretty.latest_requests), msg="first request immediately")

time.sleep(1.5)
time.sleep(3)
self.assertEqual(2, len(httpretty.httpretty.latest_requests), msg="second request after delay_interval")

time.sleep(3)
@@ -238,6 +238,38 @@ def test_retry_interval_max_retries(self):

self.assertEqual(6, len(httpretty.httpretty.latest_requests))

def test_retry_disabled_max_retries(self):
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=429,
adding_headers={'Retry-After': '1'})

self._write_client.close()
self._write_client = WriteApi(influxdb_client=self.influxdb_client,
write_options=WriteOptions(max_retries=0, batch_size=2, flush_interval=1_000))

self._write_client.write("my-bucket", "my-org",
["h2o_feet,location=coyote_creek level\\ water_level=1 1",
"h2o_feet,location=coyote_creek level\\ water_level=2 2"])

time.sleep(2)

self.assertEqual(1, len(httpretty.httpretty.latest_requests))

def test_retry_disabled_max_retry_time(self):
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=429,
adding_headers={'Retry-After': '1'})

self._write_client.close()
self._write_client = WriteApi(influxdb_client=self.influxdb_client,
write_options=WriteOptions(max_retry_time=0, batch_size=2, flush_interval=1_000))

self._write_client.write("my-bucket", "my-org",
["h2o_feet,location=coyote_creek level\\ water_level=1 1",
"h2o_feet,location=coyote_creek level\\ water_level=2 2"])

time.sleep(5)

self.assertEqual(1, len(httpretty.httpretty.latest_requests))

def test_recover_from_error(self):
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=204)
httpretty.register_uri(httpretty.POST, uri="http://localhost/api/v2/write", status=400)
9 changes: 4 additions & 5 deletions tests/test_WriteOptions.py
@@ -7,11 +7,11 @@ class TestWriteOptions(unittest.TestCase):
def test_default(self):
retry = WriteOptions().to_retry_strategy()

self.assertEqual(retry.total, 3)
self.assertEqual(retry.total, 10)
self.assertEqual(retry.backoff_factor, 5)
self.assertEqual(retry.jitter_interval, 0)
self.assertEqual(retry.max_retry_delay, 180)
self.assertEqual(retry.exponential_base, 5)
self.assertEqual(retry.max_retry_time, 180)
self.assertEqual(retry.max_retry_delay, 125)
self.assertEqual(retry.exponential_base, 2)
self.assertEqual(retry.method_whitelist, ["POST"])

def test_custom(self):
@@ -22,7 +22,6 @@ def test_custom(self):

self.assertEqual(retry.total, 5)
self.assertEqual(retry.backoff_factor, 0.5)
self.assertEqual(retry.jitter_interval, 2)
self.assertEqual(retry.max_retry_delay, 7.5)
self.assertEqual(retry.exponential_base, 2)
self.assertEqual(retry.method_whitelist, ["POST"])