From 7f624279d880da514667d71fcc52a60092ad171e Mon Sep 17 00:00:00 2001 From: Sam Hames Date: Wed, 6 Apr 2022 14:01:02 +1000 Subject: [PATCH] Catch additional edge case in the counts API A search for only a user that doesn't exist, like "from:10", doesn't return any data; it just immediately creates a response with a meta key indicating there are zero matching tweets. This conflicts with the workaround to catch early termination of counts, and was leading to errors being raised. This change now differentiates between the two failure modes, by checking if any rows of data have been returned previously, or if this is an immediate early termination, as happens when a user doesn't exist anymore. --- twarc/client2.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 0b1db1b2..b2b4917d 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -226,6 +226,18 @@ def _search( # Mark that we're using counts, to workaround a limitation of the # Twitter API with long running counts. using_counts = True + + # We need to use these as sentinel values, to differentiate + # between the count API returning zero prematurely, and queries + # like "from:10" (a user that doesn't exist). In the latter case + # instead of returning counts of 0 per day, it will just return + # an empty response with a total tweet count of zero. We can + # disambiguate the two cases by noting that the premature + # termination will already have counted some tweets correctly, + # while the latter will return immediately without any data + # rows. 
+ time_periods_collected = 0 + last_time_start = None else: params = self._prepare_params( **params, @@ -254,6 +266,7 @@ def _search( # can't return without 'data' if there are no results if "data" in response: last_time_start = response["data"][0]["start"] + time_periods_collected += len(response["data"]) yield response else: @@ -264,9 +277,19 @@ def _search( # fiddly because Python doesn't let you specify milliseconds only for # strftime. if ( + # If there's no explicit start time we're getting the last + # 30 days by default, so don't need to do the tricky + # things. start_time is None - or (start_time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z") - == last_time_start + # We've actually reached the specified start time + or ( + (start_time.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z") + == last_time_start + ) + # Or, we've hit one of the special cases that returns no rows + # of data, and immediately indicates zero tweets returned, like + # searching for a tweet that doesn't exist. + or (time_periods_collected == 0) ): break else: