Flaky test_crawl_with_proxy #743

Open
janbuchar opened this issue Nov 25, 2024 · 1 comment
Labels

bug: Something isn't working.
debt: Code quality improvement or decrease of technical debt.
t-tooling: Issues with this label are in the ownership of the tooling team.

Comments

@janbuchar
Collaborator

Now that httpbin fails less often, we should look into the other flaky tests... this one fails quite often:

____________________________ test_crawl_with_proxy _____________________________
[gw0] linux -- Python 3.10.15 /home/runner/.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/bin/python

self = <curl_cffi.requests.session.AsyncSession object at 0x7fb471ec7df0>
method = 'GET'
url = 'https://janbuchar--httpbin.apify.actor/status/222?token=apify_api_xERCvoSw7a3SEVVclipJJPsfhGTFwZ3MKrI1'
params = None, data = None, json = None, headers = HttpHeaders(root={})
cookies = None, files = None, auth = None
timeout = <object object at 0x7fb475314500>, allow_redirects = True
max_redirects = None, proxies = None, proxy = None, proxy_auth = None
verify = None, referer = None, accept_encoding = 'gzip, deflate, br'
content_callback = None, impersonate = None, ja3 = None, akamai = None
extra_fp = None, default_headers = None, default_encoding = 'utf-8', quote = ''
http_version = None, interface = None, cert = None, stream = False
max_recv_speed = 0, multipart = None

    async def request(
        self,
        method: HttpMethod,
        url: str,
        params: Optional[Union[Dict, List, Tuple]] = None,
        data: Optional[Union[Dict[str, str], List[Tuple], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float], object]] = not_set,
        allow_redirects: Optional[bool] = None,
        max_redirects: Optional[int] = None,
        proxies: Optional[ProxySpec] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[BrowserTypeLiteral] = None,
        ja3: Optional[str] = None,
        akamai: Optional[str] = None,
        extra_fp: Optional[Union[ExtraFingerprints, ExtraFpDict]] = None,
        default_headers: Optional[bool] = None,
        default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
        quote: Union[str, Literal[False]] = "",
        http_version: Optional[CurlHttpVersion] = None,
        interface: Optional[str] = None,
        cert: Optional[Union[str, Tuple[str, str]]] = None,
        stream: bool = False,
        max_recv_speed: int = 0,
        multipart: Optional[CurlMime] = None,
    ):
        """Send the request, see ``curl_cffi.requests.request`` for details on parameters."""
        self._check_session_closed()
    
        curl = await self.pop_curl()
        req, buffer, header_buffer, q, header_recved, quit_now = self._set_curl_options(
            curl=curl,
            method=method,
            url=url,
            params=params,
            data=data,
            json=json,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            max_redirects=max_redirects,
            proxies=proxies,
            proxy=proxy,
            proxy_auth=proxy_auth,
            verify=verify,
            referer=referer,
            accept_encoding=accept_encoding,
            content_callback=content_callback,
            impersonate=impersonate,
            ja3=ja3,
            akamai=akamai,
            extra_fp=extra_fp,
            default_headers=default_headers,
            quote=quote,
            http_version=http_version,
            interface=interface,
            stream=stream,
            max_recv_speed=max_recv_speed,
            multipart=multipart,
            cert=cert,
            queue_class=asyncio.Queue,
            event_class=asyncio.Event,
        )
        if stream:
            task = self.acurl.add_handle(curl)
    
            async def perform():
                try:
                    await task
                except CurlError as e:
                    rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                    rsp.request = req
                    cast(asyncio.Queue, q).put_nowait(RequestException(str(e), e.code, rsp))
                finally:
                    if not cast(asyncio.Event, header_recved).is_set():
                        cast(asyncio.Event, header_recved).set()
                    # None acts as a sentinel
                    await cast(asyncio.Queue, q).put(None)
    
            def cleanup(fut):
                self.release_curl(curl)
    
            stream_task = asyncio.create_task(perform())
            stream_task.add_done_callback(cleanup)
    
            await cast(asyncio.Event, header_recved).wait()
    
            # Unlike threads, coroutines does not use preemptive scheduling.
            # For asyncio, there is no need for a header_parsed event, the
            # _parse_response will execute in the foreground, no background tasks running.
            rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
    
            first_element = _peek_aio_queue(cast(asyncio.Queue, q))
            if isinstance(first_element, RequestException):
                self.release_curl(curl)
                raise first_element
    
            rsp.request = req
            rsp.astream_task = stream_task
            rsp.quit_now = quit_now
            rsp.queue = q
            return rsp
        else:
            try:
                # curl.debug()
                # print("using curl instance: ", curl)
                task = self.acurl.add_handle(curl)
>               await task
E               curl_cffi.curl.CurlError: Failed to perform, curl: (16) . See https://curl.se/libcurl/c/libcurl-errors.html first for more details.

../../../.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/lib/python3.10/site-packages/curl_cffi/requests/session.py:1333: CurlError

The above exception was the direct cause of the following exception:

http_client = <crawlee.http_clients.curl_impersonate.CurlImpersonateHttpClient object at 0x7fb472034f10>
proxy = ProxyInfo(url='***127.0.0.1:51183', scheme='http', hostname='127.0.0.1', port=51183, username='user', password='pass', session_id=None, proxy_tier=None)
httpbin = URL('***')

    @pytest.mark.skipif(os.name == 'nt', reason='Skipped on Windows')
    async def test_crawl_with_proxy(
        http_client: CurlImpersonateHttpClient,
        proxy: ProxyInfo,
        httpbin: URL,
    ) -> None:
        url = str(httpbin.copy_with(path='/status/222'))
        request = Request.from_url(url)
    
        async with Statistics() as statistics:
>           result = await http_client.crawl(request, proxy_info=proxy, statistics=statistics)

tests/unit/http_clients/test_curl_impersonate.py:34: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
src/crawlee/http_clients/curl_impersonate.py:132: in crawl
    response = await client.request(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <curl_cffi.requests.session.AsyncSession object at 0x7fb471ec7df0>
method = 'GET'
url = 'https://janbuchar--httpbin.apify.actor/status/222?token=apify_api_xERCvoSw7a3SEVVclipJJPsfhGTFwZ3MKrI1'
params = None, data = None, json = None, headers = HttpHeaders(root={})
cookies = None, files = None, auth = None
timeout = <object object at 0x7fb475314500>, allow_redirects = True
max_redirects = None, proxies = None, proxy = None, proxy_auth = None
verify = None, referer = None, accept_encoding = 'gzip, deflate, br'
content_callback = None, impersonate = None, ja3 = None, akamai = None
extra_fp = None, default_headers = None, default_encoding = 'utf-8', quote = ''
http_version = None, interface = None, cert = None, stream = False
max_recv_speed = 0, multipart = None

    async def request(
        self,
        method: HttpMethod,
        url: str,
        params: Optional[Union[Dict, List, Tuple]] = None,
        data: Optional[Union[Dict[str, str], List[Tuple], str, BytesIO, bytes]] = None,
        json: Optional[dict] = None,
        headers: Optional[HeaderTypes] = None,
        cookies: Optional[CookieTypes] = None,
        files: Optional[Dict] = None,
        auth: Optional[Tuple[str, str]] = None,
        timeout: Optional[Union[float, Tuple[float, float], object]] = not_set,
        allow_redirects: Optional[bool] = None,
        max_redirects: Optional[int] = None,
        proxies: Optional[ProxySpec] = None,
        proxy: Optional[str] = None,
        proxy_auth: Optional[Tuple[str, str]] = None,
        verify: Optional[bool] = None,
        referer: Optional[str] = None,
        accept_encoding: Optional[str] = "gzip, deflate, br",
        content_callback: Optional[Callable] = None,
        impersonate: Optional[BrowserTypeLiteral] = None,
        ja3: Optional[str] = None,
        akamai: Optional[str] = None,
        extra_fp: Optional[Union[ExtraFingerprints, ExtraFpDict]] = None,
        default_headers: Optional[bool] = None,
        default_encoding: Union[str, Callable[[bytes], str]] = "utf-8",
        quote: Union[str, Literal[False]] = "",
        http_version: Optional[CurlHttpVersion] = None,
        interface: Optional[str] = None,
        cert: Optional[Union[str, Tuple[str, str]]] = None,
        stream: bool = False,
        max_recv_speed: int = 0,
        multipart: Optional[CurlMime] = None,
    ):
        """Send the request, see ``curl_cffi.requests.request`` for details on parameters."""
        self._check_session_closed()
    
        curl = await self.pop_curl()
        req, buffer, header_buffer, q, header_recved, quit_now = self._set_curl_options(
            curl=curl,
            method=method,
            url=url,
            params=params,
            data=data,
            json=json,
            headers=headers,
            cookies=cookies,
            files=files,
            auth=auth,
            timeout=timeout,
            allow_redirects=allow_redirects,
            max_redirects=max_redirects,
            proxies=proxies,
            proxy=proxy,
            proxy_auth=proxy_auth,
            verify=verify,
            referer=referer,
            accept_encoding=accept_encoding,
            content_callback=content_callback,
            impersonate=impersonate,
            ja3=ja3,
            akamai=akamai,
            extra_fp=extra_fp,
            default_headers=default_headers,
            quote=quote,
            http_version=http_version,
            interface=interface,
            stream=stream,
            max_recv_speed=max_recv_speed,
            multipart=multipart,
            cert=cert,
            queue_class=asyncio.Queue,
            event_class=asyncio.Event,
        )
        if stream:
            task = self.acurl.add_handle(curl)
    
            async def perform():
                try:
                    await task
                except CurlError as e:
                    rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                    rsp.request = req
                    cast(asyncio.Queue, q).put_nowait(RequestException(str(e), e.code, rsp))
                finally:
                    if not cast(asyncio.Event, header_recved).is_set():
                        cast(asyncio.Event, header_recved).set()
                    # None acts as a sentinel
                    await cast(asyncio.Queue, q).put(None)
    
            def cleanup(fut):
                self.release_curl(curl)
    
            stream_task = asyncio.create_task(perform())
            stream_task.add_done_callback(cleanup)
    
            await cast(asyncio.Event, header_recved).wait()
    
            # Unlike threads, coroutines does not use preemptive scheduling.
            # For asyncio, there is no need for a header_parsed event, the
            # _parse_response will execute in the foreground, no background tasks running.
            rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
    
            first_element = _peek_aio_queue(cast(asyncio.Queue, q))
            if isinstance(first_element, RequestException):
                self.release_curl(curl)
                raise first_element
    
            rsp.request = req
            rsp.astream_task = stream_task
            rsp.quit_now = quit_now
            rsp.queue = q
            return rsp
        else:
            try:
                # curl.debug()
                # print("using curl instance: ", curl)
                task = self.acurl.add_handle(curl)
                await task
            except CurlError as e:
                rsp = self._parse_response(curl, buffer, header_buffer, default_encoding)
                rsp.request = req
                error = code2error(e.code, str(e))
>               raise error(str(e), e.code, rsp) from e
E               curl_cffi.requests.exceptions.HTTPError: Failed to perform, curl: (16) . See https://curl.se/libcurl/c/libcurl-errors.html first for more details.

../../../.cache/pypoetry/virtualenvs/crawlee-CKpluuj2-py3.10/lib/python3.10/site-packages/curl_cffi/requests/session.py:1338: HTTPError
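
For context, curl error 16 is CURLE_HTTP2, i.e. a problem was detected in the HTTP/2 framing layer. Below is a minimal sketch of the call path the failing frame exercises, outside the test suite; the proxy address, credentials, and target URL are placeholders, not the CI values:

```python
# Minimal reproduction sketch (assumed proxy and URL, not the CI values):
# drive curl_cffi's AsyncSession through a forward proxy, using the same
# request() call that appears in the traceback above.
import asyncio

from curl_cffi.requests import AsyncSession


async def main() -> None:
    async with AsyncSession() as session:
        response = await session.request(
            "GET",
            "https://example.com/status/222",  # stand-in for the httpbin URL
            proxy="http://user:pass@127.0.0.1:51183",  # assumed local proxy
        )
        print(response.status_code)


asyncio.run(main())
```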
janbuchar added the bug and debt labels on Nov 25, 2024
The github-actions bot added the t-tooling label on Nov 25, 2024
@janbuchar
Collaborator Author

The same goes for test_curl_impersonate.py::test_send_request_with_proxy; I suspect the cause is the same.
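
If the failures are indeed transient, one stopgap while the root cause is investigated would be a small retry wrapper around the flaky call. A hedged sketch follows; `with_retries` and its defaults are hypothetical, not part of crawlee:

```python
# Hedged sketch: retry a coroutine a few times when curl_cffi raises a
# transient CurlError, backing off linearly between attempts.
import asyncio
from typing import Awaitable, Callable, TypeVar

from curl_cffi.curl import CurlError

T = TypeVar("T")


async def with_retries(
    make_call: Callable[[], Awaitable[T]],
    attempts: int = 3,
    delay: float = 0.5,
) -> T:
    for attempt in range(attempts):
        try:
            return await make_call()
        except CurlError:
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(delay * (attempt + 1))
    raise AssertionError("unreachable")


# Usage (hypothetical): wrap the crawl call from the failing test.
# result = await with_retries(
#     lambda: http_client.crawl(request, proxy_info=proxy, statistics=statistics)
# )
```

At the test level, the `@pytest.mark.flaky(reruns=...)` marker from pytest-rerunfailures would achieve something similar without touching the client code.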
