From 1437a6916773097f8c274c231418b82ae4072313 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Tue, 5 Apr 2022 18:41:42 +0200 Subject: [PATCH 01/10] add prefetching of index in legacy repositories --- docs/repositories.md | 14 ++++ src/poetry/factory.py | 2 + src/poetry/repositories/legacy_repository.py | 17 +++++ src/poetry/repositories/link_sources/html.py | 27 ++++++++ tests/repositories/fixtures/legacy/index.html | 3 + tests/repositories/test_legacy_repository.py | 65 ++++++++++++++++++- 6 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 tests/repositories/fixtures/legacy/index.html diff --git a/docs/repositories.md b/docs/repositories.md index 490d78bf785..b9dd26ae82b 100644 --- a/docs/repositories.md +++ b/docs/repositories.md @@ -237,6 +237,20 @@ Note the trailing `/simple/`. This is important when configuring {{% /note %}} +Repositories following the [PEP503](https://peps.python.org/pep-0503/) +specification should expose a root page with individual links for each +package it serves. This isn't reliably implemented everywhere, which +leads to increased network traffic and slower resolve times. If you're +using a repository which has a valid listing, you can add the +`indexed` property to let Poetry prefetch and cache this package list. + +```toml +[[tool.poetry.source]] +name = "foo" +url = "https://foo.bar/simple/" +indexed = true +``` + In addition to [PEP 503](https://peps.python.org/pep-0503/), Poetry can also handle simple API repositories that implement [PEP 658](https://peps.python.org/pep-0658/) (*Introduced in 1.2.0*). This is helpful in reducing dependency resolution time for packages from these sources as Poetry can diff --git a/src/poetry/factory.py b/src/poetry/factory.py index f1ab8ec99fd..e58d2d5b260 100644 --- a/src/poetry/factory.py +++ b/src/poetry/factory.py @@ -185,6 +185,7 @@ def create_package_source( raise RuntimeError("Missing [name] in source.") name = source["name"] url = source["url"] + indexed = bool(source.get("indexed", False)) repository_class = LegacyRepository @@ -196,6 +197,7 @@ def create_package_source( url, config=auth_config, disable_cache=disable_cache, + indexed=indexed, ) @classmethod diff --git a/src/poetry/repositories/legacy_repository.py b/src/poetry/repositories/legacy_repository.py index 11d14af1cb3..4f6653376fd 100644 --- a/src/poetry/repositories/legacy_repository.py +++ b/src/poetry/repositories/legacy_repository.py @@ -9,6 +9,7 @@ from poetry.inspection.info import PackageInfo from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.http import HTTPRepository +from poetry.repositories.link_sources.html import SimpleIndexPage from poetry.repositories.link_sources.html import SimpleRepositoryPage from poetry.utils.helpers import canonicalize_name @@ -27,12 +28,17 @@ def __init__( url: str, config: Config | None = None, disable_cache: bool = False, + indexed: bool = False, ) -> None: if name == "pypi": raise ValueError("The name [pypi] is reserved for repositories") super().__init__(name, url.rstrip("/"), config, disable_cache) + self._index_page = None + if indexed: + self._index_page = self._get_index_page() + def find_packages(self, dependency: Dependency) -> list[Package]: packages = [] constraint, allow_prereleases = self._get_constraints_from_dependency( @@ -48,6 +54,11 @@ def find_packages(self, dependency: Dependency) -> list[Package]: if self._cache.store("matches").has(key): versions = self._cache.store("matches").get(key) else: + if self._index_page is not None and not self._index_page.serves_package( + dependency.name + ): + return [] + page = self._get_page(f"/{dependency.name.replace('.', '-')}/") if page is None: return [] @@ -147,3 +158,9 @@ def _get_page(self, endpoint: str) -> SimpleRepositoryPage | None: if not response: return None return SimpleRepositoryPage(response.url, response.text) + + def _get_index_page(self) -> SimpleIndexPage | None: + response = self._get_response("") + if not response: + return None + return SimpleIndexPage(response.url, response.text) diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index c3c3cc4ce40..f360b401096 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -46,3 +46,30 @@ def __init__(self, url: str, content: str) -> None: if not url.endswith("/"): url += "/" super().__init__(url=url, content=content) + + +class SimpleIndexPage: + """Describes the root page of a PEP503 compliant repository. + + This contains a list of links, each one corresponding to a served project. + """ + + def __init__(self, url: str, content: str) -> None: + if not url.endswith("/"): + url += "/" + + self._url = url + self._content = content + self._parsed = html5lib.parse(content, namespaceHTMLElements=False) + self._cached_packages = set(self.links) + + @property + def links(self) -> Iterator[Link]: + # Note: PEP426 specifies that comparisons should be + # case-insensitive. For simplicity, we'll do lookups using + # lowercase-naming, and treating - and _ equivalently. + for anchor in self._parsed.findall(".//a"): + yield anchor.text.lower().replace("-", "_") + + def serves_package(self, name: str) -> bool: + return name.lower().replace("-", "_") in self._cached_packages diff --git a/tests/repositories/fixtures/legacy/index.html b/tests/repositories/fixtures/legacy/index.html new file mode 100644 index 00000000000..a66b1c6d759 --- /dev/null +++ b/tests/repositories/fixtures/legacy/index.html @@ -0,0 +1,3 @@ +pyyaml +missing-version +black diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index bb0c33f6a76..50017089d50 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -16,6 +16,7 @@ from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError from poetry.repositories.legacy_repository import LegacyRepository +from poetry.repositories.link_sources.html import SimpleIndexPage from poetry.repositories.link_sources.html import SimpleRepositoryPage @@ -41,8 +42,10 @@ class MockRepository(LegacyRepository): FIXTURES = Path(__file__).parent / "fixtures" / "legacy" - def __init__(self) -> None: - super().__init__("legacy", url="http://legacy.foo.bar", disable_cache=True) + def __init__(self, indexed: bool = False) -> None: + super().__init__( + "legacy", url="http://legacy.foo.bar", disable_cache=True, indexed=indexed + ) def _get_page(self, endpoint: str) -> SimpleRepositoryPage | None: parts = endpoint.split("/") @@ -388,6 +391,64 @@ def test_get_package_retrieves_packages_with_no_hashes(): ] == package.files +def test_unindexed_has_no_root_page(): + repo = MockRepository() + assert not repo._index_page + + +class MockIndexedRepository(MockRepository): + def __init__(self) -> None: + super().__init__(True) + + def _get_index_page(self) -> SimpleIndexPage | None: + fixture = self.FIXTURES / "index.html" + if not fixture.exists(): + return + + with fixture.open(encoding="utf-8") as f: + return SimpleIndexPage(self._url + "/", f.read()) + + +def test_indexed_has_root_page(): + repo = MockIndexedRepository() + assert repo._index_page + + +def test_indexed_root_page_has_valid_content(): + repo = MockIndexedRepository() + assert repo._index_page.serves_package("pyyaml") + + +def test_indexed_fails_on_missing(): + repo = MockIndexedRepository() + + packages = repo.find_packages(Factory.create_dependency("this-doesnt-exist", "*")) + + assert packages == [] + + +def test_indexed_succeeds_on_existing(): + repo = MockIndexedRepository() + + packages = repo.find_packages(Factory.create_dependency("pyyaml", "*")) + + assert len(packages) == 1 + + +def test_indexed_pep426_underscore_hyphen(): + repo = MockIndexedRepository() + + # 'missing-version' in the index + assert repo._index_page.serves_package("missing_version") + + +def test_indexed_pep426_case_insensitive(): + repo = MockIndexedRepository() + + # 'black' in the index + assert repo._index_page.serves_package("Black") + + class MockHttpRepository(LegacyRepository): def __init__( self, endpoint_responses: dict, http: type[httpretty.httpretty] From d57cc29c893c2fdc16e4a22f8febf64120a95287 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Thu, 26 May 2022 20:56:49 +0200 Subject: [PATCH 02/10] fix mypy post rebase --- src/poetry/repositories/link_sources/html.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index f360b401096..9c174ae5f29 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -64,12 +64,13 @@ def __init__(self, url: str, content: str) -> None: self._cached_packages = set(self.links) @property - def links(self) -> Iterator[Link]: + def links(self) -> Iterator[str]: # Note: PEP426 specifies that comparisons should be # case-insensitive. For simplicity, we'll do lookups using # lowercase-naming, and treating - and _ equivalently. for anchor in self._parsed.findall(".//a"): - yield anchor.text.lower().replace("-", "_") + text: str = anchor.text + yield text.lower().replace("-", "_") def serves_package(self, name: str) -> bool: return name.lower().replace("-", "_") in self._cached_packages From 66c2926819fc192e7323e0d5293a525c0660a2db Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Thu, 26 May 2022 21:07:36 +0200 Subject: [PATCH 03/10] pre-commit is lazy --- src/poetry/repositories/link_sources/html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index 9c174ae5f29..aa7bd9f822c 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -69,7 +69,10 @@ def links(self) -> Iterator[str]: # case-insensitive. For simplicity, we'll do lookups using # lowercase-naming, and treating - and _ equivalently. for anchor in self._parsed.findall(".//a"): - text: str = anchor.text + text: str | None = anchor.text + if text is None: + continue + yield text.lower().replace("-", "_") def serves_package(self, name: str) -> bool: From b404d7a4c0f6f308150ba1f915af64887bc3e613 Mon Sep 17 00:00:00 2001 From: Ewoud Samuels Date: Fri, 3 Jun 2022 16:28:02 +0200 Subject: [PATCH 04/10] Add indexed keyword to Source dataclass The new indexed keyword introduced for pre-fetching legacy repositories was not known the Source class. --- src/poetry/config/source.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/poetry/config/source.py b/src/poetry/config/source.py index f3af0c589e2..c4836a73139 100644 --- a/src/poetry/config/source.py +++ b/src/poetry/config/source.py @@ -9,6 +9,7 @@ class Source: url: str default: bool = dataclasses.field(default=False) secondary: bool = dataclasses.field(default=False) + indexed: bool = dataclasses.field(default=False) def to_dict(self) -> dict[str, str | bool]: return dataclasses.asdict(self) From 122b02e1d5ad9b28abef2c91e29f62d1e652d848 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Fri, 3 Jun 2022 23:16:50 +0200 Subject: [PATCH 05/10] Update src/poetry/repositories/link_sources/html.py Co-authored-by: Bjorn Neergaard --- src/poetry/repositories/link_sources/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index aa7bd9f822c..7adeba827d4 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -49,7 +49,7 @@ def __init__(self, url: str, content: str) -> None: class SimpleIndexPage: - """Describes the root page of a PEP503 compliant repository. + """Describes the root page of a PEP 503 compliant repository. This contains a list of links, each one corresponding to a served project. """ From 4f003a2432d20e6e2f4bc197f94071b631889060 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Fri, 3 Jun 2022 23:17:18 +0200 Subject: [PATCH 06/10] Update docs/repositories.md Co-authored-by: Bjorn Neergaard --- docs/repositories.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/repositories.md b/docs/repositories.md index b9dd26ae82b..481437a46d2 100644 --- a/docs/repositories.md +++ b/docs/repositories.md @@ -237,7 +237,7 @@ Note the trailing `/simple/`. This is important when configuring {{% /note %}} -Repositories following the [PEP503](https://peps.python.org/pep-0503/) +Repositories following the [PEP 503](https://peps.python.org/pep-0503/) specification should expose a root page with individual links for each package it serves. This isn't reliably implemented everywhere, which leads to increased network traffic and slower resolve times. If you're From e85e1c5d28394942e32a7b5b270a8cfce7c5bdd9 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Tue, 7 Jun 2022 18:08:06 +0200 Subject: [PATCH 07/10] update from feedback --- src/poetry/factory.py | 8 +++- src/poetry/repositories/indexed.py | 42 ++++++++++++++++++++ src/poetry/repositories/legacy_repository.py | 17 -------- tests/repositories/test_legacy_repository.py | 35 ++++++++-------- 4 files changed, 66 insertions(+), 36 deletions(-) create mode 100644 src/poetry/repositories/indexed.py diff --git a/src/poetry/factory.py b/src/poetry/factory.py index e58d2d5b260..15c446f5528 100644 --- a/src/poetry/factory.py +++ b/src/poetry/factory.py @@ -174,6 +174,7 @@ def configure_sources( def create_package_source( cls, source: dict[str, str], auth_config: Config, disable_cache: bool = False ) -> LegacyRepository: + from poetry.repositories.indexed import IndexedLegacyRepository from poetry.repositories.legacy_repository import LegacyRepository from poetry.repositories.single_page_repository import SinglePageRepository @@ -191,13 +192,18 @@ def create_package_source( if re.match(r".*\.(htm|html)$", url): repository_class = SinglePageRepository + if indexed: + raise RuntimeError( + "cannot set indexed=True for a single-page repository" + ) + elif indexed: + repository_class = IndexedLegacyRepository return repository_class( name, url, config=auth_config, disable_cache=disable_cache, - indexed=indexed, ) @classmethod diff --git a/src/poetry/repositories/indexed.py b/src/poetry/repositories/indexed.py new file mode 100644 index 00000000000..63a13146c29 --- /dev/null +++ b/src/poetry/repositories/indexed.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from poetry.core.packages.package import Package + +from poetry.repositories.exceptions import RepositoryError +from poetry.repositories.legacy_repository import LegacyRepository +from poetry.repositories.link_sources.html import SimpleIndexPage + + +if TYPE_CHECKING: + from poetry.core.packages.dependency import Dependency + + from poetry.config.config import Config + + +class IndexedLegacyRepository(LegacyRepository): + def __init__( + self, + name: str, + url: str, + config: Config | None = None, + disable_cache: bool = False, + ) -> None: + super().__init__(name, url.rstrip("/"), config, disable_cache) + + self._index_page = self._get_index_page() + + def find_packages(self, dependency: Dependency) -> list[Package]: + if not self._index_page.serves_package(dependency.name): + return [] + + return super().find_packages(dependency) + + def _get_index_page(self) -> SimpleIndexPage | None: + response = self._get_response("") + if not response: + raise RepositoryError( + f"Failed fetching index page for repository {self.name}" + ) + return SimpleIndexPage(response.url, response.text) diff --git a/src/poetry/repositories/legacy_repository.py b/src/poetry/repositories/legacy_repository.py index 4f6653376fd..11d14af1cb3 100644 --- a/src/poetry/repositories/legacy_repository.py +++ b/src/poetry/repositories/legacy_repository.py @@ -9,7 +9,6 @@ from poetry.inspection.info import PackageInfo from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.http import HTTPRepository -from poetry.repositories.link_sources.html import SimpleIndexPage from poetry.repositories.link_sources.html import SimpleRepositoryPage from poetry.utils.helpers import canonicalize_name @@ -28,17 +27,12 @@ def __init__( url: str, config: Config | None = None, disable_cache: bool = False, - indexed: bool = False, ) -> None: if name == "pypi": raise ValueError("The name [pypi] is reserved for repositories") super().__init__(name, url.rstrip("/"), config, disable_cache) - self._index_page = None - if indexed: - self._index_page = self._get_index_page() - def find_packages(self, dependency: Dependency) -> list[Package]: packages = [] constraint, allow_prereleases = self._get_constraints_from_dependency( @@ -54,11 +48,6 @@ def find_packages(self, dependency: Dependency) -> list[Package]: if self._cache.store("matches").has(key): versions = self._cache.store("matches").get(key) else: - if self._index_page is not None and not self._index_page.serves_package( - dependency.name - ): - return [] - page = self._get_page(f"/{dependency.name.replace('.', '-')}/") if page is None: return [] @@ -158,9 +147,3 @@ def _get_page(self, endpoint: str) -> SimpleRepositoryPage | None: if not response: return None return SimpleRepositoryPage(response.url, response.text) - - def _get_index_page(self) -> SimpleIndexPage | None: - response = self._get_response("") - if not response: - return None - return SimpleIndexPage(response.url, response.text) diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index 50017089d50..f3b58750a0f 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -15,6 +15,7 @@ from poetry.factory import Factory from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError +from poetry.repositories.indexed import IndexedLegacyRepository from poetry.repositories.legacy_repository import LegacyRepository from poetry.repositories.link_sources.html import SimpleIndexPage from poetry.repositories.link_sources.html import SimpleRepositoryPage @@ -42,10 +43,8 @@ class MockRepository(LegacyRepository): FIXTURES = Path(__file__).parent / "fixtures" / "legacy" - def __init__(self, indexed: bool = False) -> None: - super().__init__( - "legacy", url="http://legacy.foo.bar", disable_cache=True, indexed=indexed - ) + def __init__(self) -> None: + super().__init__("legacy", url="http://legacy.foo.bar", disable_cache=True) def _get_page(self, endpoint: str) -> SimpleRepositoryPage | None: parts = endpoint.split("/") @@ -391,15 +390,7 @@ def test_get_package_retrieves_packages_with_no_hashes(): ] == package.files -def test_unindexed_has_no_root_page(): - repo = MockRepository() - assert not repo._index_page - - -class MockIndexedRepository(MockRepository): - def __init__(self) -> None: - super().__init__(True) - +class MockIndexedRepository(MockRepository, IndexedLegacyRepository): def _get_index_page(self) -> SimpleIndexPage | None: fixture = self.FIXTURES / "index.html" if not fixture.exists(): @@ -409,11 +400,6 @@ def _get_index_page(self) -> SimpleIndexPage | None: return SimpleIndexPage(self._url + "/", f.read()) -def test_indexed_has_root_page(): - repo = MockIndexedRepository() - assert repo._index_page - - def test_indexed_root_page_has_valid_content(): repo = MockIndexedRepository() assert repo._index_page.serves_package("pyyaml") @@ -449,6 +435,19 @@ def test_indexed_pep426_case_insensitive(): assert repo._index_page.serves_package("Black") +def test_indexed_retrieves_package_with_no_hashes(): + repo = MockIndexedRepository() + + package = repo.package("jupyter", "1.0.0") + + assert [ + { + "file": "jupyter-1.0.0.tar.gz", + "hash": "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f", # noqa: E501 + } + ] == package.files + + class MockHttpRepository(LegacyRepository): def __init__( self, endpoint_responses: dict, http: type[httpretty.httpretty] From 463d1846829fe3cf2625d91133733f5fe790cef9 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Tue, 7 Jun 2022 18:15:26 +0200 Subject: [PATCH 08/10] must be there --- src/poetry/repositories/indexed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/poetry/repositories/indexed.py b/src/poetry/repositories/indexed.py index 63a13146c29..f517fc426e9 100644 --- a/src/poetry/repositories/indexed.py +++ b/src/poetry/repositories/indexed.py @@ -33,7 +33,7 @@ def find_packages(self, dependency: Dependency) -> list[Package]: return super().find_packages(dependency) - def _get_index_page(self) -> SimpleIndexPage | None: + def _get_index_page(self) -> SimpleIndexPage: response = self._get_response("") if not response: raise RepositoryError( From 174aca4c87fbf6782a71c6d60cd49a2063a8ef49 Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Tue, 7 Jun 2022 18:21:27 +0200 Subject: [PATCH 09/10] canonicalize --- src/poetry/repositories/link_sources/html.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/poetry/repositories/link_sources/html.py b/src/poetry/repositories/link_sources/html.py index 7adeba827d4..ab002444bc8 100644 --- a/src/poetry/repositories/link_sources/html.py +++ b/src/poetry/repositories/link_sources/html.py @@ -9,6 +9,7 @@ from poetry.core.packages.utils.link import Link from poetry.repositories.link_sources.base import LinkSource +from poetry.utils.helpers import canonicalize_name if TYPE_CHECKING: @@ -73,7 +74,7 @@ def links(self) -> Iterator[str]: if text is None: continue - yield text.lower().replace("-", "_") + yield canonicalize_name(text) def serves_package(self, name: str) -> bool: - return name.lower().replace("-", "_") in self._cached_packages + return canonicalize_name(name) in self._cached_packages From a0c1846c0011eae094b785805590d37cea92a4cd Mon Sep 17 00:00:00 2001 From: Tom Solberg Date: Tue, 7 Jun 2022 18:41:36 +0200 Subject: [PATCH 10/10] move import --- src/poetry/repositories/indexed.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/poetry/repositories/indexed.py b/src/poetry/repositories/indexed.py index f517fc426e9..0ec18bed37e 100644 --- a/src/poetry/repositories/indexed.py +++ b/src/poetry/repositories/indexed.py @@ -2,8 +2,6 @@ from typing import TYPE_CHECKING -from poetry.core.packages.package import Package - from poetry.repositories.exceptions import RepositoryError from poetry.repositories.legacy_repository import LegacyRepository from poetry.repositories.link_sources.html import SimpleIndexPage @@ -11,6 +9,7 @@ if TYPE_CHECKING: from poetry.core.packages.dependency import Dependency + from poetry.core.packages.package import Package from poetry.config.config import Config