Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add prefetching of index in PEP503 repositories #5442

Closed
14 changes: 14 additions & 0 deletions docs/repositories.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,20 @@ Note the trailing `/simple/`. This is important when configuring

{{% /note %}}

Repositories following the [PEP 503](https://peps.python.org/pep-0503/)
specification should expose a root page with an individual link for each
package they serve. This isn't reliably implemented everywhere, so Poetry
does not rely on the listing by default and has to query the repository for
every package, which leads to increased network traffic and slower
resolution times. If you're using a repository that has a valid listing,
you can set the `indexed` property to let Poetry prefetch and cache this
package list.

```toml
[[tool.poetry.source]]
name = "foo"
url = "https://foo.bar/simple/"
indexed = true
```

In addition to [PEP 503](https://peps.python.org/pep-0503/), Poetry can also handle simple API
repositories that implement [PEP 658](https://peps.python.org/pep-0658/) (*Introduced in 1.2.0*).
This is helpful in reducing dependency resolution time for packages from these sources as Poetry can
Expand Down
1 change: 1 addition & 0 deletions src/poetry/config/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class Source:
url: str
default: bool = dataclasses.field(default=False)
secondary: bool = dataclasses.field(default=False)
indexed: bool = dataclasses.field(default=False)

def to_dict(self) -> dict[str, str | bool]:
return dataclasses.asdict(self)
8 changes: 8 additions & 0 deletions src/poetry/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def configure_sources(
def create_package_source(
cls, source: dict[str, str], auth_config: Config, disable_cache: bool = False
) -> LegacyRepository:
from poetry.repositories.indexed import IndexedLegacyRepository
from poetry.repositories.legacy_repository import LegacyRepository
from poetry.repositories.single_page_repository import SinglePageRepository

Expand All @@ -185,11 +186,18 @@ def create_package_source(
raise RuntimeError("Missing [name] in source.")
name = source["name"]
url = source["url"]
indexed = bool(source.get("indexed", False))

repository_class = LegacyRepository

if re.match(r".*\.(htm|html)$", url):
repository_class = SinglePageRepository
if indexed:
raise RuntimeError(
"cannot set indexed=True for a single-page repository"
)
elif indexed:
repository_class = IndexedLegacyRepository

return repository_class(
name,
Expand Down
41 changes: 41 additions & 0 deletions src/poetry/repositories/indexed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from poetry.repositories.exceptions import RepositoryError
from poetry.repositories.legacy_repository import LegacyRepository
from poetry.repositories.link_sources.html import SimpleIndexPage


if TYPE_CHECKING:
from poetry.core.packages.dependency import Dependency
from poetry.core.packages.package import Package

from poetry.config.config import Config


class IndexedLegacyRepository(LegacyRepository):
    """Legacy repository that consults a prefetched root index page.

    The root page is fetched once, when the repository is constructed, and
    is then used to short-circuit lookups for packages the index does not
    list.
    """

    def __init__(
        self,
        name: str,
        url: str,
        config: Config | None = None,
        disable_cache: bool = False,
    ) -> None:
        # Trailing slashes are removed before the URL reaches the parent class.
        normalized_url = url.rstrip("/")
        super().__init__(name, normalized_url, config, disable_cache)

        # Fetched eagerly: a failure to load the index surfaces at construction.
        self._index_page = self._get_index_page()

    def find_packages(self, dependency: Dependency) -> list[Package]:
        """Return the packages matching *dependency*.

        An empty list is returned without delegating to the parent class
        when the index page does not list the dependency's name.
        """
        if self._index_page.serves_package(dependency.name):
            return super().find_packages(dependency)

        return []

    def _get_index_page(self) -> SimpleIndexPage:
        """Fetch the repository root and wrap it in a ``SimpleIndexPage``.

        Raises:
            RepositoryError: if the response for the root endpoint is falsy.
        """
        # The empty endpoint is presumably resolved against the repository
        # base URL by the inherited helper -- confirm in LegacyRepository.
        root_response = self._get_response("")
        if root_response:
            return SimpleIndexPage(root_response.url, root_response.text)

        raise RepositoryError(
            f"Failed fetching index page for repository {self.name}"
        )
32 changes: 32 additions & 0 deletions src/poetry/repositories/link_sources/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from poetry.core.packages.utils.link import Link

from poetry.repositories.link_sources.base import LinkSource
from poetry.utils.helpers import canonicalize_name


if TYPE_CHECKING:
Expand Down Expand Up @@ -46,3 +47,34 @@ def __init__(self, url: str, content: str) -> None:
if not url.endswith("/"):
url += "/"
super().__init__(url=url, content=content)


class SimpleIndexPage:
    """Describes the root page of a PEP 503 compliant repository.

    This contains a list of links, each one corresponding to a served project.
    The project names are collected once at construction time and kept in a
    set, so ``serves_package`` is a plain set membership test.
    """

    def __init__(self, url: str, content: str) -> None:
        """Parse *content* as the HTML of the index page located at *url*.

        A trailing slash is appended to *url* when it is missing.
        """
        if not url.endswith("/"):
            url += "/"

        self._url = url
        self._content = content
        self._parsed = html5lib.parse(content, namespaceHTMLElements=False)
        self._cached_packages = set(self.links)

    @property
    def links(self) -> Iterator[str]:
        """Yield the canonicalized project name of every anchor on the page.

        Anchors without text, or whose text is only whitespace, are skipped.
        """
        # Note: PEP426 specifies that comparisons should be
        # case-insensitive. For simplicity, we'll do lookups using
        # lowercase-naming, and treating - and _ equivalently.
        for anchor in self._parsed.findall(".//a"):
            text: str | None = anchor.text
            if text is None:
                continue

            # Pretty-printed index pages may put newlines/indentation around
            # the project name; strip them so the stored entry is the bare
            # name. (canonicalize_name is assumed not to strip whitespace
            # itself -- stripping first is harmless either way.)
            text = text.strip()
            if not text:
                continue

            yield canonicalize_name(text)

    def serves_package(self, name: str) -> bool:
        """Return whether *name*, once canonicalized, is listed on the page."""
        return canonicalize_name(name) in self._cached_packages
3 changes: 3 additions & 0 deletions tests/repositories/fixtures/legacy/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<a href="pyyaml/">pyyaml</a>
<a href="missing-version/">missing-version</a>
<a href="black/">black</a>
60 changes: 60 additions & 0 deletions tests/repositories/test_legacy_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
from poetry.factory import Factory
from poetry.repositories.exceptions import PackageNotFound
from poetry.repositories.exceptions import RepositoryError
from poetry.repositories.indexed import IndexedLegacyRepository
from poetry.repositories.legacy_repository import LegacyRepository
from poetry.repositories.link_sources.html import SimpleIndexPage
from poetry.repositories.link_sources.html import SimpleRepositoryPage


Expand Down Expand Up @@ -388,6 +390,64 @@ def test_get_package_retrieves_packages_with_no_hashes():
] == package.files


class MockIndexedRepository(MockRepository, IndexedLegacyRepository):
    """Indexed repository whose root page is read from a local fixture file."""

    def _get_index_page(self) -> SimpleIndexPage | None:
        """Build the index page from ``index.html`` in the fixtures directory.

        Returns ``None`` when the fixture file does not exist.
        """
        index_fixture = self.FIXTURES / "index.html"
        if index_fixture.exists():
            html = index_fixture.read_text(encoding="utf-8")
            return SimpleIndexPage(self._url + "/", html)

        return None


def test_indexed_root_page_has_valid_content():
    """The index page loaded from the fixture lists ``pyyaml``."""
    index_page = MockIndexedRepository()._index_page

    assert index_page.serves_package("pyyaml")


def test_indexed_fails_on_missing():
    """A dependency absent from the index resolves to no packages."""
    indexed_repo = MockIndexedRepository()
    unknown_dependency = Factory.create_dependency("this-doesnt-exist", "*")

    found = indexed_repo.find_packages(unknown_dependency)

    assert found == []


def test_indexed_succeeds_on_existing():
    """A dependency listed in the index resolves to exactly one package."""
    indexed_repo = MockIndexedRepository()
    known_dependency = Factory.create_dependency("pyyaml", "*")

    found = indexed_repo.find_packages(known_dependency)

    assert len(found) == 1


def test_indexed_pep426_underscore_hyphen():
    """Underscores and hyphens are treated as equivalent in lookups."""
    index_page = MockIndexedRepository()._index_page

    # The fixture index lists the project as 'missing-version'.
    assert index_page.serves_package("missing_version")


def test_indexed_pep426_case_insensitive():
    """Lookups ignore the letter case of the requested name."""
    index_page = MockIndexedRepository()._index_page

    # The fixture index lists the project as 'black'.
    assert index_page.serves_package("Black")


def test_indexed_retrieves_package_with_no_hashes():
    """``package()`` on the indexed repository reports the expected file entry."""
    expected_files = [
        {
            "file": "jupyter-1.0.0.tar.gz",
            "hash": "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f",  # noqa: E501
        }
    ]

    jupyter = MockIndexedRepository().package("jupyter", "1.0.0")

    assert expected_files == jupyter.files


class MockHttpRepository(LegacyRepository):
def __init__(
self, endpoint_responses: dict, http: type[httpretty.httpretty]
Expand Down