From 57838a10d7fde68e5042ea8c6652906c1ff46d37 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Mon, 28 Oct 2024 17:17:53 +0000 Subject: [PATCH 1/3] Adding a pre-commit Adding more go linting Rebase off golangci GHA --- .github/workflows/go.yml | 16 +++++++++++++--- .pre-commit-config.yaml | 14 ++++++++++++++ MANIFEST.in | 2 +- generate_python_package.sh | 2 -- pyproject.toml | 2 +- requirements.txt | 2 +- src/polyglot.py | 2 +- 7 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 7b0e5a4..b1c0621 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -16,15 +16,25 @@ jobs: - uses: actions/checkout@v4 - name: Set up Go - uses: actions/setup-go@v4 - with: - go-version: "1.20" + uses: golangci/golangci-lint-action@v6 + + # uses: actions/setup-go@v4 + # with: + # go-version: "1.20" - name: Install linux deps run: | sudo apt-get update sudo apt-get -y install libvips-dev + - name: Install pre-commit + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Run pre-commit + run: pre-commit run --all-files + - name: Build run: cd src/cmd/main && go build -v main.go diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7127eaf --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - repo: https://github.com/dnephin/pre-commit-golang + rev: v0.5.1 + hooks: + - id: go-fmt + - id: go-imports + - id: golangci-lint + args: ["run", "src"] diff --git a/MANIFEST.in b/MANIFEST.in index f0177f0..8bf9108 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,4 +2,4 @@ include *.md include *.py include datago/*.h include datago/*.c -include datago/*.so \ No newline at end of file +include datago/*.so diff --git a/generate_python_package.sh b/generate_python_package.sh index 2839eb2..3eeee94 100755 --- a/generate_python_package.sh +++ b/generate_python_package.sh @@ -22,5 +22,3 @@ rm LICENSE rm MANIFEST.in cd ../../.. - - diff --git a/pyproject.toml b/pyproject.toml index 1b6f0d6..56ccdfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,4 +15,4 @@ classifiers = [ [project.urls] Homepage = "https://github.com/photoroom/datago" -Issues = "https://github.com/photoroom/datago/issues" \ No newline at end of file +Issues = "https://github.com/photoroom/datago/issues" diff --git a/requirements.txt b/requirements.txt index 55b033e..e079f8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytest \ No newline at end of file +pytest diff --git a/src/polyglot.py b/src/polyglot.py index 98202d3..dab8fb7 100644 --- a/src/polyglot.py +++ b/src/polyglot.py @@ -63,4 +63,4 @@ def go_array_to_pil_image(go_array): return Image.frombuffer("RGBA", (w, h), np_array, "raw", "RGBA", 0, 1) assert c == 3, "Expected 3 channels" - return Image.fromarray(np_array) \ No newline at end of file + return Image.fromarray(np_array) From 7defb20a39e432098bdb28d610441b946d740da3 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 31 Oct 2024 11:20:00 +0000 Subject: [PATCH 2/3] simpler hierarchy Lint fixes --- .github/workflows/go.yml | 17 +++++----- .github/workflows/gopy.yml | 4 +-- .pre-commit-config.yaml | 3 -- README.md | 41 ++++++++++--------------- src/benchmark.py => benchmark.py | 0 {src/cmd/main => cmd}/main.go | 12 ++++++-- generate_python_package.sh | 4 +-- src/go.mod => go.mod | 0 src/go.sum => go.sum | 0 {src/pkg/client => pkg}/backend_http.go | 0 {src/pkg/client => pkg}/client.go | 2 -- {src/pkg/client => pkg}/frontend_db.go | 0 {src/pkg/client => pkg}/serdes.go | 6 +++- {src/pkg/client => pkg}/transforms.go | 0 {src/pkg/client => pkg}/utils.go | 0 src/polyglot.py => polyglot.py | 0 {src/tests => tests}/client_test.go | 2 +- 17 files changed, 42 insertions(+), 49 deletions(-) rename src/benchmark.py => benchmark.py (100%) rename {src/cmd/main => cmd}/main.go (95%) rename src/go.mod => go.mod (100%) rename src/go.sum => go.sum (100%) rename {src/pkg/client => pkg}/backend_http.go (100%) rename {src/pkg/client => pkg}/client.go (99%) rename {src/pkg/client => pkg}/frontend_db.go (100%) rename {src/pkg/client => pkg}/serdes.go (97%) rename {src/pkg/client => pkg}/transforms.go (100%) rename {src/pkg/client => pkg}/utils.go (100%) rename src/polyglot.py => polyglot.py (100%) rename {src/tests => tests}/client_test.go (99%) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index b1c0621..46fd8bf 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -15,28 +15,25 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Go - uses: golangci/golangci-lint-action@v6 - - # uses: actions/setup-go@v4 - # with: - # go-version: "1.20" - - name: Install linux deps run: | sudo apt-get update - sudo apt-get -y install libvips-dev + sudo apt-get -y install libvips-dev libjpeg-turbo8-dev + + - name: Set up Go + uses: golangci/golangci-lint-action@v6 - name: Install pre-commit run: | python -m pip install --upgrade pip pip install pre-commit + go install golang.org/x/tools/cmd/goimports@latest - name: Run pre-commit run: pre-commit run --all-files - name: Build - run: cd src/cmd/main && go build -v main.go + run: cd cmd && go build -v main.go - name: Test env: @@ -44,4 +41,4 @@ jobs: DATAROOM_TEST_SOURCE: ${{ secrets.DATAROOM_TEST_SOURCE }} DATAROOM_API_URL: ${{ secrets.DATAROOM_API_URL }} - run: cd src/tests && go test -v . + run: cd tests && go test -v . diff --git a/.github/workflows/gopy.yml b/.github/workflows/gopy.yml index f2694f7..1541e45 100644 --- a/.github/workflows/gopy.yml +++ b/.github/workflows/gopy.yml @@ -38,7 +38,7 @@ jobs: - name: Build python module run: | - cd src/pkg/client + cd pkg gopy pkg -author="Photoroom" -email="team@photoroom.com" -name="datago" . export DESTINATION="../../../build" mkdir -p $DESTINATION/datago @@ -47,7 +47,7 @@ jobs: mv Makefile $DESTINATION/. mv README.md $DESTINATION/. rm LICENSE MANIFEST.in - cd ../../../build + cd ../build - name: Install python module run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7127eaf..a88190e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,6 +9,3 @@ repos: rev: v0.5.1 hooks: - id: go-fmt - - id: go-imports - - id: golangci-lint - args: ["run", "src"] diff --git a/README.md b/README.md index a3344a7..628a93b 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ [![Build & Test](https://github.com/Photoroom/datago/actions/workflows/go.yml/badge.svg)](https://github.com/Photoroom/datago/actions/workflows/go.yml) [![Gopy](https://github.com/Photoroom/datago/actions/workflows/gopy.yml/badge.svg)](https://github.com/Photoroom/datago/actions/workflows/gopy.yml) -datago -====== +# datago A golang-based data loader which can be used from Python. Compatible with a soon-to-be open sourced VectorDB-enabled data stack, which exposes HTTP requests. Datago handles, outside of the Python GIL + - per sample IO from object storage - deserialization (jpg and png decompression) - some optional vision processing (aligning different image payloads) @@ -19,11 +19,9 @@ Datago is rank and world-size aware, in which case the samples are dispatched de Screenshot 2024-09-24 at 9 39 44 PM -
Use it -Use the package from Python ---------------------------- +## Use the package from Python ```python from datago import datago @@ -40,26 +38,22 @@ for _ in range(10): Please note that the image buffers will be passed around as raw pointers, they can be re-interpreted in python with the attached helpers - -Match the raw exported buffers with typical python types --------------------------------------------------------- +## Match the raw exported buffers with typical python types See helper functions provided in `polyglot.py`, should be self explanatory
Build it -Install deps ------------- +## Install deps ```bash $ sudo apt install golang libjpeg-turbo8-dev libvips-dev $ sudo ldconfig ``` -Build a benchmark CLI ---------------------- +## Build a benchmark CLI -From the root of this project `datago_src`: +From the root of this project: ```bash $ go build cmd/main/main.go @@ -77,23 +71,20 @@ Running it with additional sanity checks $ go run -race cmd/main/main.go ``` -Run the go test suite ---------------------- +## Run the go test suite -From the src folder +From the root folder ```bash $ go test -v tests/client_test.go ``` -Refresh the python package and its binaries -------------------------------------------- +## Refresh the python package and its binaries - Install the dependencies as detailed in the next point - Run the `generate_python_package.sh` script -Generate the python package binaries manually ---------------------------------------------- +## Generate the python package binaries manually ```bash $ python3 -m pip install pybindgen @@ -103,11 +94,12 @@ $ go install golang.org/x/image/draw ``` NOTE: + - you may need to add `~/go/bin` to your PATH so that gopy is found. - - Either `export PATH=$PATH:~/go/bin` or add it to your .bashrc - you may need this to make sure that LDD looks at the current folder `export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:.` -then from the /pkg/client folder: +then from the /pkg folder: ```bash $ gopy pkg -author="Photoroom" -email="team@photoroom.com" -url="" -name="datago" -version="0.0.1" . @@ -115,18 +107,17 @@ $ gopy pkg -author="Photoroom" -email="team@photoroom.com" -url="" -name="datago then you can `pip install -e .` from here. +## Update the pypi release (maintainers) -Update the pypi release (maintainers) -------------------------------------- ``` python3 setup.py sdist python3 -m twine upload dist/* --verbose ``` +
+# License -License -======= MIT License Copyright (c) 2024 Photoroom diff --git a/src/benchmark.py b/benchmark.py similarity index 100% rename from src/benchmark.py rename to benchmark.py diff --git a/src/cmd/main/main.go b/cmd/main.go similarity index 95% rename from src/cmd/main/main.go rename to cmd/main.go index 6307f60..70b14b5 100644 --- a/src/cmd/main/main.go +++ b/cmd/main.go @@ -1,7 +1,7 @@ package main import ( - datago "datago/pkg/client" + datago "datago/pkg" "flag" "fmt" "os" @@ -53,13 +53,19 @@ func main() { f, _ := os.Create("trace.out") // read with go tool trace trace.out - trace.Start(f) + err := trace.Start(f) + if err != nil { + panic(err) + } defer trace.Stop() } { f, _ := os.Create("cpu.prof") // read with go tool pprof cpu.prof - pprof.StartCPUProfile(f) + err := pprof.StartCPUProfile(f) + if err != nil { + panic(err) + } defer pprof.StopCPUProfile() } } diff --git a/generate_python_package.sh b/generate_python_package.sh index 3eeee94..181908f 100755 --- a/generate_python_package.sh +++ b/generate_python_package.sh @@ -11,7 +11,7 @@ DESTINATION="../../../python_$python_version" rm -rf $DESTINATION # Build the python package via the gopy toolchain -cd src/pkg/client +cd pkg gopy pkg -author="Photoroom" -email="team@photoroom.com" -url="" -name="datago" -version="0.3" . mkdir -p $DESTINATION/datago mv datago/* $DESTINATION/datago/. @@ -21,4 +21,4 @@ mv README.md $DESTINATION/. rm LICENSE rm MANIFEST.in -cd ../../.. +cd .. diff --git a/src/go.mod b/go.mod similarity index 100% rename from src/go.mod rename to go.mod diff --git a/src/go.sum b/go.sum similarity index 100% rename from src/go.sum rename to go.sum diff --git a/src/pkg/client/backend_http.go b/pkg/backend_http.go similarity index 100% rename from src/pkg/client/backend_http.go rename to pkg/backend_http.go diff --git a/src/pkg/client/client.go b/pkg/client.go similarity index 99% rename from src/pkg/client/client.go rename to pkg/client.go index e444e65..636fca2 100644 --- a/src/pkg/client/client.go +++ b/pkg/client.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "log" - "net/http" "os" "runtime" "runtime/debug" @@ -88,7 +87,6 @@ type DatagoConfig struct { type DatagoClient struct { concurrency int - baseRequest http.Request context context.Context waitGroup *sync.WaitGroup diff --git a/src/pkg/client/frontend_db.go b/pkg/frontend_db.go similarity index 100% rename from src/pkg/client/frontend_db.go rename to pkg/frontend_db.go diff --git a/src/pkg/client/serdes.go b/pkg/serdes.go similarity index 97% rename from src/pkg/client/serdes.go rename to pkg/serdes.go index 5c22ce1..8f37ca9 100644 --- a/src/pkg/client/serdes.go +++ b/pkg/serdes.go @@ -133,6 +133,7 @@ func fetchURL(client *http.Client, url string, retries int) (URLPayload, error) if i == retries-1 { err_msg = fmt.Sprintf("failed to fetch %s %s", url, err) } + exponentialBackoffWait(i) continue } defer resp.Body.Close() @@ -141,13 +142,14 @@ func fetchURL(client *http.Client, url string, retries int) (URLPayload, error) if err != nil { // Renew the http client, not a shared resource client = &http.Client{Timeout: 30 * time.Second} + exponentialBackoffWait(i) continue } return URLPayload{url: url, content: body_bytes}, nil } - return URLPayload{url: url, content: nil}, fmt.Errorf(err_msg) + return URLPayload{url: url, content: nil}, fmt.Errorf("%s", err_msg) } func fetchImage(client *http.Client, url string, retries int, transform *ARAwareTransform, aspect_ratio float64, pre_encode_image bool, is_mask bool) (*ImagePayload, float64, error) { @@ -158,6 +160,7 @@ func fetchImage(client *http.Client, url string, retries int, transform *ARAware resp, err := client.Get(url) if err != nil { err_report = err + exponentialBackoffWait(i) // Renew the client in case the connection was closed client = &http.Client{Timeout: 30 * time.Second} @@ -168,6 +171,7 @@ func fetchImage(client *http.Client, url string, retries int, transform *ARAware body_bytes, err := readBodyBuffered(resp) if err != nil { err_report = err + exponentialBackoffWait(i) continue } diff --git a/src/pkg/client/transforms.go b/pkg/transforms.go similarity index 100% rename from src/pkg/client/transforms.go rename to pkg/transforms.go diff --git a/src/pkg/client/utils.go b/pkg/utils.go similarity index 100% rename from src/pkg/client/utils.go rename to pkg/utils.go diff --git a/src/polyglot.py b/polyglot.py similarity index 100% rename from src/polyglot.py rename to polyglot.py diff --git a/src/tests/client_test.go b/tests/client_test.go similarity index 99% rename from src/tests/client_test.go rename to tests/client_test.go index 7774cbf..296f80f 100644 --- a/src/tests/client_test.go +++ b/tests/client_test.go @@ -4,7 +4,7 @@ import ( "os" "testing" - datago "datago/pkg/client" + datago "datago/pkg" "github.com/davidbyttow/govips/v2/vips" ) From b9a929bd4ff36c39a6faeedd0579de3d631163e7 Mon Sep 17 00:00:00 2001 From: Benjamin Lefaudeux Date: Thu, 31 Oct 2024 11:42:31 +0000 Subject: [PATCH 3/3] moving the python files around, slightly cleaner landing page --- .github/workflows/gopy.yml | 5 +++-- benchmark.py => python/benchmark.py | 15 +++++++++++---- polyglot.py => python/polyglot.py | 0 {python_tests => python/tests}/datago_test.py | 0 4 files changed, 14 insertions(+), 6 deletions(-) rename benchmark.py => python/benchmark.py (85%) rename polyglot.py => python/polyglot.py (100%) rename {python_tests => python/tests}/datago_test.py (100%) diff --git a/.github/workflows/gopy.yml b/.github/workflows/gopy.yml index 1541e45..a23a360 100644 --- a/.github/workflows/gopy.yml +++ b/.github/workflows/gopy.yml @@ -40,13 +40,14 @@ jobs: run: | cd pkg gopy pkg -author="Photoroom" -email="team@photoroom.com" -name="datago" . - export DESTINATION="../../../build" + export DESTINATION="../build" mkdir -p $DESTINATION/datago mv datago/* $DESTINATION/datago/. mv setup.py $DESTINATION/. mv Makefile $DESTINATION/. mv README.md $DESTINATION/. rm LICENSE MANIFEST.in + ls cd ../build - name: Install python module @@ -64,4 +65,4 @@ jobs: run: | ls python3 -m pip install -r requirements.txt - pytest -xv python_tests/* + pytest -xv python/tests/* diff --git a/benchmark.py b/python/benchmark.py similarity index 85% rename from benchmark.py rename to python/benchmark.py index 85d9f6b..f170ade 100644 --- a/benchmark.py +++ b/python/benchmark.py @@ -3,13 +3,15 @@ import typer from tqdm import tqdm import numpy as np -from polyglot import go_array_to_pil_image, go_array_to_numpy +from python.polyglot import go_array_to_pil_image, go_array_to_numpy def benchmark( source: str = typer.Option("SOURCE", help="The source to test out"), limit: int = typer.Option(2000, help="The number of samples to test on"), - crop_and_resize: bool = typer.Option(True, help="Crop and resize the images on the fly"), + crop_and_resize: bool = typer.Option( + True, help="Crop and resize the images on the fly" + ), require_images: bool = typer.Option(True, help="Request the original images"), require_embeddings: bool = typer.Option(False, help="Request embeddings"), test_masks: bool = typer.Option(True, help="Test masks"), @@ -49,8 +51,13 @@ def benchmark( for _, mask_buffer in sample.Masks.items(): mask = go_array_to_pil_image(mask_buffer) - if hasattr(sample, "AdditionalImages") and "masked_image" in sample.AdditionalImages: - masked_image = go_array_to_pil_image(sample.AdditionalImages["masked_image"]) + if ( + hasattr(sample, "AdditionalImages") + and "masked_image" in sample.AdditionalImages + ): + masked_image = go_array_to_pil_image( + sample.AdditionalImages["masked_image"] + ) # Bring the latents to numpy if hasattr(sample, "Latents"): diff --git a/polyglot.py b/python/polyglot.py similarity index 100% rename from polyglot.py rename to python/polyglot.py diff --git a/python_tests/datago_test.py b/python/tests/datago_test.py similarity index 100% rename from python_tests/datago_test.py rename to python/tests/datago_test.py