Skip to content

Commit

Permalink
added doc_id
Browse files Browse the repository at this point in the history
Signed-off-by: matouma <[email protected]>
  • Loading branch information
matouma committed Dec 14, 2024
2 parents 9feee2d + 2eb47bd commit c5c1540
Show file tree
Hide file tree
Showing 51 changed files with 720 additions and 1,043 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@ FROM docker.io/python:3.10.14-slim-bullseye

RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest

# Create a user and use it to run the transform
RUN useradd -ms /bin/bash dpk
USER dpk
Expand All @@ -16,19 +13,10 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=dpk:root data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root README.md README.md
COPY --chown=dpk:root dpk_doc_id/ dpk_doc_id/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -e .

# copy source data
COPY ./src/doc_id_transform_python.py .
COPY ./src/doc_id_local.py local/
RUN pip install --no-cache-dir -r requirements.txt

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310

FROM ${BASE_IMAGE}

RUN pip install --upgrade --no-cache-dir pip
RUN pip install --upgrade --no-cache-dir pip

# install pytest
RUN pip install --no-cache-dir pytest
Expand All @@ -14,24 +14,9 @@ COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

## Copy the python version of the transform
COPY --chown=ray:users python-transform/ python-transform/
RUN cd python-transform && pip install --no-cache-dir -e .

# Install ray project source
COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
COPY --chown=ray:users README.md README.md
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/doc_id_transform_ray.py .

# copy some of the samples in
COPY src/doc_id_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/
COPY --chown=ray:users dpk_doc_id/ dpk_doc_id/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install -r requirements.txt

# Set environment
ENV PYTHONPATH /home/ray
Expand All @@ -40,4 +25,4 @@ ENV PYTHONPATH /home/ray
ARG BUILD_DATE
ARG GIT_COMMIT
LABEL build-date=$BUILD_DATE
LABEL git-commit=$GIT_COMMIT
LABEL git-commit=$GIT_COMMIT
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ARG BASE_IMAGE=quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest
FROM ${BASE_IMAGE}
FROM quay.io/dataprep1/data-prep-kit/data-prep-kit-spark-3.5.2:latest

USER root
# install pytest
Expand All @@ -15,19 +14,12 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark]


# Install project source
COPY --chown=spark:root src/ src/
COPY --chown=spark:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/doc_id_transform_spark.py .
## Copy the python version of the transform
COPY --chown=spark:root dpk_doc_id/ dpk_doc_id/
COPY --chown=spark:root requirements.txt requirements.txt
RUN pip install -r requirements.txt

# copy some of the samples in
COPY src/doc_id_local_spark.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

USER spark

Expand Down
105 changes: 28 additions & 77 deletions transforms/universal/doc_id/Makefile
Original file line number Diff line number Diff line change
@@ -1,79 +1,30 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

set-versions::
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

kind-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-save-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-build; \
fi

include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# Name of the transform, taken from the basename of the directory make is
# running in. It is used to match against expected files and to define the
# transform's image name; in this file it also selects the dpk_<name> Python
# package that the run-cli-* sample targets execute.
# Simply expanded (:=) so the shell command runs once at parse time rather
# than on every reference to the variable.
TRANSFORM_NAME := $(shell basename `pwd`)

################################################################################



# Run the Spark flavor of the transform from the command line, reading the
# sample data in test-data/input and writing results to ./output, with
# --doc_id_int enabled.
# The `venv` goal and $(PYTHON) are not defined in this file; presumably they
# come from the included .make.cicd.targets -- confirm there.
# $(MAKE) (not a literal `make`) keeps -n, -j/jobserver and command-line
# variable overrides working in the sub-make.
# `.` is the POSIX spelling of `source`, so activation also works when SHELL
# is a plain sh; it behaves the same under bash.
.PHONY: run-cli-spark-sample
run-cli-spark-sample:
	$(MAKE) venv
	. venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).spark.transform \
		--data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
		--doc_id_int True

# Run the Ray flavor of the transform from the command line with
# --run_locally True, reading the sample data in test-data/input and writing
# results to ./output, with --doc_id_int enabled.
# The `venv` goal and $(PYTHON) are not defined in this file; presumably they
# come from the included .make.cicd.targets -- confirm there.
# $(MAKE) (not a literal `make`) keeps -n, -j/jobserver and command-line
# variable overrides working in the sub-make.
# `.` is the POSIX spelling of `source`, so activation also works when SHELL
# is a plain sh; it behaves the same under bash.
.PHONY: run-cli-ray-sample
run-cli-ray-sample:
	$(MAKE) venv
	. venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).ray.transform \
		--run_locally True --data_local_config "{ 'input_folder' : 'test-data/input', 'output_folder' : 'output'}" \
		--doc_id_int True
Loading

0 comments on commit c5c1540

Please sign in to comment.