Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transforms 0.2.3.dev1 #847

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 20 additions & 55 deletions .make.defaults
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ __check_defined = \
# We create both local and remote tags. Local seems to be needed when using our spark
# base image. Remote seems to be needed by kfp.
.PHONY: .defaults.image
.defaults.image:: # Must be called with a DOCKER_IMAGE= settings.
.defaults.image:: # Must be called with a DOCKER_IMAGE_NAME= settings.
@# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE)
$(call check_defined, DOCKER_IMAGE_NAME)
# The following touch seems to be needed to work around a docker build problem in which
Expand All @@ -222,14 +222,15 @@ __check_defined = \
if [ -e pyproject.toml ]; then \
touch pyproject.toml; \
fi
$(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) $(DOCKER_BUILD_EXTRA_ARGS) \
$(DOCKER) build -f $(DOCKER_FILE) -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_BUILD_EXTRA_ARGS) \
--platform $(DOCKER_PLATFORM) \
--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
--build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg DPK_WHEEL_FILE_NAME=$(DPK_WHEEL_FILE_NAME) \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) .
$(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE)
$(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)


# Copy a source tree in LIB_PATH, including src, pyproject.toml to LIB_NAME
# Generally used to copy source from within the repo into a local directory for use by a Dockerfile
Expand All @@ -244,24 +245,25 @@ __check_defined = \
cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \
fi


# Build and image using the local Dockerfile and make the data-processing-lib/python
# available in the current directory for use by the Dockerfile (i.e. to install the library).
#.PHONY: .defaults.python-lib-src-image
#.defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
# @# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
#endif
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python

.PHONY: .default.build-lib-wheel
.default.build-lib-wheel:
make -C $(REPOROOT)/data-processing-lib build-pkg-dist
$(MAKE) -C $(REPOROOT)/data-processing-lib build-pkg-dist
rm -rf data-processing-dist && mkdir data-processing-dist
cp $(REPOROOT)/data-processing-lib/dist/*.whl data-processing-dist


# Build and image using the local Dockerfile
# Assumes wheel has already been created
.PHONY: .defaults.lib-whl-image
.defaults.lib-whl-image::
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the the wheel file for the library
@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image


# Build and image using the local Dockerfile and make the wheel for data-processing-lib
# available in the current directory for use by the Dockerfile (i.e. to install the library).
.PHONY: .defaults.python-lib-whl-image
Expand All @@ -270,28 +272,9 @@ __check_defined = \
@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the the wheel file for the library
@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf data-processing-dist

# Build an image using the local Dockerfile and make the data-processing-lib/ray
# available in the current directory for use by the Dockerfile (i.e. to install the library).
# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
#.PHONY: .defaults.ray-lib-src-image
#.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
# @# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
# $(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
#endif
# if [ -e ../python ]; then \
# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
# fi
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python
# -rm -rf data-processing-lib-ray
# -rm -rf python-transform


# Build an image using the local Dockerfile and make the data-processing wheel
# available in the current directory for use by the Dockerfile (i.e. to install the library).
Expand All @@ -306,7 +289,7 @@ __check_defined = \
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf python-transform
-rm -rf data-processing-dist

Expand All @@ -316,24 +299,6 @@ __check_defined = \
.defaults.spark-lib-base-image:
$(MAKE) -C $(DPK_SPARK_LIB_DIR) image

# Note that this looks for the ../python directory, which is currently only used in the transform projects,
# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
# Must be called with a DOCKER_LOCAL_IMAGE= settings.
#.PHONY: .defaults.spark-lib-src-image
#.defaults.spark-lib-src-image:: .defaults.spark-lib-base-image
# @# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
# $(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability
#ifeq ($(USE_REPO_LIB_SRC), 1)
# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
# $(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib
#endif
# if [ -e ../python ]; then \
# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
# fi
# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image
# -rm -rf data-processing-lib-python
# -rm -rf data-processing-lib-spark
# -rm -rf python-transform

.PHONY: .defaults.spark-lib-whl-image
.defaults.spark-lib-whl-image:: .default.build-lib-wheel .defaults.spark-lib-base-image
Expand All @@ -345,7 +310,7 @@ __check_defined = \
if [ -e ../python ]; then \
$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
fi
$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
$(MAKE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
-rm -rf python-transform
-rm -rf data-processing-dist

Expand Down
2 changes: 1 addition & 1 deletion .make.versions
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@ endif
#
# If you change the versions numbers, be sure to run "make set-versions" to
# update version numbers across the transform (e.g., pyproject.toml).
TRANSFORMS_PKG_VERSION=0.2.3.dev0
TRANSFORMS_PKG_VERSION=0.2.3.dev1
2 changes: 1 addition & 1 deletion kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ authors = [
dependencies = [
"requests",
"kubernetes",
"data-prep-toolkit[ray]==0.2.3.dev0",
"data-prep-toolkit[ray]>=0.2.3.dev0",
]

[build-system]
Expand Down
153 changes: 104 additions & 49 deletions transforms/.make.cicd.targets
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@
include $(REPOROOT)/transforms/.make.transforms

######################################################################
## Default setting for TRANSFORM_RUNTIME uses folder name-- Old layout
TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).transform
TRANSFORM_RAY_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).ray.transform
TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).spark.transform
## Default setting for TRANSFORM_RUNTIME entry point:
# python -m dpk_html2parquet.ray.transform --help
# or
# python -m dpk_html2parquet.transform_python --help
#
TRANSFORM_PYTHON_SRC?="-m dpk_$(TRANSFORM_NAME).transform_python"
TRANSFORM_RAY_SRC?="-m dpk_$(TRANSFORM_NAME).ray.transform"
TRANSFORM_SPARK_SRC?="-m dpk_$(TRANSFORM_NAME).spark.transform"


venv:: .defaults.create-venv
source venv/bin/activate && $(PIP) install -e $(REPOROOT)/data-processing-lib[ray,spark]
Expand All @@ -19,7 +24,6 @@ venv:: .defaults.create-venv
source venv/bin/activate && $(PIP) install -r requirements.txt; \
fi;


test:: .transforms.test-src test-image

clean:: .transforms.clean
Expand All @@ -28,62 +32,113 @@ clean:: .transforms.clean
set-versions::

## We need to think how we want to do this going forward
build::

image::
@if [ -e Dockerfile ]; then \
$(MAKE) image-default ; \
else \
echo "Skipping image for $(shell pwd) since no Dockerfile is present"; \
build:: image

publish:
@if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-python:$(DOCKER_IMAGE_VERSION) \
.defaults.publish-image ; \
fi
@if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-ray:$(DOCKER_IMAGE_VERSION) \
.defaults.publish-image ; \
fi
@if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-spark:$(DOCKER_IMAGE_VERSION) \
.defaults.publish-image ; \
fi

test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean

test-image:: .default.build-lib-wheel
@if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
test-image-sequence ; \
fi
@if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
test-image-sequence ; \
fi
@if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
test-image-sequence ; \
fi
-rm -rf data-processing-dist


publish::
@if [ -e Dockerfile ]; then \
$(MAKE) publish-default ; \
else \
echo "Skipping publish for $(shell pwd) since no Dockerfile is present"; \
image-python:
@if [ -e Dockerfile.python ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.python \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
.defaults.lib-whl-image ; \
fi

publish-image::
@if [ -e Dockerfile ]; then \
$(MAKE) publish-image-default ; \
else \
echo "Skipping publish-image for $(shell pwd) since no Dockerfile is present"; \
image-ray:
@if [ -e Dockerfile.ray ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.ray \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
BASE_IMAGE=$(RAY_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi

test-image::
@if [ -e Dockerfile ]; then \
$(MAKE) test-image-default ; \
else \
echo "Skipping test-image for $(shell pwd) since no Dockerfile is present"; \
image-spark:
@if [ -e Dockerfile.spark ]; then \
$(MAKE) DOCKER_FILE=Dockerfile.spark \
DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
BASE_IMAGE=$(SPARK_BASE_IMAGE) \
.defaults.lib-whl-image ; \
fi

image:: .default.build-lib-wheel
## Build all possible images unless a specific runtime is specified
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
$(MAKE) image-python ; \
fi
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
$(MAKE) image-ray ; \
fi
@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
$(MAKE) image-spark ; \
fi
-rm -rf data-processing-dist

test-src:: .transforms.test-src

setup:: .transforms.setup

publish-default:: publish-image

publish-image-default:: .defaults.publish-image

test-image-default:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean

build-lib-wheel:
make -C $(REPOROOT)/data-processing-lib build-pkg-dist

image-default:: build-lib-wheel
@$(eval LIB_WHEEL_FILE := $(shell find $(REPOROOT)/data-processing-lib/dist/*.whl))
rm -fr dist && mv $(REPOROOT)/data-processing-lib/dist .
$(eval WHEEL_FILE_NAME := $(shell basename $(LIB_WHEEL_FILE)))
$(DOCKER) build -t $(DOCKER_IMAGE_NAME) $(DOCKER_BUILD_EXTRA_ARGS) \
--platform $(DOCKER_PLATFORM) \
--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
--build-arg BASE_IMAGE=$(RAY_BASE_IMAGE) \
--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
--build-arg WHEEL_FILE_NAME=$(WHEEL_FILE_NAME) \
--build-arg TRANSFORM_NAME=$(TRANSFORM_NAME) \
--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) .
$(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE)
rm -fr dist
kind-load-image:: .transforms.kind-load-image

.PHONY: workflow-vent
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-build; \
fi


2 changes: 1 addition & 1 deletion transforms/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ build-pkg-dist:
-rm -fr src
mkdir src
# Copy all the src folders recursively (not clear if they have subfolders)
for x in $(shell find . | grep '[ray| python]/src$$') ; do \
for x in $(shell find . | grep '[ray| python | spark]/src$$') ; do \
echo $$x ; \
if [ -d "$$x" ]; then \
cp -r $$x/* src ; \
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code2parquet/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit==0.2.3.dev0
data-prep-toolkit>=0.2.3.dev0
parameterized
pandas
2 changes: 1 addition & 1 deletion transforms/code/code2parquet/ray/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
{ name = "Boris Lublinsky", email = "[email protected]" },
]
dependencies = [
"data-prep-toolkit[ray]==0.2.3.dev0",
"data-prep-toolkit[ray]>=0.2.3.dev0",
"dpk-code2parquet-transform-python==0.2.3.dev0",
"parameterized",
"pandas",
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_profiler/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
data-prep-toolkit==0.2.3.dev0
data-prep-toolkit>=0.2.3.dev0
parameterized
pandas
aiolimiter==1.1.0
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_profiler/ray/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
]
dependencies = [
"dpk-code-profiler-transform-python==0.2.3.dev0",
"data-prep-toolkit[ray]==0.2.3.dev0",
"data-prep-toolkit[ray]>=0.2.3.dev0",
]

[build-system]
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/code_quality/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit==0.2.3.dev0
data-prep-toolkit>=0.2.3.dev0
bs4==0.0.2
transformers==4.38.2
2 changes: 1 addition & 1 deletion transforms/code/code_quality/ray/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ authors = [
]
dependencies = [
"dpk-code-quality-transform-python==0.2.3.dev0",
"data-prep-toolkit[ray]==0.2.3.dev0",
"data-prep-toolkit[ray]>=0.2.3.dev0",
]

[build-system]
Expand Down
2 changes: 1 addition & 1 deletion transforms/code/header_cleanser/python/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
data-prep-toolkit==0.2.3.dev0
data-prep-toolkit>=0.2.3.dev0
scancode-toolkit==32.1.0 ; platform_system != 'Darwin'

Loading