From dae2276edf28bcf3c7eb570e7a7d6c8a8fd41e38 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 13:45:12 -0500 Subject: [PATCH 1/4] build transforms==0.2.3.dev0 release with toolkit==0.2.2 Signed-off-by: Maroun Touma --- .make.versions | 4 +- data-processing-lib/pyproject.toml | 2 +- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- .../executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 +- .../kfp_v2_workflow_support/pyproject.toml | 4 +- .../shared_workflow_support/pyproject.toml | 4 +- transforms/Makefile | 2 +- .../code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code/code2parquet/python/pyproject.toml | 2 +- .../code/code2parquet/python/requirements.txt | 2 +- .../code/code2parquet/ray/pyproject.toml | 6 +- .../code/code_profiler/python/pyproject.toml | 2 +- .../code_profiler/python/requirements.txt | 2 +- .../code/code_profiler/ray/pyproject.toml | 6 +- .../code_quality/kfp_ray/code_quality_wf.py | 2 +- .../code/code_quality/python/pyproject.toml | 2 +- .../code/code_quality/python/requirements.txt | 2 +- .../code/code_quality/ray/pyproject.toml | 6 +- .../kfp_ray/header_cleanser_wf.py | 2 +- .../header_cleanser/python/pyproject.toml | 2 +- .../header_cleanser/python/requirements.txt | 2 +- .../code/header_cleanser/ray/pyproject.toml | 6 +- .../kfp_ray/license_select_wf.py | 2 +- .../code/license_select/python/pyproject.toml | 2 +- .../license_select/python/requirements.txt | 2 +- .../code/license_select/ray/pyproject.toml | 6 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 +- transforms/code/malware/ray/pyproject.toml | 6 +- .../kfp_ray/proglang_select_wf.py | 2 +- .../proglang_select/python/pyproject.toml | 2 +- .../proglang_select/python/requirements.txt | 2 +- .../code/proglang_select/ray/pyproject.toml | 6 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../repo_level_ordering/ray/pyproject.toml | 4 +- .../kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../doc_chunk/python/requirements.txt | 2 +- .../language/doc_chunk/ray/pyproject.toml | 2 +- .../kfp_ray/doc_quality_multiple_wf.py | 2 +- .../doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../doc_quality/python/pyproject.toml | 2 +- .../doc_quality/python/requirements.txt | 2 +- .../language/doc_quality/ray/pyproject.toml | 6 +- .../html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../html2parquet/python/pyproject.toml | 2 +- .../html2parquet/python/requirements.txt | 2 +- .../language/html2parquet/ray/pyproject.toml | 2 +- .../html2parquet/ray/requirements.txt | 4 +- .../lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../language/lang_id/python/pyproject.toml | 2 +- .../language/lang_id/python/requirements.txt | 2 +- .../language/lang_id/ray/pyproject.toml | 6 +- .../kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../pdf2parquet/python/requirements.txt | 2 +- .../language/pdf2parquet/ray/requirements.txt | 2 +- .../pii_redactor/python/pyproject.toml | 2 +- .../pii_redactor/python/requirements.txt | 2 +- .../language/pii_redactor/ray/pyproject.toml | 6 +- .../kfp_ray/text_encoder_multiple_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../text_encoder/python/pyproject.toml | 2 +- .../text_encoder/python/requirements.txt | 2 +- 
.../language/text_encoder/ray/pyproject.toml | 6 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- .../universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- .../universal/doc_id/python/pyproject.toml | 2 +- .../universal/doc_id/python/requirements.txt | 2 +- .../universal/doc_id/ray/pyproject.toml | 6 +- .../universal/doc_id/spark/pyproject.toml | 4 +- .../universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../universal/ededup/python/pyproject.toml | 2 +- .../universal/ededup/python/requirements.txt | 2 +- .../universal/ededup/ray/pyproject.toml | 6 +- .../universal/fdedup/fdedup_python.ipynb | 377 +++++++++++++++- transforms/universal/fdedup/fdedup_ray.ipynb | 417 +++++++++++++++++- .../universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../universal/fdedup/python/pyproject.toml | 2 +- .../universal/fdedup/python/requirements.txt | 2 +- .../universal/fdedup/ray/pyproject.toml | 2 +- .../universal/fdedup/ray/requirements.txt | 4 +- .../universal/fdedup/spark/pyproject.toml | 2 +- .../universal/fdedup/spark/requirements.txt | 4 +- .../universal/filter/kfp_ray/filter_wf.py | 2 +- .../universal/filter/python/pyproject.toml | 2 +- .../universal/filter/python/requirements.txt | 2 +- .../universal/filter/ray/pyproject.toml | 6 +- .../universal/filter/spark/pyproject.toml | 4 +- .../universal/hap/kfp_ray.disable/hap_wf.py | 2 +- .../universal/hap/python/pyproject.toml | 2 +- .../universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 +- .../noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- .../universal/noop/python/pyproject.toml | 4 +- transforms/universal/noop/ray/pyproject.toml | 6 +- .../universal/noop/spark/pyproject.toml | 6 +- .../universal/profiler/kfp_ray/profiler_wf.py | 2 +- .../universal/profiler/python/pyproject.toml | 2 +- .../profiler/python/requirements.txt | 2 +- .../universal/profiler/ray/pyproject.toml | 6 +- .../universal/profiler/spark/pyproject.toml | 6 +- .../universal/resize/kfp_ray/resize_wf.py | 2 +- .../universal/resize/python/pyproject.toml | 2 +- .../universal/resize/python/requirements.txt | 2 +- .../universal/resize/ray/pyproject.toml | 6 +- .../universal/resize/spark/pyproject.toml | 6 +- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- .../tokenization/python/pyproject.toml | 2 +- .../tokenization/python/requirements.txt | 2 +- .../universal/tokenization/ray/pyproject.toml | 6 +- .../universal/web2parquet/requirements.txt | 2 +- 120 files changed, 938 insertions(+), 202 deletions(-) diff --git a/.make.versions b/.make.versions index 564caa214..ba5e87b0f 100644 --- a/.make.versions +++ b/.make.versions @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release. DPK_MINOR_VERSION=2 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set. -DPK_MICRO_VERSION=3 +DPK_MICRO_VERSION=2 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. 
-DPK_VERSION_SUFFIX=.dev0 +DPK_VERSION_SUFFIX= DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index a347a14a1..4f5734be1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.3.dev0" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 30b0b66d8..78976a97c 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 44e199c47..c75554d5f 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 7ab517bff..2e02c3adf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9b98912f0..37c0198bf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. 
diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index 6b261a003..ec82e9484 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index f09b2f32a..daa903aaf 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.3.dev0", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 01c5b3e17..61f54663f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.3.dev0", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index aa7a6dd3a..17ed57ea9 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/Makefile b/transforms/Makefile index 3e8b9cfde..ed492db4d 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -107,7 +107,7 @@ build-pkg-dist: -rm -fr src mkdir src # Copy all the src folders recursively (not clear if they have subfolders) - for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + for x in $(shell find . 
| grep '[ray| python | spark]/src$$') ; do \ echo $$x ; \ if [ -d "$$x" ]; then \ cp -r $$x/* src ; \ diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index f3f491e4b..3e5f262b9 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index be84b2f20..d4f8c11cf 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index cec7f9c5f..0ce538837 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index d56fed1e8..666551e94 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk-code2parquet-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk-code2parquet-transform-python==0.2.2", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 334c86fed..d3c2c2196 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 27706b467..a38213e0f 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 9b760c1c3..773ae353b 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.3.dev0" +version 
= "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code-profiler-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 6a4ccec1b..7f5aa9768 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 17cbce67d..d7b452d6b 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index ef627d39f..10eb1001b 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index eceee32ed..5bf3d2dff 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code-quality-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 9bb315569..5049a9c11 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 3703ec55f..2dadeaf02 100644 --- 
a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index 915a462dc..9123fc955 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 5fb1bcf26..179aa7769 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-header-cleanser-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7dba0d9d1..9bdcc6e96 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 3345d3a5a..b445c6b09 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index ce5979d62..96b293364 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Transform" 
license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-license-select-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index bede80b88..89eb9d730 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index a1bc05ab4..4dc1a9012 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.3.dev0", + "data-prep-toolkit>=0.2.2", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 659ee62ef..22e7ecc28 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-malware-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 11f001bfa..bb114e3d6 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e5736a9c7..e20a62f7c 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git 
a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index d8288d189..3d330d3cc 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-proglang-select-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38a829fab..fa739bfd0 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index 9581c8941..602799503 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7e30ee8b8..1fd927356 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 387c3bda7..e128df8b0 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ 
b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index 207ab9249..2d282a8ac 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 4fb356038..6694456ce 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 436d93ff3..f103b7269 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index f39fd7e39..0ca4fb865 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 23538b8c7..f3abe0337 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 4aa2d8111..2e29c9cb4 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 diff --git a/transforms/language/doc_quality/ray/pyproject.toml 
b/transforms/language/doc_quality/ray/pyproject.toml index ec56ac2c7..62f97e538 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-doc_quality-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eb8b9de1..4eaef2fea 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index 3a7a6efbc..af6b64763 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt index f21e65774..42e2459b2 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 5e888748c..859706621 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 9aa193432..700267692 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.3.dev0 -data-prep-toolkit[ray]==0.2.3.dev0 +dpk-html2parquet-transform-python==0.2.2 +data-prep-toolkit[ray]>=0.2.2 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index a89c54ab3..e853c2328 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ 
EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 2ac84645d..5aed719c5 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index a69724a2d..43650a50a 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index 06bec1ab9..1f90bcd54 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index dba929905..b60a3a5bb 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-lang_id-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 8992f1145..56e881b5e 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index c9cdbf652..395918ac3 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ 
b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 310909164..1d1aa2570 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 34831cde8..40650d1a5 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 72c1bf783..4a159bba0 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 0abcc1d96..51fbd2494 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index 4549851d0..a65aa5913 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_pii_redactor_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index e522737a1..bad5e24cd 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index f88fe9eef..5c762c2a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index dc15beb6e..62182b27b 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 3ac880bba..0d8160151 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index f1b2c09d5..2f8483e2d 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-text_encoder-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index b0527bdd6..11d0decf5 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 934c95182..7317d33e3 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.3.dev0 +data-prep-toolkit>=0.2.2 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..7e1bd0b8e 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = 
"quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 1a962662d..a9e69f0bf 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index da34dded3..ee022af54 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_doc_id_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 369a1bb72..f50d4f70d 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index ab46daadb..d878bd3e2 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index da28e715f..67fd0f758 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python 
Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index aa73a106a..9fe419975 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 424e220fd..58b39d7d7 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk_ededup_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk_ededup_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 83f9bd600..684583ffd 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -102,10 +102,102 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:29 INFO - Starting SignatureCalculation step\n", + "13:30:29 INFO - Got parameters for SignatureCalculation\n", + "13:30:29 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:29 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:29 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO - pipeline id pipeline_id\n", + "13:30:29 INFO - code location None\n", + "13:30:29 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO 
- orchestrator minhash started at 2024-11-26 13:30:29\n", + "13:30:29 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:33 INFO - Completed 1 files (50.0%) in 0.074 min\n", + "13:30:33 INFO - Completed 2 files (100.0%) in 0.074 min\n", + "13:30:33 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:33 INFO - Starting flush()\n", + "13:30:34 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:30:34 INFO - done flushing in 0.063 sec\n", + "13:30:34 INFO - Completed execution in 0.075 min, execution result 0\n", + "13:30:34 INFO - SignatureCalculation completed successfully\n", + "13:30:34 INFO - Starting ClusterAnalysis step\n", + "13:30:34 INFO - Got parameters for ClusterAnalysis\n", + "13:30:34 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/docs_to_remove\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator cluster started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 14\n", + "13:30:34 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "13:30:34 INFO - Completed 3 files (21.43%) in 0.001 min\n", + "13:30:34 INFO - Completed 4 files (28.57%) in 0.001 min\n", + "13:30:34 INFO - Completed 5 files (35.71%) in 0.001 min\n", + "13:30:34 INFO - Completed 6 files (42.86%) in 0.001 min\n", + "13:30:34 INFO - Completed 7 files (50.0%) in 0.001 min\n", + "13:30:34 INFO - Completed 8 files (57.14%) in 0.002 min\n", + "13:30:34 INFO - Completed 9 files (64.29%) in 0.002 min\n", + "13:30:34 INFO - Completed 10 files (71.43%) in 0.002 min\n", + "13:30:34 INFO - Completed 11 files (78.57%) in 0.002 min\n", + "13:30:34 INFO - Completed 12 files (85.71%) in 0.002 min\n", + "13:30:34 INFO - Completed 13 files (92.86%) in 0.002 min\n", + "13:30:34 INFO - Completed 14 files (100.0%) in 0.003 min\n", + "13:30:34 INFO - Done processing 14 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.003 min, execution result 0\n", + "13:30:34 INFO - ClusterAnalysis completed successfully\n", + "13:30:34 INFO - Starting GetDuplicateList step\n", + "13:30:34 INFO - Got parameters for GetDuplicateList\n", + "13:30:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:34 INFO - data factory data_ 
max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdlist started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 1\n", + "13:30:34 INFO - Get Duplicate List for folder docs_to_remove\n", + "13:30:34 INFO - 8 documents marked as duplicates\n", + "13:30:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.001 min, execution result 0\n", + "13:30:34 INFO - GetDuplicateList completed successfully\n", + "13:30:34 INFO - Starting DataCleaning step\n", + "13:30:34 INFO - Got parameters for DataCleaning\n", + "13:30:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:30:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:30:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/cleaned\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdclean started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:34 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.0 min, execution result 0\n", + "13:30:34 INFO - DataCleaning completed successfully\n" + ] + } + ], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -126,10 +218,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/cleaned/metadata.json',\n", + " 'python/output/cleaned/data_1',\n", + " 'python/output/cleaned/data_2']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"python/output/cleaned/*\")" @@ -145,10 +250,167 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 
2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. 
Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. 
Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. 
│\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", @@ -169,10 +431,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. 
We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", @@ -193,9 +542,9 @@ ], "metadata": { "kernelspec": { - "display_name": "fdedup_ray", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_ray" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -207,7 +556,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 533ca019f..bb69579a9 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,10 +37,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], "source": [ "import ast\n", "import os\n", @@ -73,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -106,10 +114,126 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:57 INFO - Starting SignatureCalculation step\n", + "13:30:57 INFO - Got parameters for SignatureCalculation\n", + "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - pipeline id pipeline_id\n", + "13:30:57 INFO - code location None\n", + "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:30:57 INFO - actor creation delay 0\n", + "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory data_ Not using data sets, checkpointing 
False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - Running locally\n", + "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", + "13:31:26 INFO - SignatureCalculation completed successfully\n", + "13:31:26 INFO - Starting ClusterAnalysis step\n", + "13:31:26 INFO - Got parameters for ClusterAnalysis\n", + "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:31:26 INFO - pipeline id pipeline_id\n", + "13:31:26 INFO - code location None\n", + "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:26 INFO - actor creation delay 0\n", + "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", + "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:26 INFO - Running locally\n", + "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", + "13:31:45 INFO - ClusterAnalysis completed successfully\n", + "13:31:45 INFO - Starting GetDuplicateList step\n", + "13:31:45 INFO - Got parameters for GetDuplicateList\n", + "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", + "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", + "13:32:04 INFO - GetDuplicateList completed successfully\n", + "13:32:04 INFO - Starting DataCleaning step\n", + "13:32:04 INFO - Got parameters for DataCleaning\n", + "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - pipeline id pipeline_id\n", + "13:32:04 INFO - code location None\n", + "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:04 INFO - actor creation delay 0\n", + "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", + "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - Running locally\n", + "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", + "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", + "13:32:24 INFO - DataCleaning completed successfully\n" + ] + } + ], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -130,10 +254,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"ray/output/cleaned/*\")" @@ -149,10 +284,167 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. 
Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. 
│\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. 
A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", @@ -170,10 +462,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 
12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. 
│\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", @@ -188,13 +567,21 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c11d3a4b-8ef9-417d-a8a2-f688db067a52", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "fdedup_ray", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_ray" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -206,7 +593,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index ffc6f79bc..8e8795cce 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -34,7 +34,7 @@ DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index 08b20ed75..ff3666695 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 3e5dfc16d..4cd06d819 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 485d6de21..fa0627f00 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt index 81e48e5ee..ecb79fa77 100644 --- a/transforms/universal/fdedup/ray/requirements.txt +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.3.dev0 -dpk_fdedup_transform_python==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 +dpk_fdedup_transform_python==0.2.2 mmh3>=4.1.0 xxhash==3.4.1 tqdm==4.66.3 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 8a072b31b..798931552 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ 
-1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index bfb0f04a2..e70a880bd 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,5 +1,5 @@ -dpk_fdedup_transform_python==0.2.3.dev0 -data-prep-toolkit[spark]==0.2.3.dev0 +dpk_fdedup_transform_python==0.2.2 +data-prep-toolkit[spark]>=0.2.2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index b856b1007..4b122d98f 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index fcf0f6419..8e9bb2366 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 100626f60..91f37927e 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 64776e0c1..94df1cbac 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-filter-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index ef46c9a1b..f62a81085 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.2", ] 
[project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 786011d4d..8069ec181 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index bf7c85577..7b30dd72e 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 1250d1f77..70e633ac9 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index 38e78938b..6518e5277 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 7c4c8eb94..3d18acaa4 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.3.dev0 -dpk-hap-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 +dpk-hap-transform-python==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 3b102d205..737b60121 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index e8125328b..9dbdaf3b0 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = 
"../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index ff9a24244..b60eef1ef 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.3.dev0", + "data-prep-toolkit>=0.2.2", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index da9327917..e9e28eefd 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index d3cd47bf6..89d0a18dd 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 914637895..ee6323d74 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 39d9788f8..117be53c0 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 526140ada..420e3fe86 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ 
-data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index ac8d729ec..336d7e35d 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk_profiler_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk_profiler_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 6ba790301..1e1638766 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-profiler-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0724ed731..0a9be8e95 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 6fdad69d0..836388694 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index c266a39f4..fbb4d0f30 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = 
"Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 7de14c673..9f83a6816 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..243cac6be 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index dbb8e84ba..021a1427f 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 8a1920162..9c2a695a6 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index c094b9e7e..4cea4b905 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-tokenization-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git 
a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 1af3f12a4..dfb74a6ca 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.3.dev0 +data-prep-toolkit>=0.2.2 data_prep_connector>=0.2.3 \ No newline at end of file From eec1339c96fe22cba3834484a8e5c41dc5308dc7 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 2 Dec 2024 15:30:20 -0500 Subject: [PATCH 2/4] set transform version for dev1 after release of dev0 Signed-off-by: Maroun Touma --- .make.versions | 6 +++--- data-processing-lib/pyproject.toml | 2 +- kfp/kfp_ray_components/createRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/deleteRayClusterComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent.yaml | 2 +- kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml | 2 +- kfp/kfp_ray_components/executeSubWorkflowComponent.yaml | 2 +- kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml | 4 ++-- kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml | 4 ++-- kfp/kfp_support_lib/shared_workflow_support/pyproject.toml | 4 ++-- transforms/code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- transforms/code/code2parquet/python/pyproject.toml | 2 +- transforms/code/code2parquet/python/requirements.txt | 2 +- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_profiler/python/pyproject.toml | 2 +- transforms/code/code_profiler/python/requirements.txt | 2 +- transforms/code/code_profiler/ray/pyproject.toml | 6 +++--- transforms/code/code_quality/kfp_ray/code_quality_wf.py | 2 +- transforms/code/code_quality/python/pyproject.toml | 2 +- transforms/code/code_quality/python/requirements.txt | 2 +- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- transforms/code/header_cleanser/python/pyproject.toml | 2 +- transforms/code/header_cleanser/python/requirements.txt | 2 +- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- transforms/code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/license_select/python/pyproject.toml | 2 +- transforms/code/license_select/python/requirements.txt | 2 +- transforms/code/license_select/ray/pyproject.toml | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- .../code/proglang_select/kfp_ray/proglang_select_wf.py | 2 +- transforms/code/proglang_select/python/pyproject.toml | 2 +- transforms/code/proglang_select/python/requirements.txt | 2 +- transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- .../code/repo_level_ordering/kfp_ray/repo_level_order_wf.py | 2 +- transforms/code/repo_level_ordering/ray/pyproject.toml | 4 ++-- .../language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- transforms/language/doc_chunk/python/requirements.txt | 2 +- transforms/language/doc_chunk/ray/pyproject.toml | 2 +- .../language/doc_quality/kfp_ray/doc_quality_multiple_wf.py | 2 +- transforms/language/doc_quality/kfp_ray/doc_quality_wf.py | 2 +- transforms/language/doc_quality/python/pyproject.toml | 2 +- transforms/language/doc_quality/python/requirements.txt | 2 +- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- transforms/language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +-
transforms/language/html2parquet/python/pyproject.toml | 2 +- transforms/language/html2parquet/python/requirements.txt | 2 +- transforms/language/html2parquet/ray/pyproject.toml | 2 +- transforms/language/html2parquet/ray/requirements.txt | 4 ++-- transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- transforms/language/lang_id/python/pyproject.toml | 2 +- transforms/language/lang_id/python/requirements.txt | 2 +- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- .../language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- transforms/language/pdf2parquet/python/requirements.txt | 2 +- transforms/language/pdf2parquet/ray/requirements.txt | 2 +- transforms/language/pii_redactor/python/pyproject.toml | 2 +- transforms/language/pii_redactor/python/requirements.txt | 2 +- transforms/language/pii_redactor/ray/pyproject.toml | 6 +++--- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- transforms/language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- transforms/language/text_encoder/python/pyproject.toml | 2 +- transforms/language/text_encoder/python/requirements.txt | 2 +- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/pyproject.toml | 2 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/doc_id/python/pyproject.toml | 2 +- transforms/universal/doc_id/python/requirements.txt | 2 +- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/ededup/python/pyproject.toml | 2 +- transforms/universal/ededup/python/requirements.txt | 2 +- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/fdedup/python/pyproject.toml | 2 +- transforms/universal/fdedup/python/requirements.txt | 2 +- transforms/universal/fdedup/ray/pyproject.toml | 2 +- transforms/universal/fdedup/ray/requirements.txt | 4 ++-- transforms/universal/fdedup/spark/pyproject.toml | 2 +- transforms/universal/fdedup/spark/requirements.txt | 4 ++-- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/filter/python/pyproject.toml | 2 +- transforms/universal/filter/python/requirements.txt | 2 +- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 4 ++-- transforms/universal/hap/kfp_ray.disable/hap_wf.py | 2 +- transforms/universal/hap/python/pyproject.toml | 2 +- transforms/universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 ++-- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/profiler/python/pyproject.toml | 2 +- transforms/universal/profiler/python/requirements.txt | 2 +- transforms/universal/profiler/ray/pyproject.toml | 6 +++--- transforms/universal/profiler/spark/pyproject.toml | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- 
transforms/universal/resize/python/pyproject.toml | 2 +- transforms/universal/resize/python/requirements.txt | 2 +- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/resize/spark/pyproject.toml | 6 +++--- .../universal/tokenization/kfp_ray/tokenization_wf.py | 2 +- transforms/universal/tokenization/python/pyproject.toml | 2 +- transforms/universal/tokenization/python/requirements.txt | 2 +- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- transforms/universal/web2parquet/requirements.txt | 2 +- 118 files changed, 174 insertions(+), 174 deletions(-) diff --git a/.make.versions b/.make.versions index ba5e87b0f..53a814695 100644 --- a/.make.versions +++ b/.make.versions @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release. DPK_MINOR_VERSION=2 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set. -DPK_MICRO_VERSION=2 +DPK_MICRO_VERSION=3 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX= +DPK_VERSION_SUFFIX=.dev0 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -66,4 +66,4 @@ endif # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=0.2.3.dev0 +TRANSFORMS_PKG_VERSION=0.2.3.dev1 diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 4f5734be1..a347a14a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2" +version = "0.2.3.dev0" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 78976a97c..30b0b66d8 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index c75554d5f..44e199c47 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. 
# Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 2e02c3adf..7ab517bff 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 37c0198bf..9b98912f0 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index ec82e9484..6b261a003 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index daa903aaf..f09b2f32a 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 61f54663f..01c5b3e17 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index 17ed57ea9..e0a6235c1 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]>=0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 3e5f262b9..f3f491e4b 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index d4f8c11cf..be84b2f20 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index 0ce538837..08deeb7d9 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 666551e94..923e2d4f3 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]>=0.2.2", - "dpk-code2parquet-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk-code2parquet-transform-python==0.2.3.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index d3c2c2196..334c86fed 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" 
description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index a38213e0f..b36eabb8d 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 773ae353b..dbd552e93 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-code-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 7f5aa9768..6a4ccec1b 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index d7b452d6b..17cbce67d 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index 10eb1001b..21ada1a79 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 5bf3d2dff..70dcd445b 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-code-quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git 
a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 5049a9c11..9bb315569 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 2dadeaf02..3703ec55f 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index 9123fc955..7a0fe8d28 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 179aa7769..896f451ad 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-header-cleanser-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 9bdcc6e96..7dba0d9d1 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index b445c6b09..3345d3a5a 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git 
a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index e9abc2535..08447f212 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 96b293364..b5facfffe 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-license-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 89eb9d730..bede80b88 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 4dc1a9012..29db772a6 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit>=0.2.2", + "data-prep-toolkit>=0.2.3.dev0", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 22e7ecc28..9f9e78377 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-malware-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index bb114e3d6..11f001bfa 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components 
-base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e20a62f7c..e5736a9c7 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index e9abc2535..08447f212 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index 3d330d3cc..321eb8f19 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-proglang-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index fa739bfd0..38a829fab 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index 602799503..2481a1bf8 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]>=0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py 
b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 1fd927356..7e30ee8b8 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index e128df8b0..387c3bda7 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index 2d282a8ac..6488e9c68 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 6694456ce..774e9fc13 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]>=0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index f103b7269..436d93ff3 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index 0ca4fb865..f39fd7e39 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index f3abe0337..23538b8c7 100644 --- 
a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 2e29c9cb4..fddab961a 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index 62f97e538..6395c45b4 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-doc_quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eaef2fea..4eb8b9de1 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index af6b64763..3a7a6efbc 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt index 42e2459b2..fdd84b1e8 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 859706621..5e888748c 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git 
a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 700267692..2f19e8b53 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2 -data-prep-toolkit[ray]>=0.2.2 +dpk-html2parquet-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.3.dev0 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index e853c2328..a89c54ab3 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 5aed719c5..2ac84645d 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 43650a50a..a69724a2d 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index 1f90bcd54..828ec54c3 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index b60a3a5bb..777e0d718 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-lang_id-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 
56e881b5e..8992f1145 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 395918ac3..c9cdbf652 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 1d1aa2570..19f394d6c 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 40650d1a5..93b9c3f96 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]>=0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 4a159bba0..9e490e730 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 51fbd2494..5c3d41aa8 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index a65aa5913..a3648e80d 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = 
"blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk_pii_redactor_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index bad5e24cd..e522737a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 5c762c2a1..f88fe9eef 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 62182b27b..dc15beb6e 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 0d8160151..5a1cae43d 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index 2f8483e2d..530f890d2 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-text_encoder-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index eb25124c6..d6e0d2fdd 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = 
"0.2.3.dev0" +version = "0.2.3.dev1" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 11d0decf5..b0527bdd6 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 7317d33e3..934c95182 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 7e1bd0b8e..f41231159 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index a9e69f0bf..1a962662d 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index e9abc2535..08447f212 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index ee022af54..372f39762 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk_doc_id_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index f50d4f70d..369a1bb72 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name 
= "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index d878bd3e2..ab46daadb 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 67fd0f758..da28e715f 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 9fe419975..b5082bf0b 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 58b39d7d7..43045fed7 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]>=0.2.2", - "dpk_ededup_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk_ededup_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 8e8795cce..ffc6f79bc 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -34,7 +34,7 @@ DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index ff3666695..08b20ed75 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/python/requirements.txt 
b/transforms/universal/fdedup/python/requirements.txt index 4cd06d819..985c0b967 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index fa0627f00..485d6de21 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt index ecb79fa77..23e0a8b75 100644 --- a/transforms/universal/fdedup/ray/requirements.txt +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]>=0.2.2 -dpk_fdedup_transform_python==0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 +dpk_fdedup_transform_python==0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 tqdm==4.66.3 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 798931552..8a072b31b 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index e70a880bd..653b94256 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,5 +1,5 @@ -dpk_fdedup_transform_python==0.2.2 -data-prep-toolkit[spark]>=0.2.2 +dpk_fdedup_transform_python==0.2.3.dev0 +data-prep-toolkit[spark]>=0.2.3.dev0 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 4b122d98f..b856b1007 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 8e9bb2366..fcf0f6419 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 91f37927e..d97ef5cfd 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 
+1,3 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 94df1cbac..6cb90c2bb 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-filter-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index f62a81085..3f2c48e54 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]>=0.2.2", + "data-prep-toolkit[spark]>=3.dev0", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 8069ec181..786011d4d 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 7b30dd72e..bf7c85577 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 70e633ac9..fdf9a425e 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index 6518e5277..38e78938b 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 3d18acaa4..adf675cac 100644 --- 
a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]>=0.2.2 -dpk-hap-transform-python==0.2.2 +data-prep-toolkit[ray]>=0.2.3.dev0 +dpk-hap-transform-python==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 737b60121..3b102d205 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 9dbdaf3b0..e8125328b 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index b60eef1ef..16f07053a 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit>=0.2.2", + "data-prep-toolkit>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index e9e28eefd..e848ec793 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index 89d0a18dd..5fe682eef 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2", - "data-prep-toolkit[spark]>=0.2.2", + 
"dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index ee6323d74..914637895 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 117be53c0..39d9788f8 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 420e3fe86..2b32cd843 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 336d7e35d..9e1c49adf 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]>=0.2.2", - "dpk_profiler_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk_profiler_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 1e1638766..08e770278 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2", - "data-prep-toolkit[spark]>=0.2.2", + "dpk-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0a9be8e95..0724ed731 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" 
+base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 836388694..6fdad69d0 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index e9abc2535..08447f212 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index fbb4d0f30..1b056fc8f 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 9f83a6816..dc5bd98e3 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2", - "data-prep-toolkit[spark]>=0.2.2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 243cac6be..c131d11ea 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index 021a1427f..dbb8e84ba 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" 
keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 9c2a695a6..1fca1f418 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 4cea4b905..6df6b746c 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2", - "data-prep-toolkit[ray]>=0.2.2", + "dpk-tokenization-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index dfb74a6ca..1af3f12a4 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2 +data-prep-toolkit>=0.2.3.dev0 data_prep_connector>=0.2.3 \ No newline at end of file From 920a17bf4c7158ccaf6ba1b06d4e541b1f817ada Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 2 Dec 2024 16:40:16 -0500 Subject: [PATCH 3/4] fix typo: Signed-off-by: Maroun Touma --- transforms/universal/filter/spark/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 3f2c48e54..176ff1de3 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]>=3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [project.optional-dependencies] From 8c70ce3a6b50bd7abe2f7bca81f179adc703e8a4 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 2 Dec 2024 18:26:23 -0500 Subject: [PATCH 4/4] improved make targets for transforms Signed-off-by: Maroun Touma --- .make.defaults | 75 +++++------------ transforms/.make.cicd.targets | 153 +++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 104 deletions(-) diff --git a/.make.defaults b/.make.defaults index 51eb984ee..80df91c8e 100644 --- a/.make.defaults +++ b/.make.defaults @@ -209,7 +209,7 @@ __check_defined = \ # We create both local and remote tags. Local seems to be needed when using our spark # base image. Remote seems to be needed by kfp. .PHONY: .defaults.image -.defaults.image:: # Must be called with a DOCKER_IMAGE= settings. +.defaults.image:: # Must be called with a DOCKER_IMAGE_NAME= settings. 
@# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE) $(call check_defined, DOCKER_IMAGE_NAME) # The following touch seems to be needed to work around a docker build problem in which @@ -222,14 +222,15 @@ __check_defined = \ if [ -e pyproject.toml ]; then \ touch pyproject.toml; \ fi - $(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) $(DOCKER_BUILD_EXTRA_ARGS) \ + $(DOCKER) build -f $(DOCKER_FILE) -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_BUILD_EXTRA_ARGS) \ --platform $(DOCKER_PLATFORM) \ --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ --build-arg BASE_IMAGE=$(BASE_IMAGE) \ --build-arg DPK_WHEEL_FILE_NAME=$(DPK_WHEEL_FILE_NAME) \ --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . - $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) + $(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) + # Copy a source tree in LIB_PATH, including src, pyproject.toml to LIB_NAME # Generally used to copy source from within the repo into a local directory for use by a Dockerfile @@ -244,24 +245,25 @@ __check_defined = \ cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \ fi - -# Build and image using the local Dockerfile and make the data-processing-lib/python -# available in the current directory for use by the Dockerfile (i.e. to install the library). -#.PHONY: .defaults.python-lib-src-image -#.defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings. -# @# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source -#ifeq ($(USE_REPO_LIB_SRC), 1) -# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib -#endif -# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image -# -rm -rf data-processing-lib-python .PHONY: .default.build-lib-wheel .default.build-lib-wheel: - make -C $(REPOROOT)/data-processing-lib build-pkg-dist + $(MAKE) -C $(REPOROOT)/data-processing-lib build-pkg-dist rm -rf data-processing-dist && mkdir data-processing-dist cp $(REPOROOT)/data-processing-lib/dist/*.whl data-processing-dist + +# Build and image using the local Dockerfile +# Assumes wheel has already been created +.PHONY: .defaults.lib-whl-image +.defaults.lib-whl-image:: + # Must be called with a DOCKER_LOCAL_IMAGE= settings. + @# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the the wheel file for the library + @$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl)) + $(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE))) + $(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image + + # Build and image using the local Dockerfile and make the wheel for data-processing-lib # available in the current directory for use by the Dockerfile (i.e. to install the library). 
.PHONY: .defaults.python-lib-whl-image @@ -270,28 +272,9 @@ __check_defined = \ @# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the the wheel file for the library @$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl)) $(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE))) - $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image + $(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image -rm -rf data-processing-dist -# Build an image using the local Dockerfile and make the data-processing-lib/ray -# available in the current directory for use by the Dockerfile (i.e. to install the library). -# Note that this looks for the ../python directory, which is currently only used in the transform projects, -# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms. -#.PHONY: .defaults.ray-lib-src-image -#.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings. -# @# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source -#ifeq ($(USE_REPO_LIB_SRC), 1) -# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib -# $(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib -#endif -# if [ -e ../python ]; then \ -# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \ -# fi -# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image -# -rm -rf data-processing-lib-python -# -rm -rf data-processing-lib-ray -# -rm -rf python-transform - # Build an image using the local Dockerfile and make the data-processing wheel # available in the current directory for use by the Dockerfile (i.e. to install the library). @@ -306,7 +289,7 @@ __check_defined = \ if [ -e ../python ]; then \ $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \ fi - $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image + $(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image -rm -rf python-transform -rm -rf data-processing-dist @@ -316,24 +299,6 @@ __check_defined = \ .defaults.spark-lib-base-image: $(MAKE) -C $(DPK_SPARK_LIB_DIR) image -# Note that this looks for the ../python directory, which is currently only used in the transform projects, -# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms. -# Must be called with a DOCKER_LOCAL_IMAGE= settings. 
-#.PHONY: .defaults.spark-lib-src-image -#.defaults.spark-lib-src-image:: .defaults.spark-lib-base-image -# @# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source -# $(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability -#ifeq ($(USE_REPO_LIB_SRC), 1) -# $(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib -# $(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib -#endif -# if [ -e ../python ]; then \ -# $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \ -# fi -# $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image -# -rm -rf data-processing-lib-python -# -rm -rf data-processing-lib-spark -# -rm -rf python-transform .PHONY: .defaults.spark-lib-whl-image .defaults.spark-lib-whl-image:: .default.build-lib-wheel .defaults.spark-lib-base-image @@ -345,7 +310,7 @@ __check_defined = \ if [ -e ../python ]; then \ $(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \ fi - $(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image + $(MAKE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image -rm -rf python-transform -rm -rf data-processing-dist diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets index 69a5f54fd..e392e8f36 100644 --- a/transforms/.make.cicd.targets +++ b/transforms/.make.cicd.targets @@ -7,10 +7,15 @@ include $(REPOROOT)/transforms/.make.transforms ###################################################################### -## Default setting for TRANSFORM_RUNTIME uses folder name-- Old layout -TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).transform -TRANSFORM_RAY_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).ray.transform -TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).spark.transform +## Default setting for TRANSFORM_RUNTIME entry point: +# python -m dpk_html2parquet.ray.transform --help +# or +# python -m dpk_html2parquet.transform_python --help +# +TRANSFORM_PYTHON_SRC?="-m dpk_$(TRANSFORM_NAME).transform_python" +TRANSFORM_RAY_SRC?="-m dpk_$(TRANSFORM_NAME).ray.transform" +TRANSFORM_SPARK_SRC?="-m dpk_$(TRANSFORM_NAME).spark.transform" + venv:: .defaults.create-venv source venv/bin/activate && $(PIP) install -e $(REPOROOT)/data-processing-lib[ray,spark] @@ -19,7 +24,6 @@ venv:: .defaults.create-venv source venv/bin/activate && $(PIP) install -r requirements.txt; \ fi; - test:: .transforms.test-src test-image clean:: .transforms.clean @@ -28,62 +32,113 @@ clean:: .transforms.clean set-versions:: ## We need to think how we want to do this going forward -build:: -image:: - @if [ -e Dockerfile ]; then \ - $(MAKE) image-default ; \ - else \ - echo "Skipping image for $(shell pwd) since no Dockerfile is present"; \ +build:: image + +publish: + @if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-python:$(DOCKER_IMAGE_VERSION) \ + .defaults.publish-image ; \ + fi + @if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-ray:$(DOCKER_IMAGE_VERSION) \ + .defaults.publish-image ; \ + fi + @if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-spark:$(DOCKER_IMAGE_VERSION) \ + 
.defaults.publish-image ; \ + fi + +test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean + +test-image:: .default.build-lib-wheel + @if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + test-image-sequence ; \ + fi + @if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + test-image-sequence ; \ fi + @if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + test-image-sequence ; \ + fi + -rm -rf data-processing-dist + -publish:: - @if [ -e Dockerfile ]; then \ - $(MAKE) publish-default ; \ - else \ - echo "Skipping publish for $(shell pwd) since no Dockerfile is present"; \ +image-python: + @if [ -e Dockerfile.python ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.python \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \ + .defaults.lib-whl-image ; \ fi -publish-image:: - @if [ -e Dockerfile ]; then \ - $(MAKE) publish-image-default ; \ - else \ - echo "Skipping publish-image for $(shell pwd) since no Dockerfile is present"; \ +image-ray: + @if [ -e Dockerfile.ray ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.ray \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \ + BASE_IMAGE=$(RAY_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ fi -test-image:: - @if [ -e Dockerfile ]; then \ - $(MAKE) test-image-default ; \ - else \ - echo "Skipping test-image for $(shell pwd) since no Dockerfile is present"; \ +image-spark: + @if [ -e Dockerfile.spark ]; then \ + $(MAKE) DOCKER_FILE=Dockerfile.spark \ + DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \ + BASE_IMAGE=$(SPARK_BASE_IMAGE) \ + .defaults.lib-whl-image ; \ fi +image:: .default.build-lib-wheel + ## Build all possible images unless a specific runtime is specified + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \ + $(MAKE) image-python ; \ + fi + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \ + $(MAKE) image-ray ; \ + fi + @if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \ + $(MAKE) image-spark ; \ + fi + -rm -rf data-processing-dist + test-src:: .transforms.test-src setup:: .transforms.setup -publish-default:: publish-image - -publish-image-default:: .defaults.publish-image - -test-image-default:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean - -build-lib-wheel: - make -C $(REPOROOT)/data-processing-lib build-pkg-dist - -image-default:: build-lib-wheel - @$(eval LIB_WHEEL_FILE := $(shell find $(REPOROOT)/data-processing-lib/dist/*.whl)) - rm -fr dist && mv $(REPOROOT)/data-processing-lib/dist . - $(eval WHEEL_FILE_NAME := $(shell basename $(LIB_WHEEL_FILE))) - $(DOCKER) build -t $(DOCKER_IMAGE_NAME) $(DOCKER_BUILD_EXTRA_ARGS) \ - --platform $(DOCKER_PLATFORM) \ - --build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \ - --build-arg BASE_IMAGE=$(RAY_BASE_IMAGE) \ - --build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \ - --build-arg WHEEL_FILE_NAME=$(WHEEL_FILE_NAME) \ - --build-arg TRANSFORM_NAME=$(TRANSFORM_NAME) \ - --build-arg GIT_COMMIT=$(shell git log -1 --format=%h) . 
- $(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE) rm -fr dist +kind-load-image:: .transforms.kind-load-image + +.PHONY: workflow-venv +workflow-venv: if [ -e kfp_ray ]; then \ $(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-venv; \ fi + +.PHONY: workflow-test +workflow-test: if [ -e kfp_ray ]; then \ $(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-test; \ fi + +.PHONY: workflow-upload +workflow-upload: if [ -e kfp_ray ]; then \ $(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-upload; \ fi + +.PHONY: workflow-build +workflow-build: if [ -e kfp_ray ]; then \ $(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-build; \ fi
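
Note on the ">=0.2.3.dev0" specifiers introduced throughout this series: under PEP 440, a range specifier admits pre-release versions only when the specifier itself names one, so widening ">=0.2.2" to ">=0.2.3.dev0" is what lets the transform requirements resolve the unreleased toolkit wheel without passing --pre to pip. The following minimal sketch, which is illustrative only and not part of the patch, shows that behaviour with the third-party packaging library:

    # Illustrative only: how PEP 440 treats the specifiers used in this patch series.
    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    dev_build = Version("0.2.3.dev0")

    # The new specifier explicitly names a pre-release, so pre-releases are admitted.
    print(SpecifierSet(">=0.2.3.dev0").contains(dev_build))  # True

    # The old specifier does not, so the dev wheel would be skipped by default.
    print(SpecifierSet(">=0.2.2").contains(dev_build))       # False

The reworked transforms/.make.cicd.targets can also be driven per runtime; for example, invoking "make image BUILD_SPECIFIC_RUNTIME=python" builds only the <transform>-python image, while leaving BUILD_SPECIFIC_RUNTIME unset builds every runtime for which a Dockerfile.<runtime> file exists.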