diff --git a/.asf.yaml b/.asf.yaml index 721b9f2d3dd7..8c067e7e4ee9 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -38,6 +38,7 @@ github: collaborators: - pcoet - olehborysevych + - rshamunov enabled_merge_buttons: squash: true diff --git a/.github/REVIEWERS.yml b/.github/REVIEWERS.yml index 4dc9b2b3de53..a0ec8f6eefde 100644 --- a/.github/REVIEWERS.yml +++ b/.github/REVIEWERS.yml @@ -33,7 +33,6 @@ labels: - ryanthompson591 - tvalentyn - pabloem - - y1chi exclusionList: [] - name: Java reviewers: diff --git a/.github/codecov.yml b/.github/codecov.yml index 0eaf91cdbdd6..c1c5dfb17bb4 100644 --- a/.github/codecov.yml +++ b/.github/codecov.yml @@ -64,6 +64,7 @@ ignore: - "**/*_test_py3*.py" - "**/*_microbenchmark.py" - "sdks/go/pkg/beam/register/register.go" + - "sdks/python/apache_beam/testing/benchmarks/nexmark/**" # See https://docs.codecov.com/docs/flags for options. flag_management: diff --git a/.github/workflows/build_playground_backend.yml b/.github/workflows/build_playground_backend.yml index 4aa0fd294931..c9e705d24c98 100644 --- a/.github/workflows/build_playground_backend.yml +++ b/.github/workflows/build_playground_backend.yml @@ -42,7 +42,7 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v3 - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v3.6.0 with: distribution: 'zulu' java-version: '8' diff --git a/.github/workflows/build_playground_frontend.yml b/.github/workflows/build_playground_frontend.yml index a27ce08d07ae..73e918f47005 100644 --- a/.github/workflows/build_playground_frontend.yml +++ b/.github/workflows/build_playground_frontend.yml @@ -43,7 +43,7 @@ jobs: steps: - name: Check out the repo uses: actions/checkout@v3 - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v3.6.0 with: distribution: 'zulu' java-version: '8' diff --git a/.github/workflows/java_tests.yml b/.github/workflows/java_tests.yml index b8e64c3fdf46..1a587e7b4919 100644 --- a/.github/workflows/java_tests.yml +++ b/.github/workflows/java_tests.yml 
@@ -172,7 +172,7 @@ jobs: project_id: ${{ secrets.GCP_PROJECT_ID }} export_default_credentials: true - name: Set Java Version - uses: actions/setup-java@v3 + uses: actions/setup-java@v3.6.0 with: distribution: 'zulu' java-version: 8 diff --git a/.github/workflows/playground_deploy_examples.yml b/.github/workflows/playground_deploy_examples.yml index d75e9473126f..541da7907246 100644 --- a/.github/workflows/playground_deploy_examples.yml +++ b/.github/workflows/playground_deploy_examples.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.8' - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v3.6.0 with: distribution: 'zulu' java-version: '8' diff --git a/.github/workflows/playground_examples_ci_reusable.yml b/.github/workflows/playground_examples_ci_reusable.yml index bde25c457404..43c72e7ec16c 100644 --- a/.github/workflows/playground_examples_ci_reusable.yml +++ b/.github/workflows/playground_examples_ci_reusable.yml @@ -100,7 +100,7 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.8' - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v3.6.0 with: distribution: 'zulu' java-version: '8' diff --git a/.github/workflows/run_rc_validation.yml b/.github/workflows/run_rc_validation.yml index 2407a0406168..e8d990912ffe 100644 --- a/.github/workflows/run_rc_validation.yml +++ b/.github/workflows/run_rc_validation.yml @@ -88,8 +88,7 @@ jobs: git config user.name $GITHUB_ACTOR git config user.email actions@"$RUNNER_NAME".local - name: Verify working branch name - run: - - sh ./ci_check_git_branch.sh $WORKING_BRANCH + run: ./scripts/ci/ci_check_git_branch.sh $WORKING_BRANCH - name: Create Pull Request run: | git checkout -b ${{env.WORKING_BRANCH}} ${{ env.RC_TAG }} --quiet @@ -121,7 +120,7 @@ jobs: - name: Setup Java JDK - uses: actions/setup-java@v3.5.1 + uses: actions/setup-java@v3.6.0 with: distribution: 'temurin' java-version: 11 @@ -188,7 +187,7 @@ jobs: uses: azure/setup-kubectl@v3 - name: Setup Java 
JDK - uses: actions/setup-java@v3.5.1 + uses: actions/setup-java@v3.6.0 with: distribution: 'temurin' java-version: 11 diff --git a/.github/workflows/tour_of_beam_backend_integration.yml b/.github/workflows/tour_of_beam_backend_integration.yml index 473088150840..47399e728dac 100644 --- a/.github/workflows/tour_of_beam_backend_integration.yml +++ b/.github/workflows/tour_of_beam_backend_integration.yml @@ -23,11 +23,15 @@ on: push: branches: ['master', 'release-*'] tags: 'v*' - paths: ['learning/tour-of-beam/backend/**'] + paths: + - 'learning/tour-of-beam/backend/**' + - 'playground/backend/**' pull_request: branches: ['master', 'release-*'] tags: 'v*' - paths: ['learning/tour-of-beam/backend/**'] + paths: + - 'learning/tour-of-beam/backend/**' + - 'playground/backend/**' # This allows a subsequently queued workflow run to interrupt previous runs concurrency: @@ -36,12 +40,23 @@ concurrency: env: TOB_LEARNING_ROOT: ./samples/learning-content - DATASTORE_PROJECT_ID: test-proj + # firebase + GOOGLE_CLOUD_PROJECT: demo-test-proj + FIREBASE_AUTH_EMULATOR_HOST: localhost:9099 + # datastore + DATASTORE_PROJECT_ID: demo-test-proj DATASTORE_EMULATOR_HOST: localhost:8081 DATASTORE_EMULATOR_DATADIR: ./datadir + # playground API + PLAYGROUND_ROUTER_HOST: localhost:8000 + + # GCF PORT_SDK_LIST: 8801 PORT_GET_CONTENT_TREE: 8802 PORT_GET_UNIT_CONTENT: 8803 + PORT_GET_USER_PROGRESS: 8804 + PORT_POST_UNIT_COMPLETE: 8805 + PORT_POST_USER_CODE: 8806 jobs: @@ -56,20 +71,12 @@ jobs: with: # pin to the biggest Go version supported by Cloud Functions runtime go-version: '1.16' - - # 1. 
Datastore emulator - - name: 'Set up Cloud SDK' - uses: 'google-github-actions/setup-gcloud@v0' - with: - version: 397.0.0 - project_id: ${{ env.DATASTORE_PROJECT_ID }} - install_components: 'beta,cloud-datastore-emulator' - - name: 'Start datastore emulator' - run: | - gcloud beta emulators datastore start \ - --data-dir=${{ env.DATASTORE_EMULATOR_DATADIR }} \ - --host-port=${{ env.DATASTORE_EMULATOR_HOST }} \ - --consistency=1 & + - name: Build Playground router image + run: ./gradlew playground:backend:containers:router:docker + working-directory: ${{ github.workspace }} + # 1. Start emulators + - name: Start emulators + run: docker-compose up -d # 2. start function-framework processes in BG - name: Compile CF @@ -80,14 +87,25 @@ jobs: run: PORT=${{ env.PORT_GET_CONTENT_TREE }} FUNCTION_TARGET=getContentTree ./tob_function & - name: Run getUnitContent in background run: PORT=${{ env.PORT_GET_UNIT_CONTENT }} FUNCTION_TARGET=getUnitContent ./tob_function & + - name: Run getUserProgress in background + run: PORT=${{ env.PORT_GET_USER_PROGRESS }} FUNCTION_TARGET=getUserProgress ./tob_function & + - name: Run postUnitComplete in background + run: PORT=${{ env.PORT_POST_UNIT_COMPLETE }} FUNCTION_TARGET=postUnitComplete ./tob_function & + - name: Run postUserCode in background + run: PORT=${{ env.PORT_POST_USER_CODE }} FUNCTION_TARGET=postUserCode ./tob_function & # 3. Load data in datastore: run CD step on samples/learning-content - name: Run CI/CD to populate datastore run: go run cmd/ci_cd/ci_cd.go - # 4. Check sdkList, getContentTree, getUnitContent: run integration tests + # 4. run integration tests - name: Go integration tests run: go test -v --tags integration ./integration_tests/... + + - name: Stop emulators + if: always() + run: docker-compose down + # 5. 
Compare storage/datastore/index.yml VS generated - name: Check index.yaml run: | diff --git a/.github/workflows/verify_release_build.yml b/.github/workflows/verify_release_build.yml index 5a128a4eec61..2ab76079e1d8 100644 --- a/.github/workflows/verify_release_build.yml +++ b/.github/workflows/verify_release_build.yml @@ -39,9 +39,8 @@ jobs: RELEASE_VER: ${{ github.event.inputs.RELEASE_VER }} steps: - name: Verify branch name - run: - - sh ./ci_check_git_branch.sh $WORKING_BRANCH - working-directory: 'scripts/ci' + run: ./scripts/ci/ci_check_git_branch.sh $WORKING_BRANCH + - name: Set RELEASE_BRANCH env variable run: | RELEASE_BRANCH=release-${{env.RELEASE_VER}} diff --git a/.test-infra/jenkins/Flink.groovy b/.test-infra/jenkins/Flink.groovy index 2aecf8ea8311..4aadf6943ed7 100644 --- a/.test-infra/jenkins/Flink.groovy +++ b/.test-infra/jenkins/Flink.groovy @@ -17,7 +17,7 @@ */ class Flink { - private static final String flinkDownloadUrl = 'https://archive.apache.org/dist/flink/flink-1.12.3/flink-1.12.3-bin-scala_2.11.tgz' + private static final String flinkDownloadUrl = 'https://archive.apache.org/dist/flink/flink-1.13.6/flink-1.13.6-bin-scala_2.12.tgz' private static final String hadoopDownloadUrl = 'https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar' private static final String FLINK_DIR = '"$WORKSPACE/src/.test-infra/dataproc"' private static final String FLINK_SCRIPT = 'flink_cluster.sh' @@ -75,7 +75,7 @@ class Flink { } /** - * Updates the number of worker nodes in a cluster. + * Updates the number of worker nodes in a cluster. 
* * @param workerCount - the new number of worker nodes in the cluster */ diff --git a/.test-infra/jenkins/README.md b/.test-infra/jenkins/README.md index c860435ea743..e53dae86c458 100644 --- a/.test-infra/jenkins/README.md +++ b/.test-infra/jenkins/README.md @@ -140,9 +140,9 @@ Beam Jenkins overview page: [link](https://ci-beam.apache.org/) | beam_PerformanceTests_AvroIOIT | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT/), [hdfs_cron](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT_HDFS/) | `Run Java AvroIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT) [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT_HDFS/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_AvroIOIT_HDFS) | | beam_PerformanceTests_BiqQueryIO_Read_Python | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Read_Python/), [phrase](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_BiqQueryIO_Read_Python_PR/) | `Run BigQueryIO Read Performance Test Python` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Read_Python/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Read_Python) | | beam_PerformanceTests_BiqQueryIO_Write_Python_Batch | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch/), [phrase](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch_PR/) | `Run BigQueryIO Write Performance Test Python Batch` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Write_Python_Batch) | -| beam_BiqQueryIO_Batch_Performance_Test_Java_Avro | 
[cron](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Avro/) | `Run BigQueryIO Batch Performance Test Java Avro` | [![Build Status](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Avro/badge/icon)](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Avro/) | -| beam_BiqQueryIO_Batch_Performance_Test_Java_Json | [cron](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Json/) | `Run BigQueryIO Batch Performance Test Java Json` | [![Build Status](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Json/badge/icon)](https://ci-beam.apache.org/job/beam_BiqQueryIO_Batch_Performance_Test_Java_Json/) | -| beam_BiqQueryIO_Streaming_Performance_Test_Java | [cron](https://ci-beam.apache.org/job/beam_BiqQueryIO_Streaming_Performance_Test_Java/) | `Run BigQueryIO Streaming Performance Test Java` | [![Build Status](https://ci-beam.apache.org/job/beam_BiqQueryIO_Streaming_Performance_Test_Java/badge/icon)](https://ci-beam.apache.org/job/beam_BiqQueryIO_Streaming_Performance_Test_Java/) | +| beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro/) | `Run BigQueryIO Batch Performance Test Java Avro` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro/) | +| beam_PerformanceTests_BiqQueryIO_Batch_Java_Json | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Json/) | `Run BigQueryIO Batch Performance Test Java Json` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Json/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Batch_Java_Json/) | +| beam_PerformanceTests_BiqQueryIO_Streaming_Java | 
[cron](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Streaming_Java/) | `Run BigQueryIO Streaming Performance Test Java` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Streaming_Java/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_BiqQueryIO_Streaming_Java/) | | beam_PerformanceTests_Cdap | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_Cdap/) | `Run Java CdapIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_Cdap/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_Cdap) | | beam_PerformanceTests_Compressed_TextIOIT | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT/), [hdfs_cron](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT_HDFS/) | `Run Java CompressedTextIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT) [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT_HDFS/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_Compressed_TextIOIT_HDFS) | | beam_PerformanceTests_HadoopFormat | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_HadoopFormat/) | `Run Java HadoopFormatIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_HadoopFormat/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_HadoopFormat) | @@ -155,6 +155,7 @@ Beam Jenkins overview page: [link](https://ci-beam.apache.org/) | beam_PerformanceTests_PubsubIOIT_Python_Streaming | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_PubsubIOIT_Python_Streaming/), [phrase](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_PubsubIOIT_Python_Streaming_PR/) | `Run PubsubIO Performance Test Python` | [![Build 
Status](https://ci-beam.apache.org/job/beam_PerformanceTests_PubsubIOIT_Python_Streaming/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_PubsubIOIT_Python_Streaming) | | beam_PerformanceTests_SpannerIO_Read_2GB_Python | [cron](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_SpannerIO_Read_2GB_Python/), [phrase](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_SpannerIO_Read_2GB_Python_PR/) | `Run SpannerIO Read 2GB Performance Test Python Batch` | [![Build Status](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_SpannerIO_Read_2GB_Python/badge/icon)](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_SpannerIO_Read_2GB_Python/) | | beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch/), [phrase](https://ci-beam.apache.org/view/PerformanceTests/job/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch_PR/) | `Run SpannerIO Write 2GB Performance Test Python Batch` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_SpannerIO_Write_2GB_Python_Batch) | +| beam_PerformanceTests_SparkReceiver_IO | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_SparkReceiver_IO/) | `Run Java SparkReceiverIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_SparkReceiver_IO/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_SparkReceiver_IO) | | beam_PerformanceTests_TFRecordIOIT | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_TFRecordIOIT/) | `Run Java TFRecordIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_TFRecordIOIT/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_TFRecordIOIT) | | beam_PerformanceTests_TextIOIT 
| [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT/), [hdfs_cron](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT_HDFS/) | `Run Java TextIO Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT) [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT_HDFS/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_TextIOIT_HDFS) | | beam_PerformanceTests_WordCountIT_Py37 | [cron](https://ci-beam.apache.org/job/beam_PerformanceTests_WordCountIT_Py37/) | `Run Python37 WordCountIT Performance Test` | [![Build Status](https://ci-beam.apache.org/job/beam_PerformanceTests_WordCountIT_Py37/badge/icon)](https://ci-beam.apache.org/job/beam_PerformanceTests_WordCountIT_Py37) | diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy index 50863a0ddf1f..b88a3fafc2d4 100644 --- a/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy +++ b/.test-infra/jenkins/job_LoadTests_Combine_Flink_Python.groovy @@ -132,7 +132,7 @@ def loadTestJob = { scope, triggeringContext, mode -> "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" ], initialParallelism, - "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.12_job_server:latest") + "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.13_job_server:latest") // Execute all scenarios connected with initial parallelism. 
loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, initialScenarios, 'Combine', mode) diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy index be395c829e49..ade6bc16a69b 100644 --- a/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy +++ b/.test-infra/jenkins/job_LoadTests_GBK_Flink_Python.groovy @@ -146,7 +146,7 @@ def loadTest = { scope, triggeringContext -> "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" ], numberOfWorkers, - "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.12_job_server:latest") + "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.13_job_server:latest") def configurations = testScenarios.findAll { it.pipelineOptions?.parallelism?.value == numberOfWorkers } loadTestsBuilder.loadTests(scope, sdk, configurations, "GBK", "batch") diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy index 793e06109d45..d07964d0d448 100644 --- a/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy +++ b/.test-infra/jenkins/job_LoadTests_ParDo_Flink_Python.groovy @@ -320,7 +320,7 @@ def loadTestJob = { scope, triggeringContext, mode -> "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" ], numberOfWorkers, - "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.12_job_server:latest") + "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.13_job_server:latest") loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, testScenarios, 'ParDo', mode) } diff --git a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy index 3e7dbaa706aa..e1bb58cbdc85 100644 --- a/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy +++ b/.test-infra/jenkins/job_LoadTests_coGBK_Flink_Python.groovy @@ -137,7 +137,7 @@ def loadTest = { scope, triggeringContext -> "${DOCKER_CONTAINER_REGISTRY}/${DOCKER_BEAM_SDK_IMAGE}" ], numberOfWorkers, - 
"${DOCKER_CONTAINER_REGISTRY}/beam_flink1.12_job_server:latest") + "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.13_job_server:latest") loadTestsBuilder.loadTests(scope, CommonTestProperties.SDK.PYTHON, testScenarios, 'CoGBK', 'batch') } diff --git a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy b/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy index 1d8ce84ea12d..c3d0ae1f78cd 100644 --- a/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy +++ b/.test-infra/jenkins/job_PerformanceTests_BigQueryIO_Java.groovy @@ -24,9 +24,9 @@ def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) def jobConfigs = [ [ - title : 'BigQueryIO Streaming Performance Test Java 10 GB', + title : 'BigQueryIO Performance Test Streaming Java 10 GB', triggerPhrase: 'Run BigQueryIO Streaming Performance Test Java', - name : 'beam_BiqQueryIO_Streaming_Performance_Test_Java', + name : 'beam_PerformanceTests_BiqQueryIO_Streaming_Java', itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', properties: [ project : 'apache-beam-testing', @@ -34,6 +34,7 @@ def jobConfigs = [ tempRoot : 'gs://temp-storage-for-perf-tests/loadtests', writeMethod : 'STREAMING_INSERTS', writeFormat : 'JSON', + pipelineTimeout : '1200', testBigQueryDataset : 'beam_performance', testBigQueryTable : 'bqio_write_10GB_java_stream_' + now, metricsBigQueryDataset: 'beam_performance', @@ -53,9 +54,9 @@ def jobConfigs = [ ] ], [ - title : 'BigQueryIO Batch Performance Test Java 10 GB JSON', + title : 'BigQueryIO Performance Test Batch Java 10 GB JSON', triggerPhrase: 'Run BigQueryIO Batch Performance Test Java Json', - name : 'beam_BiqQueryIO_Batch_Performance_Test_Java_Json', + name : 'beam_PerformanceTests_BiqQueryIO_Batch_Java_Json', itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', properties: [ project : 'apache-beam-testing', @@ -82,9 +83,9 @@ def jobConfigs = [ ] ], [ - title : 'BigQueryIO Batch Performance Test Java 10 GB AVRO', + title : 
'BigQueryIO Performance Test Batch Java 10 GB AVRO', triggerPhrase: 'Run BigQueryIO Batch Performance Test Java Avro', - name : 'beam_BiqQueryIO_Batch_Performance_Test_Java_Avro', + name : 'beam_PerformanceTests_BiqQueryIO_Batch_Java_Avro', itClass : 'org.apache.beam.sdk.bigqueryioperftests.BigQueryIOIT', properties: [ project : 'apache-beam-testing', diff --git a/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy new file mode 100644 index 000000000000..0bfb01b43ce7 --- /dev/null +++ b/.test-infra/jenkins/job_PerformanceTests_SparkReceiverIO_IT.groovy @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import CommonJobProperties as common +import Kubernetes +import InfluxDBCredentialsHelper + +String jobName = "beam_PerformanceTests_SparkReceiver_IO" + +/** + * This job runs the SparkReceiver IO performance tests. + It runs on a RabbitMQ cluster that is built by applying the folder .test-infra/kubernetes/rabbit, + in an existing Kubernetes cluster (DEFAULT_CLUSTER in Kubernetes.groovy). + The services created to run this test are: + Pods: 1 RabbitMQ pod. 
+ Services: 1 broker + When the performance tests finish all resources are cleaned up by a postBuild step in Kubernetes.groovy + */ +job(jobName) { + common.setTopLevelMainJobProperties(delegate, 'master', 120) + common.setAutoJob(delegate, 'H H/6 * * *') + common.enablePhraseTriggeringFromPullRequest( + delegate, + 'Java SparkReceiverIO Performance Test', + 'Run Java SparkReceiverIO Performance Test') + InfluxDBCredentialsHelper.useCredentials(delegate) + + String namespace = common.getKubernetesNamespace(jobName) + String kubeconfig = common.getKubeconfigLocationForNamespace(namespace) + Kubernetes k8s = Kubernetes.create(delegate, kubeconfig, namespace) + + k8s.apply(common.makePathAbsolute("src/.test-infra/kubernetes/rabbit/rabbitmq.yaml")) + String rabbitMqHostName = "LOAD_BALANCER_IP" + k8s.loadBalancerIP("rabbitmq", rabbitMqHostName) + + Map pipelineOptions = [ + tempRoot : 'gs://temp-storage-for-perf-tests', + project : 'apache-beam-testing', + runner : 'DataflowRunner', + sourceOptions : """ + { + "numRecords": "600000", + "keySizeBytes": "1", + "valueSizeBytes": "90" + } + """.trim().replaceAll("\\s", ""), + bigQueryDataset : 'beam_performance', + bigQueryTable : 'sparkreceiverioit_results', + influxMeasurement : 'sparkreceiverioit_results', + influxDatabase : InfluxDBCredentialsHelper.InfluxDBDatabaseName, + influxHost : InfluxDBCredentialsHelper.InfluxDBHostUrl, + rabbitMqBootstrapServerAddress: "amqp://guest:guest@\$${rabbitMqHostName}:5672", + streamName : 'rabbitMqTestStream', + readTimeout : '900', + numWorkers : '5', + autoscalingAlgorithm : 'NONE' + ] + + steps { + gradle { + rootBuildScriptDir(common.checkoutDir) + common.setGradleSwitches(delegate) + switches("--info") + switches("-DintegrationTestPipelineOptions=\'${common.joinOptionsWithNestedJsonValues(pipelineOptions)}\'") + switches("-DintegrationTestRunner=dataflow") + tasks(":sdks:java:io:sparkreceiver:integrationTest --tests org.apache.beam.sdk.io.sparkreceiver.SparkReceiverIOIT") + } + 
} +} diff --git a/.test-infra/jenkins/job_PostCommit_Python_Chicago_Taxi_Example_Flink.groovy b/.test-infra/jenkins/job_PostCommit_Python_Chicago_Taxi_Example_Flink.groovy index 2874fc3bad3a..516bf028714c 100644 --- a/.test-infra/jenkins/job_PostCommit_Python_Chicago_Taxi_Example_Flink.groovy +++ b/.test-infra/jenkins/job_PostCommit_Python_Chicago_Taxi_Example_Flink.groovy @@ -38,7 +38,7 @@ def chicagoTaxiJob = { scope -> "${DOCKER_CONTAINER_REGISTRY}/${beamSdkDockerImage}" ], numberOfWorkers, - "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.12_job_server:latest") + "${DOCKER_CONTAINER_REGISTRY}/beam_flink1.13_job_server:latest") def pipelineOptions = [ parallelism : numberOfWorkers, diff --git a/.test-infra/kubernetes/rabbit/rabbitmq.yaml b/.test-infra/kubernetes/rabbit/rabbitmq.yaml new file mode 100644 index 000000000000..72bd41d5a92c --- /dev/null +++ b/.test-infra/kubernetes/rabbit/rabbitmq.yaml @@ -0,0 +1,187 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: rabbitmq +--- +kind: Service +apiVersion: v1 +metadata: + name: rabbitmq-internal + labels: + app: rabbitmq +spec: + clusterIP: None + ports: + - name: http + protocol: TCP + port: 15672 + - name: amqp + protocol: TCP + port: 5672 + selector: + app: rabbitmq +--- +kind: Service +apiVersion: v1 +metadata: + name: rabbitmq + labels: + app: rabbitmq + type: LoadBalancer +spec: + type: LoadBalancer + ports: + - name: http + protocol: TCP + port: 15672 + - name: amqp + protocol: TCP + port: 5672 + selector: + app: rabbitmq +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: rabbitmq-config +data: + enabled_plugins: | + [rabbitmq_management,rabbitmq_peer_discovery_k8s]. + + rabbitmq.conf: | + loopback_users = none + cluster_formation.peer_discovery_backend = rabbit_peer_discovery_k8s + cluster_formation.k8s.host = kubernetes.default.svc.cluster.local + cluster_formation.k8s.port = 443 + ### cluster_formation.k8s.address_type = ip + cluster_formation.k8s.address_type = hostname + cluster_formation.node_cleanup.interval = 10 + cluster_formation.node_cleanup.only_log_warning = true + cluster_partition_handling = autoheal + queue_master_locator=min-masters + cluster_formation.randomized_startup_delay_range.min = 0 + cluster_formation.randomized_startup_delay_range.max = 2 + cluster_formation.k8s.service_name = rabbitmq-internal + cluster_formation.k8s.hostname_suffix = .rabbitmq-internal.our-namespace.svc.cluster.local +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: rabbitmq +spec: + selector: + matchLabels: + app: "rabbitmq" + serviceName: rabbitmq-internal + replicas: 1 + volumeClaimTemplates: + - metadata: + name: rabbitmq-data + namespace: rabbit-test + spec: + storageClassName: standard + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "3Gi" + template: + metadata: + labels: + app: rabbitmq + annotations: + scheduler.alpha.kubernetes.io/affinity: > + { + 
"podAntiAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": [{ + "labelSelector": { + "matchExpressions": [{ + "key": "app", + "operator": "In", + "values": ["rabbitmq"] + }] + }, + "topologyKey": "kubernetes.io/hostname" + }] + } + } + spec: + serviceAccountName: rabbitmq + terminationGracePeriodSeconds: 10 + containers: + - name: rabbitmq-k8s + image: rabbitmq:3.7 + volumeMounts: + - name: config-volume + mountPath: /etc/rabbitmq + - name: rabbitmq-data + mountPath: /var/lib/rabbitmq/mnesia + ports: + - name: http + protocol: TCP + containerPort: 15672 + - name: amqp + protocol: TCP + containerPort: 5672 + livenessProbe: + exec: + command: ["rabbitmqctl", "status"] + initialDelaySeconds: 60 + periodSeconds: 10 + timeoutSeconds: 10 + readinessProbe: + exec: + command: ["rabbitmqctl", "status"] + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 10 + imagePullPolicy: Always + env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: RABBITMQ_USE_LONGNAME + value: "true" + - name: RABBITMQ_NODENAME + value: "rabbit@$(HOSTNAME).rabbitmq-internal.$(NAMESPACE).svc.cluster.local" + - name: K8S_SERVICE_NAME + value: "rabbitmq-internal" + - name: RABBITMQ_ERLANG_COOKIE + value: "cookie" + volumes: + - name: config-volume + configMap: + name: rabbitmq-config + items: + - key: rabbitmq.conf + path: rabbitmq.conf + - key: enabled_plugins + path: enabled_plugins + - name: rabbitmq-data + persistentVolumeClaim: + claimName: rabbitmq-data diff --git a/.test-infra/metrics/grafana/dashboards/perftests_metrics/Java_IO_IT_Tests_Dataflow.json b/.test-infra/metrics/grafana/dashboards/perftests_metrics/Java_IO_IT_Tests_Dataflow.json index 1c6cde8bbb79..54eba316631f 100644 --- a/.test-infra/metrics/grafana/dashboards/perftests_metrics/Java_IO_IT_Tests_Dataflow.json +++ 
b/.test-infra/metrics/grafana/dashboards/perftests_metrics/Java_IO_IT_Tests_Dataflow.json @@ -103,7 +103,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT | 1 GB | GCS", + "title": "TextIOIT | GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -225,7 +225,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT | 1 GB | HDFS", + "title": "TextIOIT | HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -347,7 +347,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT GZIP | 1 GB | GCS", + "title": "TextIOIT | GZIP GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -469,7 +469,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT GZIP | 1 GB | HDFS", + "title": "TextIOIT | GZIP HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -591,7 +591,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT | 1 GB | GCP | \"Many files\"", + "title": "TextIOIT | \"Many files\" GCP | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -713,7 +713,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT | 1 GB | HDFS | \"Many files\"", + "title": "TextIOIT | \"Many files\" HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -837,7 +837,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TextIOIT | 1 GB | GCS | \"Many files\" | GCS Rename", + "title": "TextIOIT | \"Many files\" GCS Rename | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -855,7 +855,7 @@ "yaxes": [ { "$$hashKey": "object:403", - "format": "s", + "format": "none", "label": null, "logBase": 1, "max": null, @@ -959,7 +959,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "AvroIOIT | 1 GB | GCS", + "title": "AvroIOIT | GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1081,7 +1081,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "AvroIOIT | 1 GB | HDFS", + 
"title": "AvroIOIT | HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1203,7 +1203,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "XmlIOIT | 1 GB | GCS", + "title": "XmlIOIT | GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1325,7 +1325,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "XmlIOIT | 1 GB | HDFS", + "title": "XmlIOIT | HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1372,7 +1372,6 @@ "dashLength": 10, "dashes": false, "datasource": "BeamInfluxDB", - "description": "TODO: https://issues.apache.org/jira/browse/BEAM-7115", "fill": 1, "fillGradient": 0, "gridPos": { @@ -1448,7 +1447,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "TFRecordIO | 1 GB | GCS", + "title": "TFRecordIO | GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1571,7 +1570,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "ParquetIO | 1 GB | GCS", + "title": "ParquetIO | GCS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1694,7 +1693,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "ParquetIO | 1 GB | HDFS", + "title": "ParquetIO | HDFS | 1 GB", "tooltip": { "shared": true, "sort": 0, @@ -1817,7 +1816,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "MongoDBIO", + "title": "MongoDBIO | 10M records", "tooltip": { "shared": true, "sort": 0, @@ -1940,7 +1939,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "JdbcIO", + "title": "JdbcIO | 5M records", "tooltip": { "shared": true, "sort": 0, @@ -2063,7 +2062,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "HadoopFormatIO", + "title": "HadoopFormatIO | 600k records", "tooltip": { "shared": true, "sort": 0, @@ -2186,7 +2185,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "KafkaIO | 1GB", + "title": "KafkaIO | 100M records, 10 GB", "tooltip": { "shared": true, "sort": 0, @@ -2309,7 +2308,7 @@ 
"timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "BigQueryIO | batch | JSON", + "title": "BigQueryIO batch JSON | 10M records, 10 GB", "tooltip": { "shared": true, "sort": 0, @@ -2432,7 +2431,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "BigQueryIO | streaming | JSON", + "title": "BigQueryIO | streaming JSON | 10M records, 10 GB", "tooltip": { "shared": true, "sort": 0, @@ -2484,7 +2483,7 @@ "fillGradient": 0, "gridPos": { "h": 9, - "w": 24, + "w": 12, "x": 0, "y": 97 }, @@ -2555,7 +2554,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "BigQueryIO | batch | Avro", + "title": "BigQueryIO | batch Avro | 10M records, 10 GB", "tooltip": { "shared": true, "sort": 0, @@ -2608,8 +2607,8 @@ "gridPos": { "h": 9, "w": 12, - "x": 0, - "y": 106 + "x": 12, + "y": 97 }, "hiddenSeries": false, "id": 26, @@ -2678,7 +2677,130 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "CdapIO", + "title": "CdapIO | 600k records", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": true, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:403", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:404", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "BeamInfluxDB", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 106 + }, + "hiddenSeries": false, + "id": 27, + "interval": "6h", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": 
false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.7.2", + "pointradius": 2, + "points": true, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "$tag_metric", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + } + ], + "measurement": "", + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") FROM \"sparkreceiverioit_results\" WHERE \"metric\" =~ /time/ AND $timeFilter GROUP BY time($__interval), \"metric\"", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "SparkReceiverIO | 600k Records", "tooltip": { "shared": true, "sort": 0, @@ -2746,6 +2868,7 @@ }, "timezone": "", "title": "Java IO IT Tests | Dataflow", + "description": "Shows performance test metrics on Dataflow of Beam Java SDK.\nTests are named after 'IO Connector | Specifications | data size'.", "uid": "bnlHKP3Wz", "variables": { "list": [] diff --git a/.test-infra/metrics/grafana/dashboards/perftests_metrics/Python_IO_IT_Tests_Dataflow.json b/.test-infra/metrics/grafana/dashboards/perftests_metrics/Python_IO_IT_Tests_Dataflow.json index 570dc82e3d4b..5b1ff2b8103b 100644 --- a/.test-infra/metrics/grafana/dashboards/perftests_metrics/Python_IO_IT_Tests_Dataflow.json +++ b/.test-infra/metrics/grafana/dashboards/perftests_metrics/Python_IO_IT_Tests_Dataflow.json @@ -94,96 +94,7 @@ ] ], "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Reading 10GB of data | BigQuery native Dataflow IO", - "tooltip": { - "shared": 
true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:403", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true }, - { - "$$hashKey": "object:404", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "dashLength": 10, - "dashes": false, - "datasource": "BeamInfluxDB", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 0 - }, - "hiddenSeries": false, - "id": 3, - "interval": "24h", - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pluginVersion": "6.7.2", - "pointradius": 2, - "points": true, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ { "alias": "write_time", "groupBy": [ @@ -222,7 +133,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Writing 10GB of data | BigQuery native Dataflow IO", + "title": "BigQueryIO | Batch | 10 GB", "tooltip": { "shared": true, "sort": 0, @@ -274,8 +185,8 @@ "gridPos": { "h": 9, "w": 12, - "x": 0, - "y": 9 + "x": 12, + "y": 0 }, "hiddenSeries": false, "id": 4, @@ -338,96 +249,7 @@ ] ], "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Reading 2GB of data | Pubsub native Dataflow IO | streaming", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - 
"type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:403", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true }, - { - "$$hashKey": "object:404", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "dashLength": 10, - "dashes": false, - "datasource": "BeamInfluxDB", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 9 - }, - "hiddenSeries": false, - "id": 5, - "interval": "24h", - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pluginVersion": "6.7.2", - "pointradius": 2, - "points": true, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ { "alias": "write_time", "groupBy": [ @@ -466,7 +288,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Writing 2GB of data | Pubsub native Dataflow IO | streaming", + "title": "PubsubIO | Streaming | 2 GB", "tooltip": { "shared": true, "sort": 0, @@ -519,7 +341,7 @@ "h": 9, "w": 12, "x": 0, - "y": 18 + "y": 9 }, "hiddenSeries": false, "id": 6, @@ -582,96 +404,7 @@ ] ], "tags": [] - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Reading 2GB of data | Spanner native Dataflow IO", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "transparent": true, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - 
"values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:403", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true }, - { - "$$hashKey": "object:404", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "dashLength": 10, - "dashes": false, - "datasource": "BeamInfluxDB", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 18 - }, - "hiddenSeries": false, - "id": 7, - "interval": "24h", - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pluginVersion": "6.7.2", - "pointradius": 2, - "points": true, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ { "alias": "write_time", "groupBy": [ @@ -710,7 +443,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Writing 2GB of data | Spanner native Dataflow IO", + "title": "SpannerIO | native | 2 GB", "tooltip": { "shared": true, "sort": 0, @@ -777,6 +510,7 @@ }, "timezone": "", "title": "Python IO IT Tests | Dataflow", + "description": "Shows performance test metrics on Dataflow of Beam Python SDK.\nTests are named after 'IO Connector | Specifications | data size'.", "uid": "gP7vMPqZz", "variables": { "list": [] diff --git a/.test-infra/metrics/sync/github/sync.py b/.test-infra/metrics/sync/github/sync.py index d48fd7e43f29..543b19476c10 100644 --- a/.test-infra/metrics/sync/github/sync.py +++ b/.test-infra/metrics/sync/github/sync.py @@ -207,11 +207,16 @@ def fetchGHData(timestamp, ghQuery): query = ghQuery.replace('', tsString) return 
executeGHGraphqlQuery(query) +def extractUserLogin(user): + # user could be missing + if not user: + return "Unknown" + return user.get("login", "Unknown") def extractRequestedReviewers(pr): reviewEdges = pr["reviewRequests"]["edges"] return list( - map(lambda x: x["node"]["requestedReviewer"]["login"], reviewEdges)) + map(lambda x: extractUserLogin(x["node"]["requestedReviewer"]), reviewEdges)) def extractMentions(pr): @@ -238,24 +243,24 @@ def extractFirstNAActivity(pr): Returns timestamp and login of author on first activity on pull request done by non-author. ''' - author = pr["author"]["login"] + author = extractUserLogin(pr["author"]) commentEdges = None commentEdges = [ edge for edge in pr["comments"]["edges"] - if edge["node"]["author"]["login"] != author + if extractUserLogin(edge["node"]["author"]) != author ] reviewEdges = [ edge for edge in pr["reviews"]["edges"] - if edge["node"]["author"]["login"] != author + if extractUserLogin(edge["node"]["author"]) != author ] merged = pr["merged"] mergedAt = pr["mergedAt"] - mergedBy = None if not merged else pr["mergedBy"]["login"] + mergedBy = None if not merged else extractUserLogin(pr["mergedBy"]) commentTimestamps = list( - map(lambda x: (x["node"]["createdAt"], x["node"]["author"]["login"]), + map(lambda x: (x["node"]["createdAt"], extractUserLogin(x["node"]["author"])), commentEdges)) reviewTimestamps = list( - map(lambda x: (x["node"]["createdAt"], x["node"]["author"]["login"]), + map(lambda x: (x["node"]["createdAt"], extractUserLogin(x["node"]["author"])), reviewEdges)) allTimestamps = commentTimestamps + reviewTimestamps if merged: @@ -266,18 +271,18 @@ def extractFirstNAActivity(pr): def extractBeamReviewers(pr): '''Extract logins of users defined by Beam as reviewers.''' - author = pr['author']['login'] + author = extractUserLogin(pr['author']) # All the direct GitHub indicators of reviewers reviewers = [] for r in pr['assignees']['edges']: - reviewers.append(r['node']['login']) + 
reviewers.append(extractUserLogin(r['node'])) for r in pr['reviewRequests']['edges']: - reviewers.append(r['node']['requestedReviewer']['login']) + reviewers.append(extractUserLogin(r['node']['requestedReviewer'])) # GitHub users that have performed reviews. for r in pr['reviews']['edges']: - reviewers.append(r['node']['author']['login']) + reviewers.append(extractUserLogin(r['node']['author'])) # @r1, @r2 ... look/PTAL/ptal? beam_reviewer_regex = r'(@\w+).*?(?:PTAL|ptal|look)' @@ -303,7 +308,7 @@ def extractBeamReviewers(pr): def extractReviewers(pr): '''Extracts reviewers logins from PR.''' - return [edge["node"]["author"]["login"] for edge in pr["reviews"]["edges"]] + return [extractUserLogin(edge["node"]["author"]) for edge in pr["reviews"]["edges"]] def extractRowValuesFromPr(pr): @@ -318,7 +323,7 @@ def extractRowValuesFromPr(pr): reviewedBy = extractReviewers(pr) result = [ - pr["number"], pr["author"]["login"], pr["createdAt"], pr["updatedAt"], + pr["number"], extractUserLogin(pr["author"]), pr["createdAt"], pr["updatedAt"], pr["closedAt"], pr["merged"], firstNAActivity, firstNAAAuthor, requestedReviewers, mentions, beamReviewers, reviewedBy ] @@ -333,13 +338,13 @@ def extractRowValuesFromIssue(issue): ''' assignees = [] for a in issue['assignees']['edges']: - assignees.append(a['node']['login']) + assignees.append(extractUserLogin(a['node'])) labels = [] for l in issue['labels']['edges']: labels.append(l['node']['name']) result = [ - issue["number"], issue["author"]["login"], issue["createdAt"], issue["updatedAt"], + issue["number"], extractUserLogin(issue["author"]), issue["createdAt"], issue["updatedAt"], issue["closedAt"], issue["title"], assignees, labels ] diff --git a/CHANGES.md b/CHANGES.md index 1df3cb35b2bf..724a57e59aab 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -43,18 +43,51 @@ ## Bugfixes -* Fixed JmsIO acknowledgment issue (https://github.com/apache/beam/issues/20814) * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). 
## Known Issues * ([#X](https://github.com/apache/beam/issues/X)). --> +# [2.44.0] - Unreleased + +## Highlights + +* New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)). +* New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)). + +## I/Os + +* Support for X source added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). + +## New Features / Improvements + +* Local packages can now be used as dependencies in the requirements.txt file, rather + than requiring them to be passed separately via the `--extra_package` option. + ([#23684](https://github.com/apache/beam/pull/23684)) + +## Breaking Changes + +* `ParquetIO.withSplit` was removed since splittable reading has been the default behavior since 2.35.0. The effect of + this change is to drop support for non-splittable reading ([#23832](https://github.com/apache/beam/issues/23832)). + +## Deprecations + +* X behavior is deprecated and will be removed in X versions ([#X](https://github.com/apache/beam/issues/X)). + +## Bugfixes + +* Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Fixed JmsIO acknowledgment issue (https://github.com/apache/beam/issues/20814) +* Fixed Beam SQL CalciteUtils (Java) and cross-language JdbcIO (Python) not supporting JDBC CHAR/VARCHAR, BINARY/VARBINARY logical types ([#23747](https://github.com/apache/beam/issues/23747), [#23526](https://github.com/apache/beam/issues/23526)). +* Ensure iterated and emitted types used with the generic register package are registered with the type and schema registries. (Go) ([#23889](https://github.com/apache/beam/pull/23889)) + +# [2.43.0] - Unreleased + +## Highlights + * Python 3.10 support in Apache Beam ([#21458](https://github.com/apache/beam/issues/21458)). +* An initial implementation of a runner that allows us to run Beam pipelines on Dask. Try it out and give us feedback! 
(Python) ([#18962](https://github.com/apache/beam/issues/18962)). ## I/Os @@ -72,10 +105,13 @@ * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). * Dataframe wrapper added in Go SDK via Cross-Language (with automatic expansion service). (Go) ([#23384](https://github.com/apache/beam/issues/23384)). * Name all Java threads to aid in debugging ([#23049](https://github.com/apache/beam/issues/23049)). +* An initial implementation of a runner that allows us to run Beam pipelines on Dask. (Python) ([#18962](https://github.com/apache/beam/issues/18962)). ## Breaking Changes * Python SDK CoGroupByKey outputs an iterable allowing for arbitrarily large results. [#21556](https://github.com/apache/beam/issues/21556) Beam users may see an error on transforms downstream from CoGroupByKey. Users must change methods expecting a List to expect an Iterable going forward. See [document](https://docs.google.com/document/d/1RIzm8-g-0CyVsPb6yasjwokJQFoKHG4NjRUcKHKINu0) for information and fixes. +* The PortableRunner for Spark assumes Spark 3 as default Spark major version unless configured otherwise using `--spark_version`. + Spark 2 support is deprecated and will be removed soon ([#23728](https://github.com/apache/beam/issues/23728)). 
## Deprecations diff --git a/build.gradle.kts b/build.gradle.kts index 72d2a8e92584..38d2971303b8 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -117,7 +117,11 @@ tasks.rat { // Tour Of Beam backend autogenerated Datastore indexes "learning/tour-of-beam/backend/internal/storage/index.yaml", - + + // Tour Of Beam backend autogenerated Playground GRPC API stubs and mocks + "learning/tour-of-beam/backend/playground_api/api.pb.go", + "learning/tour-of-beam/backend/playground_api/api_grpc.pb.go", + "learning/tour-of-beam/backend/playground_api/mock.go", // test p8 file for SnowflakeIO "sdks/java/io/snowflake/src/test/resources/invalid_test_rsa_key.p8", diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 1f1fe4589ffc..6aa2e4859c59 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -546,6 +546,7 @@ class BeamModulePlugin implements Plugin { aws_java_sdk2_http_client_spi : "software.amazon.awssdk:http-client-spi:$aws_java_sdk2_version", aws_java_sdk2_regions : "software.amazon.awssdk:regions:$aws_java_sdk2_version", aws_java_sdk2_utils : "software.amazon.awssdk:utils:$aws_java_sdk2_version", + aws_java_sdk2_profiles : "software.amazon.awssdk:profiles:$aws_java_sdk2_version", bigdataoss_gcsio : "com.google.cloud.bigdataoss:gcsio:$google_cloud_bigdataoss_version", bigdataoss_util : "com.google.cloud.bigdataoss:util:$google_cloud_bigdataoss_version", byte_buddy : "net.bytebuddy:byte-buddy:1.12.14", @@ -603,7 +604,7 @@ class BeamModulePlugin implements Plugin { google_cloud_pubsub : "com.google.cloud:google-cloud-pubsub", // google_cloud_platform_libraries_bom sets version google_cloud_pubsublite : "com.google.cloud:google-cloud-pubsublite", // google_cloud_platform_libraries_bom sets version // The GCP Libraries BOM dashboard shows the 
versions set by the BOM: - // https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/25.2.0/artifact_details.html + // https://storage.googleapis.com/cloud-opensource-java-dashboard/com.google.cloud/libraries-bom/26.1.3/artifact_details.html // Update libraries-bom version on sdks/java/container/license_scripts/dep_urls_java.yaml google_cloud_platform_libraries_bom : "com.google.cloud:libraries-bom:26.1.3", google_cloud_spanner : "com.google.cloud:google-cloud-spanner", // google_cloud_platform_libraries_bom sets version @@ -725,6 +726,7 @@ class BeamModulePlugin implements Plugin { testcontainers_postgresql : "org.testcontainers:postgresql:$testcontainers_version", testcontainers_mysql : "org.testcontainers:mysql:$testcontainers_version", testcontainers_gcloud : "org.testcontainers:gcloud:$testcontainers_version", + testcontainers_rabbitmq : "org.testcontainers:rabbitmq:$testcontainers_version", vendored_grpc_1_48_1 : "org.apache.beam:beam-vendor-grpc-1_48_1:0.1", vendored_guava_26_0_jre : "org.apache.beam:beam-vendor-guava-26_0-jre:0.1", vendored_calcite_1_28_0 : "org.apache.beam:beam-vendor-calcite-1_28_0:0.2", @@ -913,6 +915,18 @@ class BeamModulePlugin implements Plugin { project.tasks.withType(JavaCompile).configureEach { options.encoding = "UTF-8" + // Use --release 8 when targeting Java 8 and running on JDK > 8 + // + // Consider migrating compilation and testing to use JDK 9+ and setting '--release=8' as + // the default allowing 'applyJavaNature' to override it for the few modules that need JDK 9+ + // artifacts. See https://stackoverflow.com/a/43103038/4368200 for additional details. 
+ if (JavaVersion.VERSION_1_8.compareTo(JavaVersion.toVersion(project.javaVersion)) == 0 + && JavaVersion.VERSION_1_8.compareTo(JavaVersion.current()) < 0) { + options.compilerArgs += ['--release', '8'] + // TODO(https://github.com/apache/beam/issues/23901): Fix + // optimizerOuterThis breakage + options.compilerArgs += ['-XDoptimizeOuterThis=false'] + } // As we want to add '-Xlint:-deprecation' we intentionally remove '-Xlint:deprecation' from compilerArgs here, // as intellij is adding this, see https://youtrack.jetbrains.com/issue/IDEA-196615 options.compilerArgs -= [ @@ -981,10 +995,10 @@ class BeamModulePlugin implements Plugin { 'org.checkerframework.checker.nullness.NullnessChecker' ] - if (parseBooleanProperty(project, 'enableCheckerFramework') || project.jenkins.isCIBuild) { - skipCheckerFramework = false - } else { + if (!parseBooleanProperty(project, 'enableCheckerFramework') && !project.jenkins.isCIBuild) { skipCheckerFramework = true + } else { + skipCheckerFramework = false } // Always exclude checkerframework on tests. 
It's slow, and it often @@ -1919,9 +1933,7 @@ class BeamModulePlugin implements Plugin { } if (runner?.equalsIgnoreCase('spark')) { - testRuntimeOnly it.project(path: ":runners:spark:2", configuration: "testRuntimeMigration") - testRuntimeOnly project.library.java.spark_core - testRuntimeOnly project.library.java.spark_streaming + testRuntimeOnly it.project(path: ":runners:spark:3", configuration: "testRuntimeMigration") // Testing the Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath project.configurations.testRuntimeClasspath { @@ -2679,7 +2691,7 @@ class BeamModulePlugin implements Plugin { dependsOn = [installGcpTest] mustRunAfter = [ ":runners:flink:${project.ext.latestFlinkVersion}:job-server:shadowJar", - ':runners:spark:2:job-server:shadowJar', + ':runners:spark:3:job-server:shadowJar', ':sdks:python:container:py37:docker', ':sdks:python:container:py38:docker', ':sdks:python:container:py39:docker', @@ -2695,7 +2707,7 @@ class BeamModulePlugin implements Plugin { "--parallelism=2", "--sdk_worker_parallelism=1", "--flink_job_server_jar=${project.project(flinkJobServerProject).shadowJar.archivePath}", - "--spark_job_server_jar=${project.project(':runners:spark:2:job-server').shadowJar.archivePath}", + "--spark_job_server_jar=${project.project(':runners:spark:3:job-server').shadowJar.archivePath}", ] if (isStreaming) options += [ diff --git a/examples/java/build.gradle b/examples/java/build.gradle index 13b2518bf382..aa51dcfeae85 100644 --- a/examples/java/build.gradle +++ b/examples/java/build.gradle @@ -109,13 +109,8 @@ dependencies { } directRunnerPreCommit project(path: ":runners:direct-java", configuration: "shadow") flinkRunnerPreCommit project(":runners:flink:${project.ext.latestFlinkVersion}") - // TODO: Make the netty version used configurable, we add netty-all 4.1.17.Final so it appears on the classpath - // before 4.1.8.Final defined by Apache Beam - sparkRunnerPreCommit "io.netty:netty-all:4.1.17.Final" - 
sparkRunnerPreCommit project(":runners:spark:2") + sparkRunnerPreCommit project(":runners:spark:3") sparkRunnerPreCommit project(":sdks:java:io:hadoop-file-system") - sparkRunnerPreCommit library.java.spark_streaming - sparkRunnerPreCommit library.java.spark_core } /* diff --git a/examples/multi-language/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java b/examples/java/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java similarity index 100% rename from examples/multi-language/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java rename to examples/java/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java diff --git a/examples/kotlin/build.gradle b/examples/kotlin/build.gradle index 0aa3dc257b09..79a1248712d0 100644 --- a/examples/kotlin/build.gradle +++ b/examples/kotlin/build.gradle @@ -81,13 +81,8 @@ dependencies { } directRunnerPreCommit project(path: ":runners:direct-java", configuration: "shadow") flinkRunnerPreCommit project(":runners:flink:${project.ext.latestFlinkVersion}") - // TODO: Make the netty version used configurable, we add netty-all 4.1.17.Final so it appears on the classpath - // before 4.1.8.Final defined by Apache Beam - sparkRunnerPreCommit "io.netty:netty-all:4.1.17.Final" - sparkRunnerPreCommit project(":runners:spark:2") + sparkRunnerPreCommit project(":runners:spark:3") sparkRunnerPreCommit project(":sdks:java:io:hadoop-file-system") - sparkRunnerPreCommit library.java.spark_streaming - sparkRunnerPreCommit library.java.spark_core } /* diff --git a/examples/multi-language/README.md b/examples/multi-language/README.md index 127ab8c30eb2..072052b4cd56 100644 --- a/examples/multi-language/README.md +++ b/examples/multi-language/README.md @@ -126,9 +126,25 @@ gsutil cat gs://$GCP_BUCKET/multi-language-beam/output* #### Instructions for running the Java pipeline at HEAD (Beam 2.41.0 and 2.42.0). 
+* Activate a new virtual environment following +[these instructions](https://beam.apache.org/get-started/quickstart-py/#create-and-activate-a-virtual-environment). + +* Install the Apache Beam package with GCP support and the `sklearn` package. + +``` +pip install apache-beam[gcp] +pip install sklearn +``` + +* Start up the expansion service. + +``` +python -m apache_beam.runners.portability.expansion_service_main -p --fully_qualified_name_glob "*" +``` + * Make sure that Docker is installed and available on your system. -* Build and push Python and Java Docker containers. +* In a different shell, build and push Python and Java Docker containers. ``` export DOCKER_ROOT= @@ -137,7 +153,7 @@ export DOCKER_ROOT= docker push $DOCKER_ROOT/beam_python3.8_sdk:latest -./gradlew :sdks:java:container:java11:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest +./gradlew :sdks:java:container:java11:docker -Pdocker-repository-root=$DOCKER_ROOT -Pdocker-tag=latest -Pjava11Home=$JAVA_HOME docker push $DOCKER_ROOT/beam_java11_sdk:latest ``` @@ -149,6 +165,10 @@ Note that we override both the Java and Python SDK harness containers here. export GCP_PROJECT= export GCP_BUCKET= export GCP_REGION= +export EXPANSION_SERVICE_PORT= + +# This removes any existing output. +gsutil rm gs://$GCP_BUCKET/multi-language-beam/output* ./gradlew :examples:multi-language:sklearnMinstClassification --args=" \ --runner=DataflowRunner \ @@ -157,6 +177,7 @@ export GCP_REGION= --output=gs://$GCP_BUCKET/multi-language-beam/output \ --sdkContainerImage=$DOCKER_ROOT/beam_java11_sdk:latest \ --sdkHarnessContainerImageOverrides=.*python.*,$DOCKER_ROOT/beam_python3.8_sdk:latest \ +--expansionService=localhost:$EXPANSION_SERVICE_PORT \ --region=${GCP_REGION}" ``` @@ -166,3 +187,9 @@ of the digit. The second item is the predicted label of the digit. 
``` gsutil cat gs://$GCP_BUCKET/multi-language-beam/output* ``` + +### Python Dataframe Wordcount + +This example is covered in the [Java multi-language pipelines quickstart](https://beam.apache.org/documentation/sdks/java-multi-language-pipelines/). +The pipeline source code is available at +[PythonDataframeWordCount.java](https://github.com/apache/beam/tree/master/examples/java/src/main/java/org/apache/beam/examples/multilanguage/PythonDataframeWordCount.java). diff --git a/examples/multi-language/build.gradle b/examples/multi-language/build.gradle index 61fdb686f4eb..b266faeb8f17 100644 --- a/examples/multi-language/build.gradle +++ b/examples/multi-language/build.gradle @@ -40,7 +40,6 @@ dependencies { runtimeOnly project(path: ":runners:portability:java") implementation library.java.vendored_guava_26_0_jre implementation project(":sdks:java:expansion-service") - implementation project(":sdks:java:extensions:python") permitUnusedDeclared project(":sdks:java:expansion-service") // BEAM-11761 } diff --git a/examples/notebooks/beam-ml/custom_remote_inference.ipynb b/examples/notebooks/beam-ml/custom_remote_inference.ipynb new file mode 100644 index 000000000000..713c65599656 --- /dev/null +++ b/examples/notebooks/beam-ml/custom_remote_inference.ipynb @@ -0,0 +1,625 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "paYiulysGrwR" + }, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0UGzzndTBPWQ" + }, + "source": [ + "# Remote inference in Beam\n", + "\n", + "The preferred way of running inference in Beam is by using the [RunInference API](https://beam.apache.org/documentation/sdks/python-machine-learning/). The RunInference API enables you to run your models as part of your pipeline in a way that is optimized for machine learning inference. It supports features such as batching, so that you do not need to take care of it yourself. For more info on the RunInference API you can check out the [RunInference notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb), which demonstrates how you can implement model inference in pytorch, scikit-learn and tensorflow.\n", + "\n", + "As of now, the RunInference API doesn't support making remote inference calls (e.g. Natural Language API, Cloud Vision API and others). Therefore, in order to use these remote APIs with Beam, one needs to write a custom inference call. \n", + "\n", + "This notebook shows how you can implement such a custom inference call in Beam. We are using Cloud Vision API for demonstration. 
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GNbarEZsalS1" + }, + "source": [ + "## Use case: run Cloud Vision API\n", + "\n", + "The Cloud Vision API can be used to retrieve labels that describe an image.\n", + "For example:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q-jVQn3maZ81" + }, + "source": [ + "![Capture.PNG]()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4io1vzkzF683" + }, + "source": [ + "We want to run the Google Cloud Vision API on a large set of images. Beam is the ideal tool to handle this. In this notebook we will show how to retrieve image labels with this API on a small set of images.\n", + "\n", + "The steps needed to implement this are shown in the notebook:\n", + "* read the images\n", + "* batch your images together to optimize your model call\n", + "* send your images to an external API to run inference\n", + "* post-process the results of your API\n", + "\n", + "⚠️ Beware of API quotas and the heavy load you might incur on your external API. Make sure you have set up your pipeline and API correctly for your use case.\n", + "\n", + "For optimizing the calls to the external API, you can configure [PipelineOptions](https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options) to limit the parallel calls to the external remote API. 
Different Runners in Beam provide options to handle the parallelism, for example:\n", + "* [DirectRunner](https://beam.apache.org/documentation/runners/direct/) provides `direct_num_workers`.\n", + "* [DataflowRunner](https://beam.apache.org/documentation/runners/dataflow/) provides `max_num_workers`.\n", + "\n", + "You can find details about other runners here: [Link](https://beam.apache.org/documentation/runners/capability-matrix/) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FAawWOaiIYaS" + }, + "source": [ + "## Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XhpKOxINrIqz" + }, + "source": [ + "Install dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bA7MLR8OptJw" + }, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "!pip install protobuf==3.19.4\n", + "!pip install apache-beam[interactive,gcp]>=2.40.0\n", + "!pip install google-cloud-vision==3.1.1\n", + "!pip install requests\n", + "\n", + "# restart the runtime in order to use newly installed versions\n", + "exit() " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "C-RVR2eprc0r" + }, + "source": [ + "Authenticate with Google so that you will be able to use the Cloud Vision API." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qGDJCbxgTprh" + }, + "outputs": [], + "source": [ + "# Follow the steps to configure your GCP setup\n", + "!gcloud init --console-only" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "74acX7AlT91N" + }, + "outputs": [], + "source": [ + "\n", + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mL4MaHm_XOVd" + }, + "source": [ + "## Remote inference on Google Cloud vision API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gE0go8CpnTy3" + }, + "outputs": [], + "source": [ + "from typing import List\n", + "import io\n", + "import os\n", + "import requests\n", + "\n", + "from google.cloud import vision\n", + "from google.cloud.vision_v1.types import Feature\n", + "import apache_beam as beam" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "09k08IYlLmON" + }, + "source": [ + "For this use case we have selected some images part of the [MSCoco dataset](https://cocodataset.org/#explore), as a list of image urls. This is what we will use as input for our pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_89eN_1QeYEd" + }, + "outputs": [], + "source": [ + "image_urls = [\n", + " \"http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg\",\n", + " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", + " \"http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + " \"http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg\",\n", + "]\n", + "\n", + "def read_image(image_url):\n", + " \"\"\"Read image from url and return image_url, image bytes\"\"\"\n", + " response = requests.get(image_url)\n", + " image_bytes = io.BytesIO(response.content).read()\n", + " return image_url, image_bytes " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HLy7VKJhLrmT" + }, + "source": [ + "### Custom DoFn\n", + "\n", + "In order to implement remote inference, we must create our own DoFn class. This class will be responsible to send a batch of images to the Cloud vision API.\n", + "\n", + "The custom DoFn allows us to initialize our API, or in case of a custom model, a model can also be loaded in the `setup` function. \n", + "\n", + "The `process` function is the most interesting part. In this function we need to implement the actual model call and return its results.\n", + "\n", + "⚠️ When running remote inference, you must be prepared to encounter, identify, and handle failure as gracefully as possible. We recommend using the following techniques: \n", + "\n", + "* Exponential backoff: Retrying failed remote calls with exponentially growing pauses between retries. Using exponential backoff ensures that failures don't lead to an overwhelming number of retries in quick succession. 
\n", + "\n", + "* Dead letter queues: Routing failed inferences to a separate PCollection without failing the whole transform. This allows you to continue execution without failing the job (batch jobs' default behavior) or retrying indefinitely (streaming jobs' default behavior). You can then run custom pipeline logic on the deadletter queue to log the failure, alert, and push the failed message to temporary storage so that it can eventually be reprocessed. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LnaisJ_JiY_Q" + }, + "outputs": [], + "source": [ + "class RemoteBatchInference(beam.DoFn):\n", + " \"\"\"DoFn that accepts a batch of images as bytearray\n", + " and sends that batch to the Cloud vision API for remote inference.\"\"\"\n", + " def setup(self):\n", + " \"\"\"Init the Google Vision API client.\"\"\"\n", + " self._client = vision.ImageAnnotatorClient()\n", + " \n", + " def process(self, images_batch):\n", + " feature = Feature()\n", + " feature.type_ = Feature.Type.LABEL_DETECTION\n", + "\n", + " # list of image_urls\n", + " image_urls = [image_url for (image_url, image_bytes) in images_batch]\n", + "\n", + " # create a batch request for all images in the batch\n", + " images = [vision.Image(content=image_bytes) for (image_url, image_bytes) in images_batch]\n", + " image_requests = [vision.AnnotateImageRequest(image=image, features=[feature]) for image in images]\n", + " batch_image_request = vision.BatchAnnotateImagesRequest(requests=image_requests)\n", + "\n", + " # send batch request to remote endpoint\n", + " responses = self._client.batch_annotate_images(request=batch_image_request).responses\n", + " \n", + " return list(zip(image_urls, responses))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lHJuyHhvL0-a" + }, + "source": [ + "### Batching\n", + "\n", + "Before we can chain all the different steps together in a pipeline, there is one more thing we need to understand: batching. 
When running inference with your model (either in Beam itself or in an external API), you can batch your input together to allow for more efficient execution of your model. When using a custom DoFn, you need to take care of the batching yourself, in contrast with the RunInference API which takes care of this for you.\n", + "\n", + "To achieve this, we will introduce one more step in our pipeline: a `BatchElements` transform that will group elements together to form a batch of the desired size.\n", + "\n", + "⚠️ If you have a streaming pipeline, you may consider using [GroupIntoBatches](https://beam.apache.org/documentation/transforms/python/aggregation/groupintobatches/) as `BatchElements` doesn't batch things across bundles. `GroupIntoBatches` requires choosing a key within which things are batched.\n", + "\n", + "⚠️ When batching, make sure that the input batch matches the max payload of the external API. \n", + "\n", + "⚠️ If you are designing your own API endpoint, then make sure that it can handle batches. \n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4sXHwZk9Url2" + }, + "source": [ + "### Create pipeline\n", + "\n", + "Now we can chain the different steps all together to read data, transform it to fit the model input, run remote inference and finally process and display the results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LLg0OTvNkqo4", + "outputId": "7250b11d-a805-436a-990b-0a864404a536" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('http://farm3.staticflickr.com/2824/10213933686_6936eb402b_z.jpg', label_annotations {\n", + " mid: \"/m/083wq\"\n", + " description: \"Wheel\"\n", + " score: 0.9790800213813782\n", + " topicality: 0.9790800213813782\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h9mv\"\n", + " description: \"Tire\"\n", + " score: 0.9781236052513123\n", + " topicality: 0.9781236052513123\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/043g5f\"\n", + " description: \"Fuel tank\"\n", + " score: 0.9584090113639832\n", + " topicality: 0.9584090113639832\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/05s2s\"\n", + " description: \"Plant\"\n", + " score: 0.956047534942627\n", + " topicality: 0.956047534942627\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lk_j\"\n", + " description: \"Automotive fuel system\"\n", + " score: 0.9403533339500427\n", + " topicality: 0.9403533339500427\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/07yv9\"\n", + " description: \"Vehicle\"\n", + " score: 0.9362041354179382\n", + " topicality: 0.9362041354179382\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02qwkrn\"\n", + " description: \"Vehicle brake\"\n", + " score: 0.9050074815750122\n", + " topicality: 0.9050074815750122\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8pb3l\"\n", + " description: \"Automotive tire\"\n", + " score: 0.8968825936317444\n", + " topicality: 0.8968825936317444\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0768fx\"\n", + " description: \"Automotive lighting\"\n", + " score: 0.8944322466850281\n", + " topicality: 0.8944322466850281\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04tkfx\"\n", + " description: 
\"Tread\"\n", + " score: 0.878828227519989\n", + " topicality: 0.878828227519989\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.9682560563087463\n", + " topicality: 0.9682560563087463\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.9611372947692871\n", + " topicality: 0.9611372947692871\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.9328749775886536\n", + " topicality: 0.9328749775886536\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.9324912428855896\n", + " topicality: 0.9324912428855896\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.9191171526908875\n", + " topicality: 0.9191171526908875\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.8910166621208191\n", + " topicality: 0.8910166621208191\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.8799519538879395\n", + " topicality: 0.8799519538879395\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.8725592494010925\n", + " topicality: 0.8725592494010925\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.8603869080543518\n", + " topicality: 0.8603869080543518\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04y4h8h\"\n", + " description: \"Bathroom cabinet\"\n", + " score: 0.80011385679245\n", + " topicality: 0.80011385679245\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7003/6528937031_10e1ce0960_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image 
data.\"\n", + "}\n", + ")\n", + "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image data.\"\n", + "}\n", + ")\n", + "('http://farm6.staticflickr.com/5207/5304302785_7b5f763190_z.jpg', error {\n", + " code: 3\n", + " message: \"Bad image data.\"\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.9682560563087463\n", + " topicality: 0.9682560563087463\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.9611372947692871\n", + " topicality: 0.9611372947692871\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.9328749775886536\n", + " topicality: 0.9328749775886536\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.9324912428855896\n", + " topicality: 0.9324912428855896\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.9191171526908875\n", + " topicality: 0.9191171526908875\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.8910166621208191\n", + " topicality: 0.8910166621208191\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.8799519538879395\n", + " topicality: 0.8799519538879395\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.8725592494010925\n", + " topicality: 0.8725592494010925\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.8603869080543518\n", + " topicality: 0.8603869080543518\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04y4h8h\"\n", + " description: 
\"Bathroom cabinet\"\n", + " score: 0.80011385679245\n", + " topicality: 0.80011385679245\n", + "}\n", + ")\n", + "('http://farm8.staticflickr.com/7026/6388965173_92664a0d78_z.jpg', label_annotations {\n", + " mid: \"/m/054_l\"\n", + " description: \"Mirror\"\n", + " score: 0.9682560563087463\n", + " topicality: 0.9682560563087463\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02jz0l\"\n", + " description: \"Tap\"\n", + " score: 0.9611372947692871\n", + " topicality: 0.9611372947692871\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0130jx\"\n", + " description: \"Sink\"\n", + " score: 0.9328749775886536\n", + " topicality: 0.9328749775886536\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/0h8lr5r\"\n", + " description: \"Bathroom sink\"\n", + " score: 0.9324912428855896\n", + " topicality: 0.9324912428855896\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02pkr5\"\n", + " description: \"Plumbing fixture\"\n", + " score: 0.9191171526908875\n", + " topicality: 0.9191171526908875\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/02dgv\"\n", + " description: \"Door\"\n", + " score: 0.8910166621208191\n", + " topicality: 0.8910166621208191\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/09ggk\"\n", + " description: \"Purple\"\n", + " score: 0.8799519538879395\n", + " topicality: 0.8799519538879395\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/01j2bj\"\n", + " description: \"Bathroom\"\n", + " score: 0.8725592494010925\n", + " topicality: 0.8725592494010925\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04wnmd\"\n", + " description: \"Fixture\"\n", + " score: 0.8603869080543518\n", + " topicality: 0.8603869080543518\n", + "}\n", + "label_annotations {\n", + " mid: \"/m/04y4h8h\"\n", + " description: \"Bathroom cabinet\"\n", + " score: 0.80011385679245\n", + " topicality: 0.80011385679245\n", + "}\n", + ")\n" + ] + } + ], + "source": [ + "with beam.Pipeline() as pipeline:\n", + " _ = (pipeline | \"Create inputs\" >> 
beam.Create(image_urls)\n", + " | \"Read images\" >> beam.Map(read_image)\n", + " | \"Batch images\" >> beam.BatchElements(min_batch_size=2, max_batch_size=4)\n", + " | \"Inference\" >> beam.ParDo(RemoteBatchInference())\n", + " | \"Print image_url and annotation\" >> beam.Map(print)\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7gwn5bF1XaDm" + }, + "source": [ + "### Metrics\n", + "\n", + "You should consider monitoring and measuring performance of a pipeline when deploying since monitoring can provide insight into the status and health of the application. See [RunInference Metrics](https://beam.apache.org/documentation/ml/runinference-metrics/) for an example of the types of metrics you may want to consider tracking." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb b/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb new file mode 100644 index 000000000000..0dbd0e66ddf8 --- /dev/null +++ b/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb @@ -0,0 +1,3496 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "#@title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. 
You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License." + ], + "metadata": { + "id": "sARMhsXz8yR1", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Overview\n", + "\n", + "One of the most common tools used for data exploration and pre-processing is [pandas DataFrames](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). Pandas has become very popular for its ease of use. It has very intuitive methods to perform common analytical tasks and data pre-processing. \n", + "\n", + "Pandas loads all of the data into memory on a single machine (one node) for rapid execution. This works well when dealing with small-scale datasets. However, many projects involve datasets that can grow too big to fit in memory. These use cases generally require the usage of parallel data processing frameworks such as Apache Beam.\n", + "\n", + "\n", + "## Beam DataFrames\n", + "\n", + "\n", + "Beam DataFrames provide a pandas-like\n", + "API to declare and define Beam processing pipelines. 
It provides a familiar interface for machine learning practitioners to build complex data-processing pipelines by only invoking standard pandas commands.\n", + "\n", + "> ℹ️ To learn more about Beam DataFrames, take a look at the\n", + "[Beam DataFrames overview](https://beam.apache.org/documentation/dsls/dataframes/overview) page.\n", + "\n", + "## Goal\n", + "The goal of this notebook is to explore a dataset and preprocess it for machine learning model training using the Beam DataFrames API.\n", + "\n", + "\n", + "## Tutorial outline\n", + "\n", + "In this notebook, we walk through the use of the Beam DataFrames API to perform common data exploration as well as pre-processing steps that are necessary to prepare your dataset for machine learning model training and inference, such as: \n", + "\n", + "* Removing unwanted columns.\n", + "* One-hot encoding categorical columns.\n", + "* Normalizing numerical columns.\n", + "\n", + "\n" + ], + "metadata": { + "id": "iFZC1inKuUCy" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Installation\n", + "\n", + "As we want to explore the elements within a `PCollection`, we can make use of the Interactive runner by installing Apache Beam with the `interactive` component. 
The latest implemented DataFrames API methods invoked in this notebook are available in Beam 2.43 or later.\n" + ], + "metadata": { + "id": "A0f2HJ22D4lt" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pCjwrwNWnuqI" + }, + "source": [ + "Install latest version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-OJC0Xn5Um-C", + "beam:comment": "TODO(https://github.com/apache/issues/23961): Just install 2.43.0 once it's released, [`issue 23276`](https://github.com/apache/beam/issues/23276) is currently not implemented for Beam 2.42 (required fix for implementing `str.get_dummies()`" + }, + "outputs": [], + "source": [ + "!git clone https://github.com/apache/beam.git\n", + "\n", + "!cd beam/sdks/python && pip3 install -r build-requirements.txt \n", + "\n", + "%pip install -e beam/sdks/python/.[interactive,gcp]" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Part I : Local exploration with the Interactive Beam runner\n", + "We first use the [Interactive Beam](https://beam.apache.org/releases/pydoc/2.20.0/apache_beam.runners.interactive.interactive_beam.html) to explore and develop our pipeline.\n", + "This allows us to test our code interactively, building out the pipeline as we go before deploying it on a distributed runner. 
\n", + "\n", + "\n", + "> ℹ️ In this section, we will only be working with a subset of the original dataset since we're only using the compute resources of the notebook instance.\n" + ], + "metadata": { + "id": "3NO6RgB7GkkE" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5I3G094hoB1P" + }, + "source": [ + "# Loading the data\n", + "\n", + "Pandas has the\n", + "[`pandas.read_csv`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html)\n", + "function to easily read CSV files into DataFrames.\n", + "We're using the Beam\n", + "[`beam.dataframe.io.read_csv`](https://beam.apache.org/releases/pydoc/current/apache_beam.dataframe.io.html#apache_beam.dataframe.io.read_csv)\n", + "function that emulates `pandas.read_csv`. The main difference between them is that the Beam method returns a deferred Beam DataFrame while pandas returns a standard DataFrame.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X3_OB9cAULav" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import pandas as pd \n", + "import apache_beam as beam\n", + "import apache_beam.runners.interactive.interactive_beam as ib\n", + "from apache_beam import dataframe\n", + "from apache_beam.runners.interactive.interactive_runner import InteractiveRunner\n", + "from apache_beam.runners.dataflow import DataflowRunner\n", + "\n", + "# Available options: [sample_1000, sample_10000, sample_100000, full] where\n", + "# full contains all of the dataset (around 1000000 samples)\n", + "\n", + "source_csv_file = 'gs://apache-beam-samples/nasa_jpl_asteroid/sample_10000.csv'\n", + "\n", + "# Initialize pipeline\n", + "p = beam.Pipeline(InteractiveRunner())\n", + "\n", + "beam_df = p | beam.dataframe.io.read_csv(source_csv_file)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "paf7yf3YpCh8" + }, + "source": [ + "# Data pre-processing\n", + "\n", + "## Dataset description \n", + 
"\n", + "### [NASA - Nearest Earth Objects dataset](https://cneos.jpl.nasa.gov/ca/)\n", + "There are an innumerable number of objects in the outer space. Some of them are closer than we think. Even though we might think that a distance of 70,000 Km can not potentially harm us, but at an astronomical scale, this is a very small distance and can disrupt many natural phenomena. \n", + "\n", + "These objects/asteroids can thus prove to be harmful. Hence, it is wise to know what is surrounding us and what can harm us amongst those. Thus, this dataset compiles the list of NASA certified asteroids that are classified as the nearest earth object." + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "Let's first inspect the columns of our dataset and their types" + ], + "metadata": { + "id": "cvAu5T0ENjuQ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LwW77ixE-pjR", + "outputId": "3dfba30d-165e-46a6-b0b9-f12519db1c27" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "spk_id int64\n", + "full_name object\n", + "near_earth_object object\n", + "absolute_magnitude float64\n", + "diameter float64\n", + "albedo float64\n", + "diameter_sigma float64\n", + "eccentricity float64\n", + "inclination float64\n", + "moid_ld float64\n", + "object_class object\n", + "semi_major_axis_au_unit float64\n", + "hazardous_flag object\n", + "dtype: object" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ], + "source": [ + "beam_df.dtypes" + ] + }, + { + "cell_type": "markdown", + "source": [ + "When using Interactive Beam, we can use `ib.collect()` to bring a Beam DataFrame into local memory as a Pandas DataFrame." 
+ ], + "metadata": { + "id": "1Wa6fpbyQige" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 746 + }, + "id": "DPxkAmkpq4Xv", + "outputId": "3f89126d-f6fb-43fc-d87b-5daf8563e057" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_79206f341d7de09f6cacdd05be309575\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_79206f341d7de09f6cacdd05be309575\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " spk_id full_name near_earth_object \\\n", + "0 2000001 1 Ceres N \n", + "1 2000002 2 Pallas N \n", + "2 2000003 3 Juno N \n", + "3 2000004 4 Vesta N \n", + "4 2000005 5 Astraea N \n", + "... ... ... ... 
\n", + "9994 2009995 9995 Alouette (4805 P-L) N \n", + "9995 2009996 9996 ANS (9070 P-L) N \n", + "9996 2009997 9997 COBE (1217 T-1) N \n", + "9997 2009998 9998 ISO (1293 T-1) N \n", + "9998 2009999 9999 Wiles (4196 T-2) N \n", + "\n", + " absolute_magnitude diameter albedo diameter_sigma eccentricity \\\n", + "0 3.40 939.400 0.0900 0.200 0.076009 \n", + "1 4.20 545.000 0.1010 18.000 0.229972 \n", + "2 5.33 246.596 0.2140 10.594 0.256936 \n", + "3 3.00 525.400 0.4228 0.200 0.088721 \n", + "4 6.90 106.699 0.2740 3.140 0.190913 \n", + "... ... ... ... ... ... \n", + "9994 15.10 2.564 0.2450 0.550 0.160610 \n", + "9995 13.60 8.978 0.1130 0.376 0.235174 \n", + "9996 14.30 NaN NaN NaN 0.113059 \n", + "9997 15.10 2.235 0.3880 0.373 0.093852 \n", + "9998 13.00 7.148 0.2620 0.065 0.071351 \n", + "\n", + " inclination moid_ld object_class semi_major_axis_au_unit \\\n", + "0 10.594067 620.640533 MBA 2.769165 \n", + "1 34.832932 480.348639 MBA 2.773841 \n", + "2 12.991043 402.514639 MBA 2.668285 \n", + "3 7.141771 443.451432 MBA 2.361418 \n", + "4 5.367427 426.433027 MBA 2.574037 \n", + "... ... ... ... ... \n", + "9994 2.311731 388.723233 MBA 2.390249 \n", + "9995 7.657713 444.194746 MBA 2.796605 \n", + "9996 2.459643 495.460110 MBA 2.545674 \n", + "9997 3.912263 373.848377 MBA 2.160961 \n", + "9998 3.198839 632.144398 MBA 2.839917 \n", + "\n", + " hazardous_flag \n", + "0 N \n", + "1 N \n", + "2 N \n", + "3 N \n", + "4 N \n", + "... ... \n", + "9994 N \n", + "9995 N \n", + "9996 N \n", + "9997 N \n", + "9998 N \n", + "\n", + "[9999 rows x 13 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spk_idfull_namenear_earth_objectabsolute_magnitudediameteralbedodiameter_sigmaeccentricityinclinationmoid_ldobject_classsemi_major_axis_au_unithazardous_flag
020000011 CeresN3.40939.4000.09000.2000.07600910.594067620.640533MBA2.769165N
120000022 PallasN4.20545.0000.101018.0000.22997234.832932480.348639MBA2.773841N
220000033 JunoN5.33246.5960.214010.5940.25693612.991043402.514639MBA2.668285N
320000044 VestaN3.00525.4000.42280.2000.0887217.141771443.451432MBA2.361418N
420000055 AstraeaN6.90106.6990.27403.1400.1909135.367427426.433027MBA2.574037N
..........................................
999420099959995 Alouette (4805 P-L)N15.102.5640.24500.5500.1606102.311731388.723233MBA2.390249N
999520099969996 ANS (9070 P-L)N13.608.9780.11300.3760.2351747.657713444.194746MBA2.796605N
999620099979997 COBE (1217 T-1)N14.30NaNNaNNaN0.1130592.459643495.460110MBA2.545674N
999720099989998 ISO (1293 T-1)N15.102.2350.38800.3730.0938523.912263373.848377MBA2.160961N
999820099999999 Wiles (4196 T-2)N13.007.1480.26200.0650.0713513.198839632.144398MBA2.839917N
\n", + "

9999 rows × 13 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 28 + } + ], + "source": [ + "ib.collect(beam_df)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "We can see that our dataset consists of both:\n", + "\n", + "* **Numerical columns:** These columns need to be transformed through [normalization](https://developers.google.com/machine-learning/data-prep/transform/normalization) before they can be used for training a machine learning model.\n", + "\n", + "* **Categorical columns:** We need to transform those columns with [one-hot encoding](https://developers.google.com/machine-learning/data-prep/transform/transform-categorical) to use them during training. \n" + ], + "metadata": { + "id": "8jV9odKhNyF2" + } + }, + { + "cell_type": "markdown", + "source": [ + "We can also use the standard pandas command `DataFrame.describe()` to generate descriptive statistics for the numerical columns like percentile, mean, std, etc. " + ], + "metadata": { + "id": "MGAErO0lAYws" + } + }, + { + "cell_type": "code", + "source": [ + "with dataframe.allow_non_parallel_operations():\n", + " beam_df_description = ib.collect(beam_df.describe())\n", + "\n", + "beam_df_description" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 378 + }, + "id": "Befv697VBGM7", + "outputId": "bb465020-94e4-4b3c-fda6-6e43da199be1" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_98687cb0060a8077a8abab6e464e4a75\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_98687cb0060a8077a8abab6e464e4a75\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " spk_id absolute_magnitude diameter albedo \\\n", + "count 9.999000e+03 9999.000000 8688.000000 8672.000000 \n", + "mean 2.005000e+06 12.675380 19.245446 0.197723 \n", + "std 2.886607e+03 1.639609 30.190191 0.138819 \n", + "min 2.000001e+06 3.000000 0.300000 0.008000 \n", + "25% 2.002500e+06 11.900000 5.614000 0.074000 \n", + "50% 2.005000e+06 12.900000 9.814000 0.187000 \n", + "75% 2.007500e+06 13.700000 19.156750 0.283000 \n", + "max 2.009999e+06 20.700000 939.400000 1.000000 \n", + "\n", + " diameter_sigma eccentricity inclination moid_ld \\\n", + "count 8591.000000 9999.000000 9999.000000 9999.000000 \n", + "mean 0.454072 0.148716 7.890742 509.805237 \n", 
+ "std 1.093676 0.083803 6.336244 205.046582 \n", + "min 0.006000 0.001003 0.042716 0.131028 \n", + "25% 0.120000 0.093780 3.220137 377.829197 \n", + "50% 0.201000 0.140335 6.018836 470.650523 \n", + "75% 0.375000 0.187092 10.918176 636.010802 \n", + "max 39.297000 0.889831 68.018875 4241.524913 \n", + "\n", + " semi_major_axis_au_unit \n", + "count 9999.000000 \n", + "mean 2.689836 \n", + "std 0.607190 \n", + "min 0.832048 \n", + "25% 2.340816 \n", + "50% 2.614468 \n", + "75% 3.005449 \n", + "max 24.667968 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spk_idabsolute_magnitudediameteralbedodiameter_sigmaeccentricityinclinationmoid_ldsemi_major_axis_au_unit
count9.999000e+039999.0000008688.0000008672.0000008591.0000009999.0000009999.0000009999.0000009999.000000
mean2.005000e+0612.67538019.2454460.1977230.4540720.1487167.890742509.8052372.689836
std2.886607e+031.63960930.1901910.1388191.0936760.0838036.336244205.0465820.607190
min2.000001e+063.0000000.3000000.0080000.0060000.0010030.0427160.1310280.832048
25%2.002500e+0611.9000005.6140000.0740000.1200000.0937803.220137377.8291972.340816
50%2.005000e+0612.9000009.8140000.1870000.2010000.1403356.018836470.6505232.614468
75%2.007500e+0613.70000019.1567500.2830000.3750000.18709210.918176636.0108023.005449
max2.009999e+0620.700000939.4000001.00000039.2970000.88983168.0188754241.52491324.667968
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D9uJtHLSSAMC" + }, + "source": [ + "Before executing any transformations, we need to check if all the columns need to be used for model training. Let's first have a look at the column description as provided by the [JPL website](https://ssd.jpl.nasa.gov/sbdb_query.cgi):\n", + "\n", + "* **spk_id:** Object primary SPK-ID\n", + "* **full_name:** Asteroid name\n", + "* **near_earth_object:** Near-earth object flag\n", + "* **absolute_magnitude:** the apparent magnitude an object would have if it were located at a distance of 10 parsecs.\n", + "* **diameter:** object diameter (from equivalent sphere) km Unit\n", + "* **albedo:** a measure of the diffuse reflection of solar radiation out of the total solar radiation and measured on a scale from 0 to 1.\n", + "* **diameter_sigma:** 1-sigma uncertainty in object diameter km Unit.\n", + "* **eccentricity:** value between 0 and 1 that refers to how flat or round the asteroid's orbit is\n", + "* **inclination:** angle with respect to x-y ecliptic plane\n", + "* **moid_ld:** Earth Minimum Orbit Intersection Distance in lunar distances (LD)\n", + "* **object_class:** the classification of the asteroid. Check out this [link](https://pdssbn.astro.umd.edu/data_other/objclass.shtml) for a more detailed description.\n", + "* **semi_major_axis_au_unit:** the length of half of the long axis in AU unit\n", + "* **hazardous_flag:** Hazardous Asteroid Flag" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DzYVKbwTp72d" + }, + "source": [ + "Columns **'spk_id'** and **'full_name'** are unique for each row. These columns can be removed since they are not needed for model training."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "piRPwH2aqT06" + }, + "outputs": [], + "source": [ + "beam_df = beam_df.drop(['spk_id', 'full_name'], axis='columns', inplace=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fRvNyahSuX_y" + }, + "source": [ + "Let's have a look at the number of missing values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 358 + }, + "id": "A2PLchW8vXvt", + "outputId": "14a4ac64-5b54-4ed4-959d-daea65bb6457" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/content/beam/sdks/python/apache_beam/dataframe/frame_base.py:145: RuntimeWarning: invalid value encountered in long_scalars\n", + " lambda left, right: getattr(left, op)(right), name=op, args=[other])\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_868f8ad001ab00c7013b65472a513917\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_868f8ad001ab00c7013b65472a513917\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "near_earth_object 0.000000\n", + "absolute_magnitude 0.000000\n", + "diameter 13.111311\n", + "albedo 13.271327\n", + "diameter_sigma 14.081408\n", + "eccentricity 0.000000\n", + "inclination 0.000000\n", + "moid_ld 0.000000\n", + "object_class 0.000000\n", + "semi_major_axis_au_unit 0.000000\n", + "hazardous_flag 0.000000\n", + "dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ], + "source": [ + "ib.collect(beam_df.isnull().mean() * 100)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "00MRdFGLwQiD" + }, + "source": [ + "It can be observed that most of the columns do not have missing values. 
However, columns **'diameter'**, **'albedo'** and **'diameter_sigma'** have many missing values. Because these missing values cannot be measured or derived, we remove these columns; they will not be used for training the machine learning model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tHYeCHREwvyB", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 538 + }, + "outputId": "3be686d0-f56a-4054-a71a-d3019bf379e8" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_f88b77f183371d1a45fa87bed4a545f6\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_f88b77f183371d1a45fa87bed4a545f6\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " near_earth_object absolute_magnitude eccentricity inclination \\\n", + "0 N 3.40 0.076009 10.594067 \n", + "1 N 4.20 0.229972 34.832932 \n", + "2 N 5.33 0.256936 12.991043 \n", + "3 N 3.00 0.088721 7.141771 \n", + "4 N 6.90 0.190913 5.367427 \n", + "... ... ... ... ... 
\n", + "9994 N 15.10 0.160610 2.311731 \n", + "9995 N 13.60 0.235174 7.657713 \n", + "9996 N 14.30 0.113059 2.459643 \n", + "9997 N 15.10 0.093852 3.912263 \n", + "9998 N 13.00 0.071351 3.198839 \n", + "\n", + " moid_ld object_class semi_major_axis_au_unit hazardous_flag \n", + "0 620.640533 MBA 2.769165 N \n", + "1 480.348639 MBA 2.773841 N \n", + "2 402.514639 MBA 2.668285 N \n", + "3 443.451432 MBA 2.361418 N \n", + "4 426.433027 MBA 2.574037 N \n", + "... ... ... ... ... \n", + "9994 388.723233 MBA 2.390249 N \n", + "9995 444.194746 MBA 2.796605 N \n", + "9996 495.460110 MBA 2.545674 N \n", + "9997 373.848377 MBA 2.160961 N \n", + "9998 632.144398 MBA 2.839917 N \n", + "\n", + "[9999 rows x 8 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
near_earth_objectabsolute_magnitudeeccentricityinclinationmoid_ldobject_classsemi_major_axis_au_unithazardous_flag
0N3.400.07600910.594067620.640533MBA2.769165N
1N4.200.22997234.832932480.348639MBA2.773841N
2N5.330.25693612.991043402.514639MBA2.668285N
3N3.000.0887217.141771443.451432MBA2.361418N
4N6.900.1909135.367427426.433027MBA2.574037N
...........................
9994N15.100.1606102.311731388.723233MBA2.390249N
9995N13.600.2351747.657713444.194746MBA2.796605N
9996N14.300.1130592.459643495.460110MBA2.545674N
9997N15.100.0938523.912263373.848377MBA2.160961N
9998N13.000.0713513.198839632.144398MBA2.839917N
\n", + "

9999 rows × 8 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 31 + } + ], + "source": [ + "beam_df = beam_df.drop(['diameter', 'albedo', 'diameter_sigma'], axis='columns', inplace=False)\n", + "ib.collect(beam_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a3PojL3WBqgE" + }, + "source": [ + "Next, we need to normalize the numerical columns before using them to train a model. A common method of standardization is to subtract the mean and divide by standard deviation (a.k.a. [z-score](https://developers.google.com/machine-learning/data-prep/transform/normalization#z-score)). This improves the performance and training stability of the model during training and inference.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sZ2_gB8wENF1" + }, + "source": [ + "Let's first get both the numerical columns and categorical columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vsWY8xW5d_Wn" + }, + "outputs": [], + "source": [ + "numerical_cols = beam_df.select_dtypes(include=np.number).columns.tolist()\n", + "categorical_cols = list(set(beam_df.columns) - set(numerical_cols))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v03ABuXJKEmv" + }, + "source": [ + "Normalizing the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 587 + }, + "id": "PD_DTxPCP4hs", + "outputId": "16fede03-f67e-4c26-8714-fd3fc6892109" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/content/beam/sdks/python/apache_beam/dataframe/frame_base.py:145: RuntimeWarning: invalid value encountered in double_scalars\n", + " lambda left, right: getattr(left, op)(right), name=op, args=[other])\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_55302fa5950ce6ceb9f99ff9a168097a\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_55302fa5950ce6ceb9f99ff9a168097a\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " absolute_magnitude eccentricity inclination moid_ld \\\n", + "306 -1.570727 -0.062543 -0.278518 0.373194 \n", + "310 -1.631718 -1.724526 -0.736389 1.087833 \n", + "546 -1.753698 1.028793 1.415303 -0.339489 \n", + "635 -1.875678 0.244869 0.005905 0.214107 \n", + "701 -3.278451 -1.570523 2.006145 1.542754 \n", + "... ... ... ... ... 
\n", + "9697 0.807888 -1.151809 -0.082944 -0.129556 \n", + "9813 1.722740 0.844551 -0.583247 -1.006447 \n", + "9868 0.807888 -0.207399 -0.784665 -0.462136 \n", + "9903 0.868878 0.460086 0.092258 -0.107597 \n", + "9956 0.746898 -0.234132 -0.161116 -0.601379 \n", + "\n", + " semi_major_axis_au_unit \n", + "306 0.357201 \n", + "310 0.344233 \n", + "546 0.139080 \n", + "635 0.367559 \n", + "701 0.829337 \n", + "... ... \n", + "9697 -0.533538 \n", + "9813 -0.677961 \n", + "9868 -0.539794 \n", + "9903 0.071794 \n", + "9956 -0.664887 \n", + "\n", + "[9999 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
absolute_magnitudeeccentricityinclinationmoid_ldsemi_major_axis_au_unit
306-1.570727-0.062543-0.2785180.3731940.357201
310-1.631718-1.724526-0.7363891.0878330.344233
546-1.7536981.0287931.415303-0.3394890.139080
635-1.8756780.2448690.0059050.2141070.367559
701-3.278451-1.5705232.0061451.5427540.829337
..................
96970.807888-1.151809-0.082944-0.129556-0.533538
98131.7227400.844551-0.583247-1.006447-0.677961
98680.807888-0.207399-0.784665-0.462136-0.539794
99030.8688780.4600860.092258-0.1075970.071794
99560.746898-0.234132-0.161116-0.601379-0.664887
\n", + "

9999 rows × 5 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 33 + } + ], + "source": [ + "# Get numerical columns\n", + "beam_df_numericals = beam_df.filter(items=numerical_cols)\n", + "\n", + "# Standarize dataframes only with numerical columns\n", + "beam_df_numericals = (beam_df_numericals - beam_df_numericals.mean())/beam_df_numericals.std()\n", + "\n", + "ib.collect(beam_df_numericals)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qdNILsajFvex" + }, + "source": [ + "Next, we need to convert the categorical columns into one-hot encoded variables to use them during training. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ngoxg0rSywVd" + }, + "outputs": [], + "source": [ + "def get_one_hot_encoding(df: pd.DataFrame, categorical_col:list) -> pd.DataFrame:\n", + " beam_df_categorical= beam_df[categorical_col]\n", + " # Get unique values\n", + " with dataframe.allow_non_parallel_operations():\n", + " unique_classes = pd.CategoricalDtype(ib.collect(beam_df_categorical.unique(as_series=True)))\n", + " # Use `str.get_dummies()` to get the one-hot encoded representation of the categorical columns\n", + " beam_df_categorical = beam_df_categorical.astype(unique_classes).str.get_dummies()\n", + " # Add column name prefix to the newly created categorical columns\n", + " beam_df_categorical = beam_df_categorical.add_prefix(f'{categorical_col}_')\n", + "\n", + " return beam_df_categorical" + ] + }, + { + "cell_type": "code", + "source": [ + "for categorical_col in categorical_cols:\n", + " beam_df_categorical = get_one_hot_encoding(df=beam_df, categorical_col=categorical_col)\n", + " beam_df_numericals = beam_df_numericals.merge(beam_df_categorical, left_index = True, right_index = True)\n", + "ib.collect(beam_df_numericals)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 602 + }, + "id": "k9rvtWqHf6Qw", + "outputId": "b8d8ae57-6dba-45b4-e7ae-e4b14084eede" + }, + 
"execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6b2563c7f661bc0fc5729c2577d6f232\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6b2563c7f661bc0fc5729c2577d6f232\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6fa896083b128ad99059af69a3d7fc7e\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6fa896083b128ad99059af69a3d7fc7e\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6339347de9805da541eba53abaee2d5e\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_6339347de9805da541eba53abaee2d5e\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_1af5b908898a1e5949dcc20549f650eb\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_1af5b908898a1e5949dcc20549f650eb\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " absolute_magnitude eccentricity inclination moid_ld \\\n", + "0 -5.657067 -0.867596 0.426645 0.540537 \n", + "12 -3.583402 -0.756931 1.364340 0.238610 \n", + "47 -3.400432 -0.912290 -0.211925 1.136060 \n", + "381 -2.363599 0.271412 -0.078826 0.535299 \n", + "515 -2.729540 1.469775 0.799915 -0.602881 \n", + "... ... ... ... ... 
\n", + "9146 0.563927 -0.508757 -0.327512 -0.637391 \n", + "9657 1.478779 0.487849 -0.637779 -0.648240 \n", + "9704 0.380957 -0.238383 0.443053 0.670490 \n", + "9879 1.295809 -0.442966 -0.698505 -0.494818 \n", + "9980 0.746898 -1.455992 -0.849144 0.592902 \n", + "\n", + " semi_major_axis_au_unit near_earth_object_N near_earth_object_Y \\\n", + "0 0.130649 1 0 \n", + "12 -0.187375 1 0 \n", + "47 0.691182 1 0 \n", + "381 0.712755 1 0 \n", + "515 -0.014654 1 0 \n", + "... ... ... ... \n", + "9146 -0.820638 1 0 \n", + "9657 -0.468778 1 0 \n", + "9704 0.587128 1 0 \n", + "9879 -0.662602 1 0 \n", + "9980 -0.022726 1 0 \n", + "\n", + " near_earth_object_nan object_class_AMO object_class_APO ... \\\n", + "0 0 0 0 ... \n", + "12 0 0 0 ... \n", + "47 0 0 0 ... \n", + "381 0 0 0 ... \n", + "515 0 0 0 ... \n", + "... ... ... ... ... \n", + "9146 0 0 0 ... \n", + "9657 0 0 0 ... \n", + "9704 0 0 0 ... \n", + "9879 0 0 0 ... \n", + "9980 0 0 0 ... \n", + "\n", + " object_class_CEN object_class_IMB object_class_MBA object_class_MCA \\\n", + "0 0 0 1 0 \n", + "12 0 0 1 0 \n", + "47 0 0 1 0 \n", + "381 0 0 1 0 \n", + "515 0 0 1 0 \n", + "... ... ... ... ... \n", + "9146 0 0 1 0 \n", + "9657 0 0 1 0 \n", + "9704 0 0 1 0 \n", + "9879 0 0 1 0 \n", + "9980 0 0 1 0 \n", + "\n", + " object_class_OMB object_class_TJN object_class_nan hazardous_flag_N \\\n", + "0 0 0 0 1 \n", + "12 0 0 0 1 \n", + "47 0 0 0 1 \n", + "381 0 0 0 1 \n", + "515 0 0 0 1 \n", + "... ... ... ... ... \n", + "9146 0 0 0 1 \n", + "9657 0 0 0 1 \n", + "9704 0 0 0 1 \n", + "9879 0 0 0 1 \n", + "9980 0 0 0 1 \n", + "\n", + " hazardous_flag_Y hazardous_flag_nan \n", + "0 0 0 \n", + "12 0 0 \n", + "47 0 0 \n", + "381 0 0 \n", + "515 0 0 \n", + "... ... ... \n", + "9146 0 0 \n", + "9657 0 0 \n", + "9704 0 0 \n", + "9879 0 0 \n", + "9980 0 0 \n", + "\n", + "[9999 rows x 22 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
absolute_magnitudeeccentricityinclinationmoid_ldsemi_major_axis_au_unitnear_earth_object_Nnear_earth_object_Ynear_earth_object_nanobject_class_AMOobject_class_APO...object_class_CENobject_class_IMBobject_class_MBAobject_class_MCAobject_class_OMBobject_class_TJNobject_class_nanhazardous_flag_Nhazardous_flag_Yhazardous_flag_nan
0-5.657067-0.8675960.4266450.5405370.13064910000...0010000100
12-3.583402-0.7569311.3643400.238610-0.18737510000...0010000100
47-3.400432-0.912290-0.2119251.1360600.69118210000...0010000100
381-2.3635990.271412-0.0788260.5352990.71275510000...0010000100
515-2.7295401.4697750.799915-0.602881-0.01465410000...0010000100
..................................................................
91460.563927-0.508757-0.327512-0.637391-0.82063810000...0010000100
96571.4787790.487849-0.637779-0.648240-0.46877810000...0010000100
97040.380957-0.2383830.4430530.6704900.58712810000...0010000100
98791.295809-0.442966-0.698505-0.494818-0.66260210000...0010000100
99800.746898-1.455992-0.8491440.592902-0.02272610000...0010000100
\n", + "

9999 rows × 22 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rVdSIyCB0spw" + }, + "source": [ + "# Putting it all together\n", + "\n", + "Let's now try to summarize all the steps that we've executed above into a full pipeline implementation and visualize our pre-processed data.\n", + "\n", + "> ℹ️ Note that the only standard Beam method invoked here is the `pipeline` instance. The rest of the pre-processing commands are all based on native pandas methods that have been integrated with the Beam DataFrame API. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ndaSNond0v8Q", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 651 + }, + "outputId": "b265e915-e649-44e4-a31a-95ac85c0ebf6" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/content/beam/sdks/python/apache_beam/dataframe/frame_base.py:145: RuntimeWarning: invalid value encountered in double_scalars\n", + " lambda left, right: getattr(left, op)(right), name=op, args=[other])\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_cb06c945824aa1bb68aa31ad7e601b74\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_cb06c945824aa1bb68aa31ad7e601b74\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_fb923f80fecb72b4fa55e5cfdba16d23\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_fb923f80fecb72b4fa55e5cfdba16d23\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_3f4b1a0f483cd017e004e11816a91d3b\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_3f4b1a0f483cd017e004e11816a91d3b\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + " Processing... collect\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "application/javascript": [ + "\n", + " if (typeof window.interactive_beam_jquery == 'undefined') {\n", + " var jqueryScript = document.createElement('script');\n", + " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", + " jqueryScript.type = 'text/javascript';\n", + " jqueryScript.onload = function() {\n", + " var datatableScript = document.createElement('script');\n", + " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", + " datatableScript.type = 'text/javascript';\n", + " datatableScript.onload = function() {\n", + " window.interactive_beam_jquery = jQuery.noConflict(true);\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_fce8902eccbfaa17e32ba0c7c242ccec\").remove();\n", + " });\n", + " }\n", + " document.head.appendChild(datatableScript);\n", + " };\n", + " document.head.appendChild(jqueryScript);\n", + " } else {\n", + " window.interactive_beam_jquery(document).ready(function($){\n", + " \n", + " $(\"#progress_indicator_fce8902eccbfaa17e32ba0c7c242ccec\").remove();\n", + " });\n", + " }" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " absolute_magnitude eccentricity inclination moid_ld \\\n", + "0 -5.657067 -0.867596 0.426645 0.540537 \n", + "12 -3.583402 -0.756931 1.364340 0.238610 \n", + "47 -3.400432 -0.912290 -0.211925 1.136060 \n", + "381 -2.363599 0.271412 -0.078826 0.535299 \n", + "515 -2.729540 1.469775 0.799915 -0.602881 \n", + "... ... ... ... ... 
\n", + "9146 0.563927 -0.508757 -0.327512 -0.637391 \n", + "9657 1.478779 0.487849 -0.637779 -0.648240 \n", + "9704 0.380957 -0.238383 0.443053 0.670490 \n", + "9879 1.295809 -0.442966 -0.698505 -0.494818 \n", + "9980 0.746898 -1.455992 -0.849144 0.592902 \n", + "\n", + " semi_major_axis_au_unit near_earth_object_N near_earth_object_Y \\\n", + "0 0.130649 1 0 \n", + "12 -0.187375 1 0 \n", + "47 0.691182 1 0 \n", + "381 0.712755 1 0 \n", + "515 -0.014654 1 0 \n", + "... ... ... ... \n", + "9146 -0.820638 1 0 \n", + "9657 -0.468778 1 0 \n", + "9704 0.587128 1 0 \n", + "9879 -0.662602 1 0 \n", + "9980 -0.022726 1 0 \n", + "\n", + " near_earth_object_nan object_class_AMO object_class_APO ... \\\n", + "0 0 0 0 ... \n", + "12 0 0 0 ... \n", + "47 0 0 0 ... \n", + "381 0 0 0 ... \n", + "515 0 0 0 ... \n", + "... ... ... ... ... \n", + "9146 0 0 0 ... \n", + "9657 0 0 0 ... \n", + "9704 0 0 0 ... \n", + "9879 0 0 0 ... \n", + "9980 0 0 0 ... \n", + "\n", + " object_class_CEN object_class_IMB object_class_MBA object_class_MCA \\\n", + "0 0 0 1 0 \n", + "12 0 0 1 0 \n", + "47 0 0 1 0 \n", + "381 0 0 1 0 \n", + "515 0 0 1 0 \n", + "... ... ... ... ... \n", + "9146 0 0 1 0 \n", + "9657 0 0 1 0 \n", + "9704 0 0 1 0 \n", + "9879 0 0 1 0 \n", + "9980 0 0 1 0 \n", + "\n", + " object_class_OMB object_class_TJN object_class_nan hazardous_flag_N \\\n", + "0 0 0 0 1 \n", + "12 0 0 0 1 \n", + "47 0 0 0 1 \n", + "381 0 0 0 1 \n", + "515 0 0 0 1 \n", + "... ... ... ... ... \n", + "9146 0 0 0 1 \n", + "9657 0 0 0 1 \n", + "9704 0 0 0 1 \n", + "9879 0 0 0 1 \n", + "9980 0 0 0 1 \n", + "\n", + " hazardous_flag_Y hazardous_flag_nan \n", + "0 0 0 \n", + "12 0 0 \n", + "47 0 0 \n", + "381 0 0 \n", + "515 0 0 \n", + "... ... ... \n", + "9146 0 0 \n", + "9657 0 0 \n", + "9704 0 0 \n", + "9879 0 0 \n", + "9980 0 0 \n", + "\n", + "[9999 rows x 22 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
absolute_magnitudeeccentricityinclinationmoid_ldsemi_major_axis_au_unitnear_earth_object_Nnear_earth_object_Ynear_earth_object_nanobject_class_AMOobject_class_APO...object_class_CENobject_class_IMBobject_class_MBAobject_class_MCAobject_class_OMBobject_class_TJNobject_class_nanhazardous_flag_Nhazardous_flag_Yhazardous_flag_nan
0-5.657067-0.8675960.4266450.5405370.13064910000...0010000100
12-3.583402-0.7569311.3643400.238610-0.18737510000...0010000100
47-3.400432-0.912290-0.2119251.1360600.69118210000...0010000100
381-2.3635990.271412-0.0788260.5352990.71275510000...0010000100
515-2.7295401.4697750.799915-0.602881-0.01465410000...0010000100
..................................................................
91460.563927-0.508757-0.327512-0.637391-0.82063810000...0010000100
96571.4787790.487849-0.637779-0.648240-0.46877810000...0010000100
97040.380957-0.2383830.4430530.6704900.58712810000...0010000100
98791.295809-0.442966-0.698505-0.494818-0.66260210000...0010000100
99800.746898-1.455992-0.8491440.592902-0.02272610000...0010000100
\n", + "

9999 rows × 22 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 36 + } + ], + "source": [ + "# Specify the location of source csv file to be processed\n", + "source_csv_file = 'gs://apache-beam-samples/nasa_jpl_asteroid/sample_10000.csv'\n", + "\n", + "# Initialize pipeline\n", + "p = beam.Pipeline(InteractiveRunner())\n", + "\n", + "# Create a deferred Beam DataFrame with the contents of our csv file.\n", + "beam_df = p | beam.dataframe.io.read_csv(source_csv_file)\n", + "\n", + "# Drop irrelevant columns/columns with missing values\n", + "beam_df = beam_df.drop(['spk_id', 'full_name','diameter', 'albedo', 'diameter_sigma'], axis='columns', inplace=False)\n", + "\n", + "# Get numerical columns/columns with categorical variables\n", + "numerical_cols = beam_df.select_dtypes(include=np.number).columns.tolist()\n", + "categorical_cols = list(set(beam_df.columns) - set(numerical_cols))\n", + "\n", + "# Normalize the numerical variables \n", + "beam_df_numericals = beam_df.filter(items=numerical_cols)\n", + "beam_df_numericals = (beam_df_numericals - beam_df_numericals.mean())/beam_df_numericals.std()\n", + "\n", + "\n", + "# One-hot encode the categorical variables \n", + "for categorical_col in categorical_cols:\n", + " beam_df_categorical= get_one_hot_encoding(df=beam_df, categorical_col=categorical_col)\n", + " beam_df_numericals = beam_df_numericals.merge(beam_df_categorical, left_index = True, right_index = True)\n", + "\n", + "ib.collect(beam_df_numericals)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xZvJTqa3XKI_" + }, + "source": [ + "# Part II : Process the full dataset with the Distributed Runner\n", + "Now that we've showcased how to build and execute the pipeline locally using the Interactive Runner, it's time to execute our pipeline on our full dataset by switching to a distributed runner. For this example, we will execute our pipeline on [Dataflow](https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline)."
+ ] + }, + { + "cell_type": "code", + "source": [ + "PROJECT_ID = \"\"\n", + "REGION = \"us-central1\"\n", + "TEMP_DIR = \"gs:///tmp\"\n", + "OUTPUT_DIR = \"gs:///dataframe-result\"" + ], + "metadata": { + "id": "dDBYbMEWbL4t" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "> ℹ️ Note that we are now processing the full dataset `full.csv` that contains approximately 1 million rows. We're also writing the results to a `csv` file instead of using `ib.collect()` to materialize the deferred dataframe.\n", + "\n", + "> ℹ️ The only things we need to change to switch from an interactive runner to a distributed one are the pipeline options. The rest of the pipeline steps are identical." + ], + "metadata": { + "id": "Qk1GaYoSc9-1" + } + }, + { + "cell_type": "code", + "source": [ + "# Specify the location of source csv file to be processed (full dataset)\n", + "source_csv_file = 'gs://apache-beam-samples/nasa_jpl_asteroid/full.csv'\n", + "\n", + "# Build a new pipeline that will execute on Dataflow.\n", + "p = beam.Pipeline(DataflowRunner(),\n", + " options=beam.options.pipeline_options.PipelineOptions(\n", + " project=PROJECT_ID,\n", + " region=REGION,\n", + " temp_location=TEMP_DIR,\n", + " # Disable autoscaling for a quicker demo\n", + " autoscaling_algorithm='NONE',\n", + " num_workers=10))\n", + "\n", + "# Create a deferred Beam DataFrame with the contents of our csv file.\n", + "beam_df = p | beam.dataframe.io.read_csv(source_csv_file)\n", + "\n", + "# Drop irrelevant columns/columns with missing values\n", + "beam_df = beam_df.drop(['spk_id', 'full_name','diameter', 'albedo', 'diameter_sigma'], axis='columns', inplace=False)\n", + "\n", + "# Get numerical columns/columns with categorical variables\n", + "numerical_cols = beam_df.select_dtypes(include=np.number).columns.tolist()\n", + "categorical_cols = list(set(beam_df.columns) - set(numerical_cols))\n", + "\n", + "# Normalize the numerical variables
\n", + "beam_df_numericals = beam_df.filter(items=numerical_cols)\n", + "beam_df_numericals = (beam_df_numericals - beam_df_numericals.mean())/beam_df_numericals.std()\n", + "\n", + "\n", + "# One-hot encode the categorical variables \n", + "for categorical_col in categorical_cols:\n", + " beam_df_categorical= get_one_hot_encoding(df=beam_df, categorical_col=categorical_col)\n", + " beam_df_numericals = beam_df_numericals.merge(beam_df_categorical, left_index = True, right_index = True)\n", + "\n", + "# Write the pre-processed dataset to csv\n", + "beam_df_numericals.to_csv(os.path.join(OUTPUT_DIR, \"preprocessed_data.csv\"))" + ], + "metadata": { + "id": "1XovR0gKbMlK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Let's now submit and execute our pipeline." + ], + "metadata": { + "id": "a789u4Yecs_g" + } + }, + { + "cell_type": "code", + "source": [ + "p.run().wait_until_finish()" + ], + "metadata": { + "id": "pbUlC102bPaZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The execution of the pipeline job will take some time until it finishes." + ], + "metadata": { + "id": "dzdqmzKzTOng" + } + }, + { + "cell_type": "markdown", + "source": [ + "# What's next \n", + "\n", + "Now that we've seen how we can analyze and preprocess a large-scale dataset with the Beam DataFrames API, we can now train a model on a classification task on our preprocessed dataset. \n", + "\n", + "To learn more on how to get started with classifying structured data, refer to:\n", + "\n", + "* [Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/)\n", + "\n", + "We suggest finding another dataset to try out the Beam DataFrames API processing with.
Make sure to think carefully about which features to include in your model and how they should be represented.\n", + "\n" + ], + "metadata": { + "id": "UOLr6YgOOSVQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "# References\n", + "\n", + "* [Beam DataFrames overview](https://beam.apache.org/documentation/dsls/dataframes/overview) -- an overview of the Beam DataFrames API.\n", + "* [Differences from pandas](https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas) -- goes through some of the differences between Beam DataFrames and Pandas DataFrames, as well as some of the workarounds for unsupported operations.\n", + "* [10 minutes to Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html) -- a quickstart guide to Pandas DataFrames.\n", + "* [Pandas DataFrame API](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html) -- the API reference for Pandas DataFrames.\n", + "* [Data preparation and feature training in ML](https://developers.google.com/machine-learning/data-prep) -- A guideline on data transformation for ML training."
+ ], + "metadata": { + "id": "nG9WXXVcMCe_" + } + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb b/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb new file mode 100644 index 000000000000..cbca4a1e896b --- /dev/null +++ b/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb @@ -0,0 +1,1178 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LzOTNrs_P6Vv" + }, + "outputs": [], + "source": [ + "# @title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the \"License\")\n", + "\n", + "# Licensed to the Apache Software Foundation (ASF) under one\n", + "# or more contributor license agreements. See the NOTICE file\n", + "# distributed with this work for additional information\n", + "# regarding copyright ownership. The ASF licenses this file\n", + "# to you under the Apache License, Version 2.0 (the\n", + "# \"License\"); you may not use this file except in compliance\n", + "# with the License. You may obtain a copy of the License at\n", + "#\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing,\n", + "# software distributed under the License is distributed on an\n", + "# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n", + "# KIND, either express or implied. 
See the License for the\n", + "# specific language governing permissions and limitations\n", + "# under the License" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "faayYQYrQzY3" + }, + "source": [ + "## RunInference in Beam" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JjAt1GesQ9sg" + }, + "source": [ + "Starting with Apache Beam 2.40.0, a new API called RunInference lets you use machine learning (ML) models to do local and remote inference with batch and streaming pipelines. The RunInference API leverages Apache Beam concepts, such as the BatchElements transform and the Shared class, to enable you to use models in your pipelines to create transforms optimized for machine learning inference.\n", + "\n", + "For more details about the RunInference API, see https://beam.apache.org/documentation/sdks/python-machine-learning/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A8xNRyZMW1yK" + }, + "source": [ + "In this notebook, we show how to use RunInference with three different popular ML frameworks: PyTorch, TensorFlow and Scikit-learn.
We showcase three pipelines that use a text classification model for generating predictions.\n", + "\n", + "The different steps needed to build this pipeline can be summarized as follows:\n", + "* Read the input text.\n", + "* Preprocess the text if needed\n", + "* Inference with PyTorch/TensorFlow/Scikit-learn Model\n", + "* PostProcess the output from RunInference if needed " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CTtBTpsHZFCk" + }, + "source": [ + "### RunInference with a PyTorch Model\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5kkjbcIzZIf6" + }, + "source": [ + "#### Install Dependency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "MRASwRTxY-2u", + "outputId": "28760c59-c4dc-4486-dbd2-e7ac2c92c3b8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.7/dist-packages (22.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager.
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.23.1)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.64.1)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.21.6)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (21.3)\n", + "Requirement already satisfied: huggingface-hub<1.0,>=0.10.0 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.1)\n", + "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from transformers) (4.13.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (6.0)\n", + "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.13.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2022.6.2)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.8.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.28.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.7/dist-packages (from huggingface-hub<1.0,>=0.10.0->transformers) (4.1.1)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->transformers) (3.0.9)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from 
importlib-metadata->transformers) (3.9.0)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2022.9.24)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting google-api-core==1.32\n", + " Using cached google_api_core-1.32.0-py2.py3-none-any.whl (93 kB)\n", + "Requirement already satisfied: protobuf<4.0.0dev,>=3.12.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (3.20.3)\n", + "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (2022.4)\n", + "Requirement already satisfied: google-auth<2.0dev,>=1.25.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (1.35.0)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (1.56.4)\n", + "Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (57.4.0)\n", + "Requirement already satisfied: six>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (1.15.0)\n", + "Requirement already satisfied: packaging>=14.3 in 
/usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (21.3)\n", + "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core==1.32) (2.28.1)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core==1.32) (4.9)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core==1.32) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2.0dev,>=1.25.0->google-api-core==1.32) (0.2.8)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=14.3->google-api-core==1.32) (3.0.9)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.32) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.32) (2022.9.24)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.32) (1.24.3)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core==1.32) (2.1.1)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->google-api-core==1.32) (0.4.8)\n", + "Installing collected packages: google-api-core\n", + " Attempting uninstall: google-api-core\n", + " Found existing installation: google-api-core 1.33.2\n", + " Uninstalling google-api-core-1.33.2:\n", + " Successfully uninstalled google-api-core-1.33.2\n", + 
"Successfully installed google-api-core-1.32.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + { + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "google" + ] + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "!pip install --upgrade pip\n", + "!pip install apache_beam[gcp]>=2.40.0\n", + "!pip install transformers\n", + "!pip install google-api-core==1.32" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ObRPUrlEbjHj" + }, + "source": [ + "#### Model\n", + "\n", + "We are using a pretrained text classification model, [distilbert-base-uncased-finetuned-sst-2-english](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english?text=I+like+you.+I+love+you). This model is a fine-tune checkpoint of DistilBERT-base-uncased, fine-tuned on SST-2 dataset.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vfDyy4WNQaJM", + "outputId": "75683116-f415-4956-f44c-baa953c564e1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error: Failed to call git rev-parse --git-dir --show-toplevel: \"fatal: not a git repository (or any of the parent directories): .git\\n\"\n", + "Git LFS initialized.\n", + "fatal: destination path 'distilbert-base-uncased-finetuned-sst-2-english' already exists and is not an empty directory.\n", + "'=2.40.0' distilbert-base-uncased-finetuned-sst-2-english sample_data\n" + ] + } + ], + "source": [ + "! git lfs install\n", + "! git clone https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english\n", + "! 
ls" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vA1UmbFRb5C-" + }, + "source": [ + "#### Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c4ZwN8wsbvgK" + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "import torch\n", + "from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference import RunInference\n", + "from apache_beam.ml.inference.base import PredictionResult, KeyedModelHandler\n", + "from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerKeyedTensor\n", + "\n", + "\n", + "class HuggingFaceStripBatchingWrapper(DistilBertForSequenceClassification):\n", + " \"\"\"Wrapper around HuggingFace model because RunInference requires a batch\n", + " as a list of dicts instead of a dict of lists. Another workaround can be found\n", + " here where they disable batching instead.\n", + " https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py\"\"\"\n", + " def forward(self, **kwargs):\n", + " output = super().forward(**kwargs)\n", + " return [dict(zip(output, v)) for v in zip(*output.values())]\n", + "\n", + "\n", + "\n", + "class Tokenize(beam.DoFn):\n", + " def __init__(self, model_name: str):\n", + " self._model_name = model_name\n", + "\n", + " def setup(self):\n", + " self._tokenizer = DistilBertTokenizer.from_pretrained(self._model_name)\n", + " \n", + " def process(self, text_input: str):\n", + " # We need to pad the token tensors to max length to make sure that all the tensors\n", + " # are of the same length and hence stack-able by the RunInference API. Normally you would batch first\n", + " # and tokenize the batch after and pad each tensor to the max length in the batch.\n", + " # see: 
https://beam.apache.org/documentation/sdks/python-machine-learning/#unable-to-batch-tensor-elements\n", + " tokens = self._tokenizer(text_input, return_tensors='pt', padding='max_length', max_length=512)\n", + " # squeeze because tokenization adds an extra dimension, which is empty\n", + " # in this case because we're tokenizing one element at a time.\n", + " tokens = {key: torch.squeeze(val) for key, val in tokens.items()}\n", + " return [(text_input, tokens)]\n", + "\n", + "class PostProcessor(beam.DoFn):\n", + " def process(self, tuple_):\n", + " text_input, prediction_result = tuple_\n", + " softmax = torch.nn.Softmax(dim=-1)(prediction_result.inference['logits']).detach().numpy()\n", + " return [{\"input\": text_input, \"softmax\": softmax}]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WYYbQTMWctkW" + }, + "source": [ + "#### RunInference Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lLb8D2n2n09n" + }, + "outputs": [], + "source": [ + "inputs = [\n", + " \"This is the worst food I have ever eaten\",\n", + " \"In my soul and in my heart, I’m convinced I’m wrong!\",\n", + " \"Be with me always—take any form—drive me mad! only do not leave me in this abyss, where I cannot find you!\",\n", + " \"Do I want to live? Would you like to live with your soul in the grave?\",\n", + " \"Honest people don’t hide their deeds.\",\n", + " \"Nelly, I am Heathcliff! 
He’s always, always in my mind: not as a pleasure, any more than I am always a pleasure to myself, but as my own being.\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 269 + }, + "id": "TDmMARxGb751", + "outputId": "437e168a-b4c5-463b-ce5f-09a8cb8d8191" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:10: FutureWarning: PytorchModelHandlerKeyedTensor is experimental. No backwards-compatibility guarantees.\n", + " # Remove the CWD from sys.path while we load stuff.\n", + "WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.\n" + ] + }, + { + "data": { + "application/javascript": "\n if (typeof window.interactive_beam_jquery == 'undefined') {\n var jqueryScript = document.createElement('script');\n jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n jqueryScript.type = 'text/javascript';\n jqueryScript.onload = function() {\n var datatableScript = document.createElement('script');\n datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n datatableScript.type = 'text/javascript';\n datatableScript.onload = function() {\n window.interactive_beam_jquery = jQuery.noConflict(true);\n window.interactive_beam_jquery(document).ready(function($){\n \n });\n }\n document.head.appendChild(datatableScript);\n };\n document.head.appendChild(jqueryScript);\n } else {\n window.interactive_beam_jquery(document).ready(function($){\n \n });\n }" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/lib/python3.7/dist-packages/dill/_dill.py:472: FutureWarning: PytorchModelHandlerKeyedTensor is experimental. No backwards-compatibility guarantees.\n", + " obj = StockUnpickler.load(self)\n", + "/usr/local/lib/python3.7/dist-packages/dill/_dill.py:472: FutureWarning: PytorchModelHandlerKeyedTensor is experimental. No backwards-compatibility guarantees.\n", + " obj = StockUnpickler.load(self)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input: This is the worst food I have ever eaten -> negative=99.9777%/positive=0.0223%\n", + "Input: In my soul and in my heart, I’m convinced I’m wrong! -> negative=1.6313%/positive=98.3687%\n", + "Input: Be with me always—take any form—drive me mad! only do not leave me in this abyss, where I cannot find you! -> negative=62.1188%/positive=37.8812%\n", + "Input: Do I want to live? Would you like to live with your soul in the grave? -> negative=73.6841%/positive=26.3159%\n", + "Input: Honest people don’t hide their deeds. -> negative=0.2377%/positive=99.7623%\n", + "Input: Nelly, I am Heathcliff! He’s always, always in my mind: not as a pleasure, any more than I am always a pleasure to myself, but as my own being. 
-> negative=0.0672%/positive=99.9328%\n" + ] + } + ], + "source": [ + "model_handler = PytorchModelHandlerKeyedTensor(\n", + " state_dict_path=\"./distilbert-base-uncased-finetuned-sst-2-english/pytorch_model.bin\",\n", + " model_class=HuggingFaceStripBatchingWrapper,\n", + " model_params={\"config\": DistilBertConfig.from_pretrained(\"./distilbert-base-uncased-finetuned-sst-2-english/config.json\")},\n", + " device='cuda:0')\n", + "\n", + "keyed_model_handler = KeyedModelHandler(model_handler)\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " _ = (pipeline | \"Create inputs\" >> beam.Create(inputs)\n", + " | \"Tokenize\" >> beam.ParDo(Tokenize(\"distilbert-base-uncased-finetuned-sst-2-english\"))\n", + " | \"Inference\" >> RunInference(model_handler=keyed_model_handler)\n", + " | \"Postprocess\" >> beam.ParDo(PostProcessor())\n", + " | \"Print\" >> beam.Map(lambda x: print(f\"Input: {x['input']} -> negative={100 * x['softmax'][0]:.4f}%/positive={100 * x['softmax'][1]:.4f}%\"))\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7KXeaQg3eCcp" + }, + "source": [ + "### RunInference with a TensorFlow Model\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hEHxNka4eOhC" + }, + "source": [ + "Note: TensorFlow models are supported through tfx-bsl." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8KyXULYbeYlD" + }, + "source": [ + "#### Install Dependency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "uqWJhQBlc4oT", + "outputId": "2a17a966-fe2d-45d8-b6b9-02534f40c9a8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: pip in /usr/local/lib/python3.7/dist-packages (22.3)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting apache_beam[gcp]==2.41.0\n", + " Downloading apache_beam-2.41.0-cp37-cp37m-manylinux2010_x86_64.whl (10.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/10.9 MB\u001b[0m \u001b[31m42.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: proto-plus<2,>=1.7.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.22.1)\n", + "Requirement already satisfied: pydot<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.3.0)\n", + "Requirement already satisfied: numpy<1.23.0,>=1.14.3 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.21.6)\n", + "Requirement already satisfied: pyarrow<8.0.0,>=0.15.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (6.0.1)\n", + "Requirement already satisfied: fastavro<2,>=0.23.6 in /usr/local/lib/python3.7/dist-packages (from 
apache_beam[gcp]==2.41.0) (1.6.1)\n", + "Requirement already satisfied: hdfs<3.0.0,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.7.0)\n", + "Requirement already satisfied: dill<0.3.2,>=0.3.1.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.3.1.1)\n", + "Requirement already satisfied: requests<3.0.0,>=2.24.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.28.1)\n", + "Requirement already satisfied: python-dateutil<3,>=2.8.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.8.2)\n", + "Requirement already satisfied: pytz>=2018.3 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2022.4)\n", + "Requirement already satisfied: crcmod<2.0,>=1.7 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.7)\n", + "Requirement already satisfied: protobuf<4,>=3.12.2 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (3.20.3)\n", + "Requirement already satisfied: cloudpickle<3,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.1.0)\n", + "Requirement already satisfied: orjson<4.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (3.8.0)\n", + "Requirement already satisfied: pymongo<4.0.0,>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (3.12.3)\n", + "Requirement already satisfied: grpcio<2,>=1.33.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.49.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (4.1.1)\n", + "Requirement already satisfied: httplib2<0.21.0,>=0.8 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.17.4)\n", + "Requirement already satisfied: google-cloud-language<2,>=1.3.0 in /usr/local/lib/python3.7/dist-packages (from 
apache_beam[gcp]==2.41.0) (1.3.2)\n", + "Requirement already satisfied: google-cloud-pubsub<3,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.13.10)\n", + "Requirement already satisfied: google-apitools<0.5.32,>=0.5.31 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.5.31)\n", + "Requirement already satisfied: google-cloud-recommendations-ai<0.8.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.7.1)\n", + "Requirement already satisfied: cachetools<5,>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (4.2.4)\n", + "Requirement already satisfied: google-cloud-bigtable<2,>=0.31.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.7.2)\n", + "Requirement already satisfied: google-cloud-dlp<4,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (3.9.2)\n", + "Requirement already satisfied: google-auth-httplib2<0.2.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.1.0)\n", + "Requirement already satisfied: google-cloud-datastore<2,>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.8.0)\n", + "Requirement already satisfied: google-cloud-spanner<2,>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.19.3)\n", + "Requirement already satisfied: google-cloud-bigquery-storage<2.14,>=2.6.3 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (2.13.2)\n", + "Requirement already satisfied: google-cloud-vision<2,>=0.38.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.0.2)\n", + "Requirement already satisfied: google-cloud-core<3,>=0.28.1 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.7.3)\n", + "Requirement already satisfied: google-cloud-videointelligence<2,>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from 
apache_beam[gcp]==2.41.0) (1.16.3)\n", + "Requirement already satisfied: grpcio-gcp<1,>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (0.2.2)\n", + "Requirement already satisfied: google-cloud-pubsublite<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.6.0)\n", + "Requirement already satisfied: google-auth<3,>=1.18.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.35.0)\n", + "Requirement already satisfied: google-cloud-bigquery<3,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.21.0)\n", + "Requirement already satisfied: google-api-core!=2.8.2,<3 in /usr/local/lib/python3.7/dist-packages (from apache_beam[gcp]==2.41.0) (1.32.0)\n", + "Requirement already satisfied: six>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core!=2.8.2,<3->apache_beam[gcp]==2.41.0) (1.15.0)\n", + "Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.7/dist-packages (from google-api-core!=2.8.2,<3->apache_beam[gcp]==2.41.0) (21.3)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core!=2.8.2,<3->apache_beam[gcp]==2.41.0) (1.56.4)\n", + "Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core!=2.8.2,<3->apache_beam[gcp]==2.41.0) (57.4.0)\n", + "Requirement already satisfied: fasteners>=0.14 in /usr/local/lib/python3.7/dist-packages (from google-apitools<0.5.32,>=0.5.31->apache_beam[gcp]==2.41.0) (0.18)\n", + "Requirement already satisfied: oauth2client>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from google-apitools<0.5.32,>=0.5.31->apache_beam[gcp]==2.41.0) (4.1.3)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.18.0->apache_beam[gcp]==2.41.0) (4.9)\n", + "Requirement already satisfied: 
pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.18.0->apache_beam[gcp]==2.41.0) (0.2.8)\n", + "Requirement already satisfied: google-resumable-media!=0.4.0,<0.5.0dev,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from google-cloud-bigquery<3,>=1.6.0->apache_beam[gcp]==2.41.0) (0.4.1)\n", + "Requirement already satisfied: grpc-google-iam-v1<0.13dev,>=0.12.3 in /usr/local/lib/python3.7/dist-packages (from google-cloud-bigtable<2,>=0.31.1->apache_beam[gcp]==2.41.0) (0.12.4)\n", + "Requirement already satisfied: grpcio-status>=1.16.0 in /usr/local/lib/python3.7/dist-packages (from google-cloud-pubsub<3,>=2.1.0->apache_beam[gcp]==2.41.0) (1.48.2)\n", + "Requirement already satisfied: overrides<7.0.0,>=6.0.1 in /usr/local/lib/python3.7/dist-packages (from google-cloud-pubsublite<2,>=1.2.0->apache_beam[gcp]==2.41.0) (6.5.0)\n", + "Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from hdfs<3.0.0,>=2.1.0->apache_beam[gcp]==2.41.0) (0.6.2)\n", + "Requirement already satisfied: pyparsing>=2.1.4 in /usr/local/lib/python3.7/dist-packages (from pydot<2,>=1.2.0->apache_beam[gcp]==2.41.0) (3.0.9)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache_beam[gcp]==2.41.0) (1.24.3)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache_beam[gcp]==2.41.0) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache_beam[gcp]==2.41.0) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache_beam[gcp]==2.41.0) (2022.9.24)\n", + "Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.7/dist-packages (from 
oauth2client>=1.4.12->google-apitools<0.5.32,>=0.5.31->apache_beam[gcp]==2.41.0) (0.4.8)\n", + "Installing collected packages: apache_beam\n", + " Attempting uninstall: apache_beam\n", + " Found existing installation: apache-beam 2.42.0\n", + " Uninstalling apache-beam-2.42.0:\n", + " Successfully uninstalled apache-beam-2.42.0\n", + "Successfully installed apache_beam-2.41.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + { + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "apache_beam" + ] + } + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting tensorflow==2.8\n", + " Downloading https://us-python.pkg.dev/colab-wheels/public/tensorflow/tensorflow-2.8.0%2Bzzzcolab20220506162203-cp37-cp37m-linux_x86_64.whl (668.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m668.3/668.3 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: libclang>=9.0.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (14.0.6)\n", + "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (3.1.0)\n", + "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (0.2.0)\n", + "Collecting keras<2.9,>=2.8.0rc0\n", + " Downloading keras-2.8.0-py2.py3-none-any.whl (1.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m17.8 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: flatbuffers>=1.12 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.12)\n", + "Requirement already satisfied: gast>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (0.4.0)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (3.3.0)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.15.0)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.49.1)\n", + "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (4.1.1)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (0.27.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (57.4.0)\n", + "Requirement already satisfied: keras-preprocessing>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.1.2)\n", + "Requirement already satisfied: absl-py>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.3.0)\n", + "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (3.20.3)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.6.3)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (2.0.1)\n", + "Collecting tf-estimator-nightly==2.8.0.dev2021122109\n", + " Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.5/462.5 
kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting tensorboard<2.9,>=2.8\n", + " Downloading tensorboard-2.8.0-py3-none-any.whl (5.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.8/5.8 MB\u001b[0m \u001b[31m62.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.14.1)\n", + "Requirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.7/dist-packages (from tensorflow==2.8) (1.21.6)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.7/dist-packages (from astunparse>=1.6.0->tensorflow==2.8) (0.37.1)\n", + "Requirement already satisfied: cached-property in /usr/local/lib/python3.7/dist-packages (from h5py>=2.9.0->tensorflow==2.8) (1.5.2)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (3.4.1)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (2.28.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (0.4.6)\n", + "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (1.0.1)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (1.8.1)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow==2.8) (1.35.0)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from 
tensorboard<2.9,>=2.8->tensorflow==2.8) (0.6.1)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow==2.8) (4.2.4)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow==2.8) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow==2.8) (4.9)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.9,>=2.8->tensorflow==2.8) (1.3.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<2.9,>=2.8->tensorflow==2.8) (4.13.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow==2.8) (2.10)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow==2.8) (2.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow==2.8) (2022.9.24)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow==2.8) (1.24.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.9,>=2.8->tensorflow==2.8) (3.9.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from 
pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow==2.8) (0.4.8)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.9,>=2.8->tensorflow==2.8) (3.2.1)\n", + "Installing collected packages: tf-estimator-nightly, keras, tensorboard, tensorflow\n", + " Attempting uninstall: keras\n", + " Found existing installation: keras 2.9.0\n", + " Uninstalling keras-2.9.0:\n", + " Successfully uninstalled keras-2.9.0\n", + " Attempting uninstall: tensorboard\n", + " Found existing installation: tensorboard 2.9.1\n", + " Uninstalling tensorboard-2.9.1:\n", + " Successfully uninstalled tensorboard-2.9.1\n", + " Attempting uninstall: tensorflow\n", + " Found existing installation: tensorflow 2.9.2\n", + " Uninstalling tensorflow-2.9.2:\n", + " Successfully uninstalled tensorflow-2.9.2\n", + "Successfully installed keras-2.8.0 tensorboard-2.8.0 tensorflow-2.8.0+zzzcolab20220506162203 tf-estimator-nightly-2.8.0.dev2021122109\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting tfx_bsl\n", + " Downloading tfx_bsl-1.10.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (21.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.6/21.6 MB\u001b[0m \u001b[31m49.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tensorflow-metadata<1.11.0,>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (1.10.0)\n", + "Collecting tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5\n", + " Downloading tensorflow-2.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m578.0/578.0 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: google-api-python-client<2,>=1.7.11 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (1.12.11)\n", + "Collecting tensorflow-serving-api!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15\n", + " Downloading tensorflow_serving_api-2.10.0-py2.py3-none-any.whl (37 kB)\n", + "Requirement already satisfied: numpy<2,>=1.16 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (1.21.6)\n", + "Requirement already satisfied: apache-beam[gcp]<3,>=2.40 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (2.41.0)\n", + "Requirement already satisfied: absl-py<2.0.0,>=0.9 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (1.3.0)\n", + "Requirement already satisfied: protobuf<3.21,>=3.13 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (3.20.3)\n", + "Requirement already satisfied: pyarrow<7,>=6 in 
/usr/local/lib/python3.7/dist-packages (from tfx_bsl) (6.0.1)\n", + "Requirement already satisfied: pandas<2,>=1.0 in /usr/local/lib/python3.7/dist-packages (from tfx_bsl) (1.3.5)\n", + "Requirement already satisfied: requests<3.0.0,>=2.24.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.28.1)\n", + "Requirement already satisfied: dill<0.3.2,>=0.3.1.1 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.3.1.1)\n", + "Requirement already satisfied: pymongo<4.0.0,>=3.8.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (3.12.3)\n", + "Requirement already satisfied: cloudpickle<3,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.1.0)\n", + "Requirement already satisfied: fastavro<2,>=0.23.6 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.6.1)\n", + "Requirement already satisfied: pydot<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.3.0)\n", + "Requirement already satisfied: pytz>=2018.3 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2022.4)\n", + "Requirement already satisfied: grpcio<2,>=1.33.1 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.49.1)\n", + "Requirement already satisfied: crcmod<2.0,>=1.7 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.7)\n", + "Requirement already satisfied: httplib2<0.21.0,>=0.8 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.17.4)\n", + "Requirement already satisfied: hdfs<3.0.0,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.7.0)\n", + "Requirement already satisfied: proto-plus<2,>=1.7.1 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.22.1)\n", + "Requirement 
already satisfied: typing-extensions>=3.7.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (4.1.1)\n", + "Requirement already satisfied: orjson<4.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (3.8.0)\n", + "Requirement already satisfied: python-dateutil<3,>=2.8.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.8.2)\n", + "Requirement already satisfied: cachetools<5,>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (4.2.4)\n", + "Requirement already satisfied: google-cloud-spanner<2,>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.19.3)\n", + "Requirement already satisfied: grpcio-gcp<1,>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.2.2)\n", + "Requirement already satisfied: google-cloud-videointelligence<2,>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.16.3)\n", + "Requirement already satisfied: google-cloud-language<2,>=1.3.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.3.2)\n", + "Requirement already satisfied: google-cloud-pubsub<3,>=2.1.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.13.10)\n", + "Requirement already satisfied: google-cloud-core<3,>=0.28.1 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.7.3)\n", + "Requirement already satisfied: google-cloud-dlp<4,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (3.9.2)\n", + "Requirement already satisfied: google-auth<3,>=1.18.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.35.0)\n", + "Requirement already satisfied: google-auth-httplib2<0.2.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from 
apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.1.0)\n", + "Requirement already satisfied: google-cloud-bigtable<2,>=0.31.1 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.7.2)\n", + "Requirement already satisfied: google-cloud-bigquery-storage<2.14,>=2.6.3 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.13.2)\n", + "Requirement already satisfied: google-api-core!=2.8.2,<3 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.32.0)\n", + "Requirement already satisfied: google-cloud-datastore<2,>=1.8.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.8.0)\n", + "Requirement already satisfied: google-cloud-recommendations-ai<0.8.0,>=0.1.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.7.1)\n", + "Requirement already satisfied: google-apitools<0.5.32,>=0.5.31 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.5.31)\n", + "Requirement already satisfied: google-cloud-vision<2,>=0.38.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.0.2)\n", + "Requirement already satisfied: google-cloud-bigquery<3,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.21.0)\n", + "Requirement already satisfied: google-cloud-pubsublite<2,>=1.2.0 in /usr/local/lib/python3.7/dist-packages (from apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.6.0)\n", + "Requirement already satisfied: six<2dev,>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client<2,>=1.7.11->tfx_bsl) (1.15.0)\n", + "Requirement already satisfied: uritemplate<4dev,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client<2,>=1.7.11->tfx_bsl) (3.0.1)\n", + "Collecting tensorflow-estimator<2.11,>=2.10.0\n", + " Downloading tensorflow_estimator-2.10.0-py2.py3-none-any.whl (438 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m438.7/438.7 kB\u001b[0m \u001b[31m31.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.3.0)\n", + "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.1.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (21.3)\n", + "Requirement already satisfied: gast<=0.4.0,>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.4.0)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.6.3)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (2.0.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (57.4.0)\n", + "Collecting protobuf<3.21,>=3.13\n", + " Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement 
already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.2.0)\n", + "Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.14.1)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.27.0)\n", + "Collecting flatbuffers>=2.0\n", + " Downloading flatbuffers-22.9.24-py2.py3-none-any.whl (26 kB)\n", + "Collecting tensorboard<2.11,>=2.10\n", + " Downloading tensorboard-2.10.1-py3-none-any.whl (5.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m58.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting keras<2.11,>=2.10.0\n", + " Downloading keras-2.10.0-py2.py3-none-any.whl (1.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: libclang>=13.0.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (14.0.6)\n", + "Requirement already satisfied: keras-preprocessing>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.1.2)\n", + "Requirement already satisfied: googleapis-common-protos<2,>=1.52.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-metadata<1.11.0,>=1.10.0->tfx_bsl) (1.56.4)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in 
/usr/local/lib/python3.7/dist-packages (from astunparse>=1.6.0->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.37.1)\n", + "Requirement already satisfied: fasteners>=0.14 in /usr/local/lib/python3.7/dist-packages (from google-apitools<0.5.32,>=0.5.31->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.18)\n", + "Requirement already satisfied: oauth2client>=1.4.12 in /usr/local/lib/python3.7/dist-packages (from google-apitools<0.5.32,>=0.5.31->apache-beam[gcp]<3,>=2.40->tfx_bsl) (4.1.3)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.18.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.18.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (4.9)\n", + "Requirement already satisfied: google-resumable-media!=0.4.0,<0.5.0dev,>=0.3.1 in /usr/local/lib/python3.7/dist-packages (from google-cloud-bigquery<3,>=1.6.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.4.1)\n", + "Requirement already satisfied: grpc-google-iam-v1<0.13dev,>=0.12.3 in /usr/local/lib/python3.7/dist-packages (from google-cloud-bigtable<2,>=0.31.1->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.12.4)\n", + "Requirement already satisfied: grpcio-status>=1.16.0 in /usr/local/lib/python3.7/dist-packages (from google-cloud-pubsub<3,>=2.1.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.48.2)\n", + "Requirement already satisfied: overrides<7.0.0,>=6.0.1 in /usr/local/lib/python3.7/dist-packages (from google-cloud-pubsublite<2,>=1.2.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (6.5.0)\n", + "Requirement already satisfied: cached-property in /usr/local/lib/python3.7/dist-packages (from h5py>=2.9.0->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.5.2)\n", + "Requirement already satisfied: docopt in /usr/local/lib/python3.7/dist-packages (from 
hdfs<3.0.0,>=2.1.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.6.2)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.0.9)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (2022.9.24)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (2.10)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.24.0->apache-beam[gcp]<3,>=2.40->tfx_bsl) (1.24.3)\n", + "Requirement already satisfied: werkzeug>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.0.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.4.6)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.8.1)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from 
tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (0.6.1)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.4.1)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (1.3.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (4.13.0)\n", + "Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.7/dist-packages (from oauth2client>=1.4.12->google-apitools<0.5.32,>=0.5.31->apache-beam[gcp]<3,>=2.40->tfx_bsl) (0.4.8)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.9.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.11,>=2.10->tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5->tfx_bsl) (3.2.1)\n", + "Installing collected packages: keras, flatbuffers, tensorflow-estimator, protobuf, tensorboard, tensorflow, tensorflow-serving-api, tfx_bsl\n", + " Attempting uninstall: keras\n", + " Found existing installation: keras 2.8.0\n", + " Uninstalling keras-2.8.0:\n", + " Successfully uninstalled 
keras-2.8.0\n", + " Attempting uninstall: flatbuffers\n", + " Found existing installation: flatbuffers 1.12\n", + " Uninstalling flatbuffers-1.12:\n", + " Successfully uninstalled flatbuffers-1.12\n", + " Attempting uninstall: tensorflow-estimator\n", + " Found existing installation: tensorflow-estimator 2.9.0\n", + " Uninstalling tensorflow-estimator-2.9.0:\n", + " Successfully uninstalled tensorflow-estimator-2.9.0\n", + " Attempting uninstall: protobuf\n", + " Found existing installation: protobuf 3.20.3\n", + " Uninstalling protobuf-3.20.3:\n", + " Successfully uninstalled protobuf-3.20.3\n", + " Attempting uninstall: tensorboard\n", + " Found existing installation: tensorboard 2.8.0\n", + " Uninstalling tensorboard-2.8.0:\n", + " Successfully uninstalled tensorboard-2.8.0\n", + " Attempting uninstall: tensorflow\n", + " Found existing installation: tensorflow 2.8.0+zzzcolab20220506162203\n", + " Uninstalling tensorflow-2.8.0+zzzcolab20220506162203:\n", + " Successfully uninstalled tensorflow-2.8.0+zzzcolab20220506162203\n", + "Successfully installed flatbuffers-22.9.24 keras-2.10.0 protobuf-3.19.6 tensorboard-2.10.1 tensorflow-2.10.0 tensorflow-estimator-2.10.0 tensorflow-serving-api-2.10.0 tfx_bsl-1.10.1\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + }, + { + "data": { + "application/vnd.colab-display-data+json": { + "pip_warning": { + "packages": [ + "google" + ] + } + } + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting tensorflow-text==2.8.1\n", + " Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m39.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: tensorflow-hub>=0.8.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow-text==2.8.1) (0.12.0)\n", + "Collecting tensorflow<2.9,>=2.8.0\n", + " Downloading tensorflow-2.8.3-cp37-cp37m-manylinux2010_x86_64.whl (497.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m497.9/497.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: termcolor>=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (2.0.1)\n", + "Requirement already satisfied: protobuf<3.20,>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.19.6)\n", + "Requirement already satisfied: flatbuffers>=1.12 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (22.9.24)\n", + "Requirement already satisfied: typing-extensions>=3.6.6 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (4.1.1)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages 
(from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (57.4.0)\n", + "Requirement already satisfied: opt-einsum>=2.3.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.3.0)\n", + "Requirement already satisfied: grpcio<2.0,>=1.24.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.49.1)\n", + "Requirement already satisfied: absl-py>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.3.0)\n", + "Collecting tensorboard<2.9,>=2.8\n", + " Using cached tensorboard-2.8.0-py3-none-any.whl (5.8 MB)\n", + "Collecting tensorflow-estimator<2.9,>=2.8\n", + " Downloading tensorflow_estimator-2.8.0-py2.py3-none-any.whl (462 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.3/462.3 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy>=1.20 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.21.6)\n", + "Requirement already satisfied: tensorflow-io-gcs-filesystem>=0.23.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.27.0)\n", + "Requirement already satisfied: libclang>=9.0.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (14.0.6)\n", + "Requirement already satisfied: wrapt>=1.11.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.14.1)\n", + "Requirement already satisfied: google-pasta>=0.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.2.0)\n", + "Collecting keras<2.9,>=2.8.0rc0\n", + " Using cached keras-2.8.0-py2.py3-none-any.whl (1.4 MB)\n", + "Requirement already satisfied: gast>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.4.0)\n", + "Requirement already satisfied: h5py>=2.9.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.1.0)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.15.0)\n", + "Requirement already satisfied: keras-preprocessing>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.1.2)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.6.3)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/lib/python3.7/dist-packages (from astunparse>=1.6.0->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.37.1)\n", + "Requirement already satisfied: cached-property in /usr/local/lib/python3.7/dist-packages (from h5py>=2.9.0->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.5.2)\n", + "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.4.1)\n", + "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.4.6)\n", + "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.8.1)\n", + "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (2.28.1)\n", + "Requirement already satisfied: google-auth<3,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.35.0)\n", + 
"Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.0.1)\n", + "Requirement already satisfied: tensorboard-data-server<0.7.0,>=0.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.6.1)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (4.9)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.2.8)\n", + "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (4.2.4)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.3.1)\n", + "Requirement already satisfied: importlib-metadata>=4.4 in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (4.13.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (2022.9.24)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.7/dist-packages (from 
requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (2.10)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (1.24.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata>=4.4->markdown>=2.6.8->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.9.0)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (0.4.8)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard<2.9,>=2.8->tensorflow<2.9,>=2.8.0->tensorflow-text==2.8.1) (3.2.1)\n", + "Installing collected packages: tensorflow-estimator, keras, tensorboard, tensorflow, tensorflow-text\n", + " Attempting uninstall: tensorflow-estimator\n", + " Found existing installation: tensorflow-estimator 2.10.0\n", + " Uninstalling tensorflow-estimator-2.10.0:\n", + " Successfully uninstalled tensorflow-estimator-2.10.0\n", + " Attempting uninstall: keras\n", + " Found existing installation: keras 2.10.0\n", + " Uninstalling keras-2.10.0:\n", + " Successfully uninstalled keras-2.10.0\n", + " Attempting uninstall: tensorboard\n", + " Found existing installation: tensorboard 2.10.1\n", + " Uninstalling tensorboard-2.10.1:\n", + " Successfully uninstalled tensorboard-2.10.1\n", + " Attempting uninstall: tensorflow\n", + " Found existing installation: tensorflow 2.10.0\n", + " Uninstalling tensorflow-2.10.0:\n", + " Successfully uninstalled tensorflow-2.10.0\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are 
installed. This behaviour is the source of the following dependency conflicts.\n", + "tfx-bsl 1.10.1 requires tensorflow!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,<3,>=1.15.5, but you have tensorflow 2.8.3 which is incompatible.\n", + "tensorflow-serving-api 2.10.0 requires tensorflow<3,>=2.10.0, but you have tensorflow 2.8.3 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed keras-2.8.0 tensorboard-2.8.0 tensorflow-2.8.3 tensorflow-estimator-2.8.0 tensorflow-text-2.8.1\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install --upgrade pip\n", + "!pip install google-api-core==1.32\n", + "!pip install apache_beam[gcp]==2.41.0\n", + "!pip install tensorflow==2.8\n", + "!pip install tfx_bsl\n", + "!pip install tensorflow-text==2.8.1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "642maF_redwC" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow as tf\n", + "import tensorflow_text as text\n", + "from scipy.special import expit\n", + "\n", + "import apache_beam as beam\n", + "import tfx_bsl\n", + "from tfx_bsl.public.beam import RunInference\n", + "from tfx_bsl.public import tfxio\n", + "from tfx_bsl.public.proto import model_spec_pb2\n", + "from tfx_bsl.public.tfxio import TFExampleRecord\n", + "from tensorflow_serving.apis import prediction_log_pb2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h2JP7zsqerCT" + }, + "source": [ + "#### Model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ydYQ_5EyfeEM" + }, + "source": [ + "Download a pretrained binary classifier to perform sentiment analysis on an IMDB dataset from GCS. 
This model was trained by following this [tutorial](https://www.tensorflow.org/tutorials/keras/text_classification)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BucRWly0flz8" + }, + "outputs": [], + "source": [ + "model_dir = \"gs://apache-beam-testing-ml-examples/imdb_bert\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GZ-Ioc8ZfyIT" + }, + "source": [ + "#### Helper Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pZ0LNtHUfsRq" + }, + "outputs": [], + "source": [ + "class ExampleProcessor:\n", + " \"\"\"\n", + " Process the raw text input to a format suitable for RunInference.\n", + " TensorFlow model handler expects a serialized tf.Example as input\n", + " \"\"\"\n", + " def create_example(self, feature):\n", + " return tf.train.Example(\n", + " features=tf.train.Features(\n", + " feature={'x' : self.create_feature(feature)})\n", + " )\n", + "\n", + " def create_feature(self, element):\n", + " return tf.train.Feature(bytes_list=tf.train.BytesList(value=[element]))\n", + "\n", + "class PredictionProcessor(beam.DoFn):\n", + " \"\"\"\n", + " Process the RunInference output to return the input text and the sigmoid probability\n", + " \"\"\"\n", + " def process(\n", + " self,\n", + " element: prediction_log_pb2.PredictionLog):\n", + " predict_log = element.predict_log\n", + " input_value = tf.train.Example.FromString(predict_log.request.inputs['text'].string_val[0])\n", + " output_value = predict_log.response.outputs\n", + " # print(output_value)\n", + " yield (f\"input is [{input_value.features.feature['x'].bytes_list.value}] output is {expit(output_value['classifier'].float_val)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PZVwI4BbgaAI" + }, + "source": [ + "#### Prepare the Input" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TOXX1KMKi_mm" + }, + "outputs": [], + "source": [ + "inputs = 
np.array([\n", + " b\"this is such an amazing movie\",\n", + " b\"The movie was great\",\n", + " b\"The movie was okish\",\n", + " b\"The movie was terrible\"\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "O2Y15WmfgZXQ" + }, + "outputs": [], + "source": [ + "input_strings_file = 'input_strings.tfrecord'\n", + "\n", + "# Preprocess the input as RunInference is expecting a serialized tf.example as an input\n", + "# Write the processed input to a file \n", + "# One can also do it as a pipeline step by using beam.Map() \n", + "\n", + "with tf.io.TFRecordWriter(input_strings_file) as writer:\n", + " for i in inputs:\n", + " example = ExampleProcessor().create_example(feature=i)\n", + " writer.write(example.SerializeToString())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BYkQl_l8gRgo" + }, + "source": [ + "#### RunInference Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uh5bMhxdgA7Q", + "outputId": "2a22059f-519c-44f7-e36f-59e09b1cb24a" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /usr/local/lib/python3.7/dist-packages/tfx_bsl/beam/run_inference.py:615: load (from tensorflow.python.saved_model.loader_impl) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. 
There will be a new function for importing SavedModels in Tensorflow 2.0.\n", + "WARNING:apache_beam.io.tfrecordio:Couldn't find python-snappy so the implementation of _TFRecordUtil._masked_crc32c is not as fast as it could be.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input is [[b'this is such an amazing movie']] output is [0.99906057]\n", + "input is [[b'The movie was great']] output is [0.99307914]\n", + "input is [[b'The movie was okish']] output is [0.03274685]\n", + "input is [[b'The movie was terrible']] output is [0.00680008]\n" + ] + } + ], + "source": [ + "saved_model_spec = model_spec_pb2.SavedModelSpec(model_path=model_dir)\n", + "inference_spec_type = model_spec_pb2.InferenceSpecType(saved_model_spec=saved_model_spec)\n", + "\n", + "#A Beam IO that reads a file of serialized tf.Examples\n", + "tfexample_beam_record = TFExampleRecord(file_pattern='input_strings.tfrecord')\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " _ = ( pipeline | \"Create Input PCollection\" >> tfexample_beam_record.RawRecordBeamSource()\n", + " | \"Do Inference\" >> RunInference(model_spec_pb2.InferenceSpecType(\n", + " saved_model_spec=model_spec_pb2.SavedModelSpec(model_path=model_dir)))\n", + " | \"Post Process\" >> beam.ParDo(PredictionProcessor())\n", + " | beam.Map(print)\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8wBUckzHjGV6" + }, + "source": [ + "### RunInference with Scikit-Learn\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6ArL_55kjxkO" + }, + "source": [ + "#### Install Dependency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R4p6Mil0jxSy" + }, + "outputs": [], + "source": [ + "!pip install --upgrade pip\n", + "!pip install google-api-core==1.32\n", + "!pip install apache_beam[gcp]==2.41.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_YtRRxh1hLag" + }, + "outputs": [], + "source": 
[ + "import pickle\n", + "\n", + "import apache_beam as beam\n", + "from apache_beam.ml.inference import RunInference\n", + "from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy, ModelFileType" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-7ABKlZvkFHy" + }, + "source": [ + "#### Model\n", + "\n", + "Train and save a sentiment analysis pipeline on movie reviews to classify movie reviews as either positive or negative" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WI_UXluPkRYq" + }, + "source": [ + "This model was trained by following this [tutorial](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#exercise-2-sentiment-analysis-on-movie-reviews)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_dir = \"gs://apache-beam-testing-ml-examples/sklearn-text-classification/sklearn_sentiment_analysis_pipeline.pkl\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KL4Cx8s0mBqn" + }, + "source": [ + "#### RunInference Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kyN2Aco8l7SR" + }, + "outputs": [], + "source": [ + "inputs = [\n", + " \"In my soul and in my heart, I’m convinced I’m wrong!\",\n", + " \"Be with me always—take any form—drive me mad! only do not leave me in this abyss, where I cannot find you!\",\n", + " \"Do I want to live? Would you like to live with your soul in the grave?\",\n", + " \"Honest people don’t hide their deeds.\",\n", + " \"Nelly, I am Heathcliff! 
He’s always, always in my mind: not as a pleasure, any more than I am always a pleasure to myself, but as my own being.\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QnQ6ePcgmEeR", + "outputId": "b0d4d31a-76c1-49e4-aa5a-8003a95bbb47" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input: In my soul and in my heart, I’m convinced I’m wrong! -> negative\n", + "input: Be with me always—take any form—drive me mad! only do not leave me in this abyss, where I cannot find you! -> positive\n", + "input: Do I want to live? Would you like to live with your soul in the grave? -> positive\n", + "input: Honest people don’t hide their deeds. -> negative\n", + "input: Nelly, I am Heathcliff! He’s always, always in my mind: not as a pleasure, any more than I am always a pleasure to myself, but as my own being. -> negative\n" + ] + } + ], + "source": [ + "# One can choose a Sklearn model handler based on their input data type:\n", + "# 1. SklearnModelHandlerNumpy: For using numpy arrays as an input\n", + "# 2. SklearnModelHandlerPandas: For using pandas dataframes as an input\n", + "\n", + "# Sklearn model handler supports loading of two serialized format: \n", + "# 1. ModelFileType.PICKLE: For models saved using pickle\n", + "# 2. 
ModelFileType.JOBLIB: For models saved using Joblib\n", + "\n", + "model_handler = SklearnModelHandlerNumpy(model_uri=model_dir, model_file_type=ModelFileType.PICKLE)\n", + "\n", + "with beam.Pipeline() as pipeline:\n", + " _ = (pipeline | \"Create inputs\" >> beam.Create(inputs)\n", + " | \"Inference\" >> RunInference(model_handler=model_handler)\n", + " | \"Print\" >> beam.Map(lambda x: print(f\"input: {x.example} -> {'positive' if x.inference == 0 else 'negative'}\"))\n", + " )" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "5kkjbcIzZIf6", + "vA1UmbFRb5C-", + "-7ABKlZvkFHy" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/notebooks/tour-of-beam/dataframes.ipynb b/examples/notebooks/tour-of-beam/dataframes.ipynb index e1ae406f668f..06330ade3ada 100644 --- a/examples/notebooks/tour-of-beam/dataframes.ipynb +++ b/examples/notebooks/tour-of-beam/dataframes.ipynb @@ -1,6 +1,6 @@ { "nbformat": 4, - "nbformat_minor": 2, + "nbformat_minor": 0, "metadata": { "colab": { "name": "Beam DataFrames", @@ -64,8 +64,7 @@ "> ℹ️ To learn more about Beam DataFrames, take a look at the\n", "[Beam DataFrames overview](https://beam.apache.org/documentation/dsls/dataframes/overview) page.\n", "\n", - "First, we need to install Apache Beam with the `interactive` extra for the Interactive runner.", - "We also need to install a version of `pandas` supported by the DataFrame API, which we can get with the `dataframe` extra in Beam 2.34.0 and newer." + "First, we need to install Apache Beam with the `interactive` extra for the Interactive runner. We also need to install a version of `pandas` supported by the DataFrame API, which we can get with the `dataframe` extra in Beam 2.34.0 and newer."
], "metadata": { "id": "hDuXLLSZnI1D" @@ -135,7 +134,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "source": [ "import apache_beam as beam\n", "import apache_beam.runners.interactive.interactive_beam as ib\n", @@ -283,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "source": [ "import apache_beam.runners.interactive.interactive_beam as ib\n", "\n", @@ -408,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "source": [ "import apache_beam as beam\n", "from apache_beam.dataframe import convert\n", @@ -470,7 +469,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "source": [ "import pandas as pd\n", "import apache_beam as beam\n", @@ -533,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "source": [ "import pandas as pd\n", "import apache_beam as beam\n", @@ -600,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "source": [ "import csv\n", "import apache_beam as beam\n", @@ -676,7 +675,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "source": [ "import csv\n", "import pandas as pd\n", @@ -738,11 +737,12 @@ "* [Beam DataFrames overview](https://beam.apache.org/documentation/dsls/dataframes/overview) -- an overview of the Beam DataFrames API.\n", "* [Differences from pandas](https://beam.apache.org/documentation/dsls/dataframes/differences-from-pandas) -- goes through some of the differences between Beam DataFrames and Pandas DataFrames, as well as some of the workarounds for unsupported operations.\n", "* [10 minutes to Pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html) -- a quickstart guide to Pandas DataFrames.\n", - "* [Pandas DataFrame API](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html) -- the API reference for Pandas DataFrames" + "* [Pandas DataFrame 
API](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html) -- the API reference for Pandas DataFrames\n", + "* [Preprocessing with Beam Dataframes](https://colab.research.google.com/github/apache/beam/blob/master/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb) -- an example of data preprocessing for ML training using Beam DataFrames API\n" ], "metadata": { "id": "UflW6AJp6-ss" } } ] -} +} \ No newline at end of file diff --git a/learning/katas/python/Common Transforms/Aggregation/Count/task.py b/learning/katas/python/Common Transforms/Aggregation/Count/task.py index 188360e5a258..a4e5b0cb53ee 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Count/task.py +++ b/learning/katas/python/Common Transforms/Aggregation/Count/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.combiners.Count.Globally() - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Aggregation/Largest/task.py b/learning/katas/python/Common Transforms/Aggregation/Largest/task.py index 5798a2366714..fbbe17223742 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Largest/task.py +++ b/learning/katas/python/Common Transforms/Aggregation/Largest/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.combiners.Top.Largest(2) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Aggregation/Mean/task.py b/learning/katas/python/Common Transforms/Aggregation/Mean/task.py index 6b05b1d25cec..024f1b02d14c 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Mean/task.py +++ b/learning/katas/python/Common Transforms/Aggregation/Mean/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p 
| beam.Create(range(1, 11)) | beam.combiners.Mean.Globally() - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Aggregation/Smallest/task.py b/learning/katas/python/Common Transforms/Aggregation/Smallest/task.py index c2f2f54ca48d..9b2ec87586de 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Smallest/task.py +++ b/learning/katas/python/Common Transforms/Aggregation/Smallest/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.combiners.Top.Smallest(1) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Aggregation/Sum/task.py b/learning/katas/python/Common Transforms/Aggregation/Sum/task.py index e857c73a9334..a5c8c997279f 100644 --- a/learning/katas/python/Common Transforms/Aggregation/Sum/task.py +++ b/learning/katas/python/Common Transforms/Aggregation/Sum/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.CombineGlobally(sum) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Filter/Filter/task.py b/learning/katas/python/Common Transforms/Filter/Filter/task.py index 2024eaf41840..756e7a7d22a9 100644 --- a/learning/katas/python/Common Transforms/Filter/Filter/task.py +++ b/learning/katas/python/Common Transforms/Filter/Filter/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.Filter(lambda num: num % 2 == 0) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/Filter/ParDo/task.py b/learning/katas/python/Common Transforms/Filter/ParDo/task.py index 38c18bdf0e2a..f6f148342072 100644 --- a/learning/katas/python/Common Transforms/Filter/ParDo/task.py +++ 
b/learning/katas/python/Common Transforms/Filter/ParDo/task.py @@ -28,8 +28,6 @@ import apache_beam as beam -from log_elements import LogElements - class FilterOutEvenNumber(beam.DoFn): @@ -41,4 +39,4 @@ def process(self, element): with beam.Pipeline() as p: (p | beam.Create(range(1, 11)) | beam.ParDo(FilterOutEvenNumber()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.py b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.py index 9d5ea7d51f39..35c44e7b3043 100644 --- a/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.py +++ b/learning/katas/python/Common Transforms/WithKeys/WithKeys/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(['apple', 'banana', 'cherry', 'durian', 'guava', 'melon']) | beam.WithKeys(lambda word: word[0:1]) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Branching/Branching/task.py b/learning/katas/python/Core Transforms/Branching/Branching/task.py index bc69caa8c96e..53ffdf0723ac 100644 --- a/learning/katas/python/Core Transforms/Branching/Branching/task.py +++ b/learning/katas/python/Core Transforms/Branching/Branching/task.py @@ -30,8 +30,6 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: numbers = p | beam.Create([1, 2, 3, 4, 5]) @@ -39,5 +37,5 @@ mult5_results = numbers | beam.Map(lambda num: num * 5) mult10_results = numbers | beam.Map(lambda num: num * 10) - mult5_results | 'Log multiply 5' >> LogElements(prefix='Multiplied by 5: ') - mult10_results | 'Log multiply 10' >> LogElements(prefix='Multiplied by 10: ') + mult5_results | 'Log multiply 5' >> beam.LogElements(prefix='Multiplied by 5: ') + mult10_results | 'Log multiply 10' >> beam.LogElements(prefix='Multiplied by 10: ') diff --git a/learning/katas/python/Core 
Transforms/CoGroupByKey/CoGroupByKey/task.py b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.py index 2c83e7b23857..636cc79d17bc 100644 --- a/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.py +++ b/learning/katas/python/Core Transforms/CoGroupByKey/CoGroupByKey/task.py @@ -31,8 +31,6 @@ import apache_beam as beam -from log_elements import LogElements - class WordsAlphabet: @@ -67,4 +65,4 @@ def cogbk_result_to_wordsalphabet(cgbk_result): countries = p | 'Countries' >> beam.Create(['australia', 'brazil', 'canada']) (apply_transforms(fruits, countries) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.py b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.py index 8066c01ea34d..6ab01d208728 100644 --- a/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.py +++ b/learning/katas/python/Core Transforms/Combine/Combine PerKey/task.py @@ -30,8 +30,6 @@ import apache_beam as beam -from log_elements import LogElements - PLAYER_1 = 'Player 1' PLAYER_2 = 'Player 2' PLAYER_3 = 'Player 3' @@ -41,4 +39,4 @@ (p | beam.Create([(PLAYER_1, 15), (PLAYER_2, 10), (PLAYER_1, 100), (PLAYER_3, 25), (PLAYER_2, 75)]) | beam.CombinePerKey(sum) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Combine/CombineFn/task.py b/learning/katas/python/Core Transforms/Combine/CombineFn/task.py index 396a82caa65d..1ff506247765 100644 --- a/learning/katas/python/Core Transforms/Combine/CombineFn/task.py +++ b/learning/katas/python/Core Transforms/Combine/CombineFn/task.py @@ -28,8 +28,6 @@ import apache_beam as beam -from log_elements import LogElements - class AverageFn(beam.CombineFn): @@ -53,4 +51,4 @@ def extract_output(self, accumulator): (p | beam.Create([10, 20, 50, 70, 90]) | beam.CombineGlobally(AverageFn()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core 
Transforms/Combine/Simple Function/task.py b/learning/katas/python/Core Transforms/Combine/Simple Function/task.py index 1ccc335b42eb..7ac6768c1d34 100644 --- a/learning/katas/python/Core Transforms/Combine/Simple Function/task.py +++ b/learning/katas/python/Core Transforms/Combine/Simple Function/task.py @@ -29,8 +29,6 @@ import apache_beam as beam -from log_elements import LogElements - def sum(numbers): total = 0 @@ -45,4 +43,4 @@ def sum(numbers): (p | beam.Create([1, 2, 3, 4, 5]) | beam.CombineGlobally(sum) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.py b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.py index 87e13fc0bd8b..66960ab1c4b3 100644 --- a/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.py +++ b/learning/katas/python/Core Transforms/Composite Transform/Composite Transform/task.py @@ -30,8 +30,6 @@ import apache_beam as beam -from log_elements import LogElements - class ExtractAndMultiplyNumbers(beam.PTransform): @@ -46,4 +44,4 @@ def expand(self, pcoll): (p | beam.Create(['1,2,3,4,5', '6,7,8,9,10']) | ExtractAndMultiplyNumbers() - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Flatten/Flatten/task.py b/learning/katas/python/Core Transforms/Flatten/Flatten/task.py index e13017a7a31a..0d3cfb26be46 100644 --- a/learning/katas/python/Core Transforms/Flatten/Flatten/task.py +++ b/learning/katas/python/Core Transforms/Flatten/Flatten/task.py @@ -28,8 +28,6 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: wordsStartingWithA = \ @@ -40,4 +38,4 @@ ((wordsStartingWithA, wordsStartingWithB) | beam.Flatten() - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.py b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.py index 
36f944ce6b80..e47136554538 100644 --- a/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.py +++ b/learning/katas/python/Core Transforms/GroupByKey/GroupByKey/task.py @@ -29,11 +29,9 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(['apple', 'ball', 'car', 'bear', 'cheetah', 'ant']) | beam.Map(lambda word: (word[0], word)) | beam.GroupByKey() - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Map/FlatMap/task.py b/learning/katas/python/Core Transforms/Map/FlatMap/task.py index 5d6b382ad634..3b9e876d6f20 100644 --- a/learning/katas/python/Core Transforms/Map/FlatMap/task.py +++ b/learning/katas/python/Core Transforms/Map/FlatMap/task.py @@ -29,10 +29,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(['Apache Beam', 'Unified Batch and Streaming']) | beam.FlatMap(lambda sentence: sentence.split()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Map/Map/task.py b/learning/katas/python/Core Transforms/Map/Map/task.py index 4fddc4394a58..aa6714ba9489 100644 --- a/learning/katas/python/Core Transforms/Map/Map/task.py +++ b/learning/katas/python/Core Transforms/Map/Map/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create([10, 20, 30, 40, 50]) | beam.Map(lambda num: num * 5) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.py b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.py index fc28fb9adc47..cadd56b53021 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.py +++ b/learning/katas/python/Core Transforms/Map/ParDo OneToMany/task.py @@ -29,8 +29,6 @@ import apache_beam as beam -from log_elements import LogElements - class 
BreakIntoWordsDoFn(beam.DoFn): @@ -43,5 +41,5 @@ def process(self, element): (p | beam.Create(['Hello Beam', 'It is awesome']) | beam.ParDo(BreakIntoWordsDoFn()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Map/ParDo/task.py b/learning/katas/python/Core Transforms/Map/ParDo/task.py index e54faf2daa16..b6a0aed8d99d 100644 --- a/learning/katas/python/Core Transforms/Map/ParDo/task.py +++ b/learning/katas/python/Core Transforms/Map/ParDo/task.py @@ -28,8 +28,6 @@ import apache_beam as beam -from log_elements import LogElements - class MultiplyByTenDoFn(beam.DoFn): @@ -41,5 +39,5 @@ def process(self, element): (p | beam.Create([1, 2, 3, 4, 5]) | beam.ParDo(MultiplyByTenDoFn()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Partition/Partition/task.py b/learning/katas/python/Core Transforms/Partition/Partition/task.py index cd28eba307a9..b06a1662e55c 100644 --- a/learning/katas/python/Core Transforms/Partition/Partition/task.py +++ b/learning/katas/python/Core Transforms/Partition/Partition/task.py @@ -29,8 +29,6 @@ import apache_beam as beam -from log_elements import LogElements - def partition_fn(number, num_partitions): if number > 100: @@ -45,5 +43,5 @@ def partition_fn(number, num_partitions): (p | beam.Create([1, 2, 3, 4, 5, 100, 110, 150, 250]) | beam.Partition(partition_fn, 2)) - results[0] | 'Log numbers > 100' >> LogElements(prefix='Number > 100: ') - results[1] | 'Log numbers <= 100' >> LogElements(prefix='Number <= 100: ') + results[0] | 'Log numbers > 100' >> beam.LogElements(prefix='Number > 100: ') + results[1] | 'Log numbers <= 100' >> beam.LogElements(prefix='Number <= 100: ') diff --git a/learning/katas/python/Core Transforms/Side Input/Side Input/task.py b/learning/katas/python/Core Transforms/Side Input/Side Input/task.py index edda30d6308e..5943907b5ab3 100644 --- a/learning/katas/python/Core Transforms/Side Input/Side Input/task.py +++ 
b/learning/katas/python/Core Transforms/Side Input/Side Input/task.py @@ -29,8 +29,6 @@ import apache_beam as beam -from log_elements import LogElements - class Person: def __init__(self, name, city, country=''): @@ -64,4 +62,4 @@ def process(self, element, cities_to_countries): (p | beam.Create(persons) | beam.ParDo(EnrichCountryDoFn(), beam.pvalue.AsDict(cities_to_countries)) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Core Transforms/Side Output/Side Output/task.py b/learning/katas/python/Core Transforms/Side Output/Side Output/task.py index b29f1cd99d17..f6dad94a5671 100644 --- a/learning/katas/python/Core Transforms/Side Output/Side Output/task.py +++ b/learning/katas/python/Core Transforms/Side Output/Side Output/task.py @@ -32,8 +32,6 @@ import apache_beam as beam from apache_beam import pvalue -from log_elements import LogElements - num_below_100_tag = 'num_below_100' num_above_100_tag = 'num_above_100' @@ -54,5 +52,5 @@ def process(self, element): | beam.ParDo(ProcessNumbersDoFn()) .with_outputs(num_above_100_tag, main=num_below_100_tag)) - results[num_below_100_tag] | 'Log numbers <= 100' >> LogElements(prefix='Number <= 100: ') - results[num_above_100_tag] | 'Log numbers > 100' >> LogElements(prefix='Number > 100: ') + results[num_below_100_tag] | 'Log numbers <= 100' >> beam.LogElements(prefix='Number <= 100: ') + results[num_above_100_tag] | 'Log numbers > 100' >> beam.LogElements(prefix='Number > 100: ') diff --git a/learning/katas/python/Examples/Word Count/Word Count/task.py b/learning/katas/python/Examples/Word Count/Word Count/task.py index af0df927fb4e..4c605c401ef8 100644 --- a/learning/katas/python/Examples/Word Count/Word Count/task.py +++ b/learning/katas/python/Examples/Word Count/Word Count/task.py @@ -31,8 +31,6 @@ import apache_beam as beam -from log_elements import LogElements - lines = [ "apple orange grape banana apple banana", "banana orange banana papaya" @@ -44,4 +42,4 @@ | beam.FlatMap(lambda 
sentence: sentence.split()) | beam.combiners.Count.PerElement() | beam.MapTuple(lambda k, v: k + ":" + str(v)) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/IO/TextIO/ReadFromText/task.py b/learning/katas/python/IO/TextIO/ReadFromText/task.py index 2e764c0e836e..720d9214abc8 100644 --- a/learning/katas/python/IO/TextIO/ReadFromText/task.py +++ b/learning/katas/python/IO/TextIO/ReadFromText/task.py @@ -30,11 +30,9 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: file_path = 'countries.txt' (p | beam.io.ReadFromText(file_path) | beam.Map(lambda country: country.upper()) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.py b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.py index a9be129bc37f..bed032039ef7 100644 --- a/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.py +++ b/learning/katas/python/Introduction/Hello Beam/Hello Beam/task.py @@ -28,10 +28,8 @@ import apache_beam as beam -from log_elements import LogElements - with beam.Pipeline() as p: (p | beam.Create(['Hello Beam']) - | LogElements()) + | beam.LogElements()) diff --git a/learning/katas/python/Streaming/Timestamps/Add Timestamps/task.py b/learning/katas/python/Streaming/Timestamps/Add Timestamps/task.py index abbaa3c2851e..51e53dd74e38 100644 --- a/learning/katas/python/Streaming/Timestamps/Add Timestamps/task.py +++ b/learning/katas/python/Streaming/Timestamps/Add Timestamps/task.py @@ -32,8 +32,6 @@ import apache_beam as beam from apache_beam.transforms import window -from log_elements import LogElements - class Event: def __init__(self, id, event, timestamp): @@ -60,5 +58,5 @@ def process(self, element, **kwargs): Event('5', 'book-order', datetime.datetime(2020, 3, 8, 0, 0, 0, 0, tzinfo=pytz.UTC)), ]) | beam.ParDo(AddTimestampDoFn()) - | LogElements(with_timestamp=True)) + | beam.LogElements(with_timestamp=True)) diff --git 
a/learning/katas/python/Streaming/Triggers/Early Triggers/task.py b/learning/katas/python/Streaming/Triggers/Early Triggers/task.py index 0817560ce1cf..830e612a4307 100644 --- a/learning/katas/python/Streaming/Triggers/Early Triggers/task.py +++ b/learning/katas/python/Streaming/Triggers/Early Triggers/task.py @@ -41,7 +41,7 @@ from apache_beam.transforms.trigger import AfterCount from apache_beam.transforms.trigger import AccumulationMode from apache_beam.utils.timestamp import Duration -from log_elements import LogElements +from apache_beam.transforms.util import LogElements class CountEventsWithEarlyTrigger(beam.PTransform): diff --git a/learning/katas/python/Streaming/Triggers/Event Time Triggers/task.py b/learning/katas/python/Streaming/Triggers/Event Time Triggers/task.py index 4476721ec83a..283648499e1a 100644 --- a/learning/katas/python/Streaming/Triggers/Event Time Triggers/task.py +++ b/learning/katas/python/Streaming/Triggers/Event Time Triggers/task.py @@ -37,7 +37,7 @@ from apache_beam.transforms.trigger import AccumulationMode from apache_beam.transforms.trigger import AfterWatermark from apache_beam.utils.timestamp import Duration -from log_elements import LogElements +from apache_beam.transforms.util import LogElements class CountEvents(beam.PTransform): diff --git a/learning/katas/python/Streaming/Triggers/Window Accumulation Modes/task.py b/learning/katas/python/Streaming/Triggers/Window Accumulation Modes/task.py index 5e7881b1e94c..51f592722a92 100644 --- a/learning/katas/python/Streaming/Triggers/Window Accumulation Modes/task.py +++ b/learning/katas/python/Streaming/Triggers/Window Accumulation Modes/task.py @@ -40,7 +40,7 @@ from apache_beam.utils.timestamp import Duration from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import StandardOptions -from log_elements import LogElements +from apache_beam.transforms.util import LogElements class CountEventsWithAccumulating(beam.PTransform): 
diff --git a/learning/katas/python/Streaming/Windows/Fixed Windows/task.py b/learning/katas/python/Streaming/Windows/Fixed Windows/task.py index 627326342917..3bf218e6b5e6 100644 --- a/learning/katas/python/Streaming/Windows/Fixed Windows/task.py +++ b/learning/katas/python/Streaming/Windows/Fixed Windows/task.py @@ -35,8 +35,6 @@ import apache_beam as beam from apache_beam.transforms import window -from log_elements import LogElements - with beam.Pipeline() as p: @@ -54,4 +52,4 @@ ]) | beam.WindowInto(window.FixedWindows(24*60*60)) | beam.combiners.Count.PerElement() - | LogElements(with_window=True)) + | beam.LogElements(with_window=True)) diff --git a/learning/katas/python/log_elements.py b/learning/katas/python/log_elements.py deleted file mode 100644 index 4477256da7d9..000000000000 --- a/learning/katas/python/log_elements.py +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import apache_beam as beam - - -class LogElements(beam.PTransform): - - class _LoggingFn(beam.DoFn): - - def __init__(self, prefix='', with_timestamp=False, with_window=False): - super().__init__() - self.prefix = prefix - self.with_timestamp = with_timestamp - self.with_window = with_window - - def process(self, element, timestamp=beam.DoFn.TimestampParam, - window=beam.DoFn.WindowParam, **kwargs): - log_line = self.prefix + str(element) - - if self.with_timestamp: - log_line += ', timestamp=' + repr(timestamp.to_rfc3339()) - - if self.with_window: - log_line += ', window(start=' + window.start.to_rfc3339() - log_line += ', end=' + window.end.to_rfc3339() + ')' - - print(log_line) - yield element - - def __init__(self, label=None, prefix='', - with_timestamp=False, with_window=False): - super().__init__(label) - self.prefix = prefix - self.with_timestamp = with_timestamp - self.with_window = with_window - - def expand(self, input): - input | beam.ParDo( - self._LoggingFn(self.prefix, self.with_timestamp, - self.with_window)) diff --git a/learning/tour-of-beam/backend/README.md b/learning/tour-of-beam/backend/README.md index 61b0a93be338..6432c854b6fa 100644 --- a/learning/tour-of-beam/backend/README.md +++ b/learning/tour-of-beam/backend/README.md @@ -19,12 +19,25 @@ and currently logged-in user's snippets and progress. Currently it supports Java, Python, and Go Beam SDK. It is comprised of several Cloud Functions, with Firerstore in Datastore mode as a storage. 
-* get-sdk-list -* get-content-tree?sdk=(java|go|python) -* get-unit-content?unitId= -TODO: add response schemas -TODO: add save functions info -TODO: add user token info +Public endpoints: + +* getSdkList +* getContentTree?sdk=(java|go|python) +* getUnitContent?sdk=&id= + +Authorized endpoints also consume `Authorization: Bearer <token>` header + +* getUserProgress?sdk= +* postUnitComplete?sdk=&id= + +### Playground GRPC API + +We use Playground GRPC to save/get user snippets, so we keep the generated stubs in [playground_api](playground_api) +To re-generate: +``` +$ go generate -x ./... +``` + ### Datastore schema @@ -55,37 +68,45 @@ __Kinds__ parentKey: parent module/group key +- tb_user + + key: `uid` from IDToken + +- tb_user_progress + + key: `_` + + parentKey: tb_user entity key + ### Deployment Prerequisites: - - GCP project with enabled Billing API & Cloud Functions API + - GCP project with enabled + * Billing API + * Cloud Functions API + * Firebase Admin API - set environment variables: * PROJECT_ID: GCP id * REGION: the region, "us-central1" fe - existing setup of Playground backend in a project -1. Deploy Datastore indexes +1. Deploy Datastore indexes (but don't delete existing Playground indexes!) ``` gcloud datastore indexes create ./internal/storage/index.yaml ``` 2.
Deploy cloud functions ``` -$ gcloud functions deploy getSdkList --entry-point getSdkList \ - --region $REGION --runtime go116 --allow-unauthenticated \ - --trigger-http --set-env-vars="DATASTORE_PROJECT_ID=$PROJECT_ID" - -$ gcloud functions deploy getContentTree --entry-point getContentTree \ +for endpoint in getSdkList getContentTree getUnitContent getUserProgress postUnitComplete; do +gcloud functions deploy $endpoint --entry-point $endpoint \ --region $REGION --runtime go116 --allow-unauthenticated \ - --trigger-http --set-env-vars="DATASTORE_PROJECT_ID=$PROJECT_ID" + --trigger-http --set-env-vars="DATASTORE_PROJECT_ID=$PROJECT_ID,GOOGLE_PROJECT_ID=$PROJECT_ID"; done -$ gcloud functions deploy getUnitContent --entry-point getUnitContent \ - --region $REGION --runtime go116 --allow-unauthenticated \ - --trigger-http --set-env-vars="DATASTORE_PROJECT_ID=$PROJECT_ID" ``` 3. Set environment variables: - TOB_MOCK: set to 1 to deliver mock responses from samples/api - DATASTORE_PROJECT_ID: Google Cloud PROJECT_ID +- GOOGLE_PROJECT_ID: Google Cloud PROJECT_ID (consumed by Firebase Admin SDK) - GOOGLE_APPLICATION_CREDENTIALS: path to json auth key - TOB_LEARNING_PATH: path to the content tree root @@ -94,23 +115,36 @@ $ gcloud functions deploy getUnitContent --entry-point getUnitContent \ $ go run cmd/ci_cd/ci_cd.go ``` -### Sample usage +## Sample usage Entry point: list sdk names ``` -$ curl -X GET https://$REGION-$PROJECT_ID.cloudfunctions.net/getSdkList | json_pp +$ curl -X GET "https://$REGION-$PROJECT_ID.cloudfunctions.net/getSdkList" | json_pp ``` [response](./samples/api/get_sdk_list.json) -Get content tree by sdk name (SDK name == SDK id) +### Get content tree by sdk name (SDK name == SDK id) ``` -$ curl -X GET 'https://$REGION-$PROJECT_ID.cloudfunctions.net/getContentTree?sdk=python' +$ curl -X GET "https://$REGION-$PROJECT_ID.cloudfunctions.net/getContentTree?sdk=python" ``` [response](./samples/api/get_content_tree.json) -Get unit content tree by sdk name and 
unitId +### Get unit content by sdk name and unitId ``` -$ curl -X GET 'https://$REGION-$PROJECT_ID.cloudfunctions.net/getContentTree?sdk=python&id=challenge1' +$ curl -X GET "https://$REGION-$PROJECT_ID.cloudfunctions.net/getUnitContent?sdk=python&id=challenge1" ``` [response](./samples/api/get_unit_content.json) + +### Set unit as complete +``` +$ curl -X POST -H "Authorization: Bearer $token" \ + "https://$REGION-$PROJECT_ID.cloudfunctions.net/postUnitComplete?sdk=python&id=challenge1" -d '{}' +``` + +### Get user progress by sdk name +``` +$ curl -X GET -H "Authorization: Bearer $token" \ + "https://$REGION-$PROJECT_ID.cloudfunctions.net/getUserProgress?sdk=python" +``` +[response](./samples/api/get_user_progress.json) diff --git a/learning/tour-of-beam/backend/auth.go b/learning/tour-of-beam/backend/auth.go new file mode 100644 index 000000000000..e307c00fe293 --- /dev/null +++ b/learning/tour-of-beam/backend/auth.go @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package tob + +import ( + "context" + "log" + "net/http" + "strings" + + tob "beam.apache.org/learning/tour-of-beam/backend/internal" + "beam.apache.org/learning/tour-of-beam/backend/internal/storage" + firebase "firebase.google.com/go/v4" +) + +// HandleFunc enriched with sdk and authenticated user uid. +type HandlerFuncAuthWithSdk func(w http.ResponseWriter, r *http.Request, sdk tob.Sdk, uid string) + +const BEARER_SCHEMA = "Bearer " + +type Authorizer struct { + fbApp *firebase.App + repo storage.Iface +} + +func MakeAuthorizer(ctx context.Context, repo storage.Iface) *Authorizer { + // setup authorizer + // consumes: + // GOOGLE_PROJECT_ID + // GOOGLE_APPLICATION_CREDENTIALS + // OR + // FIREBASE_AUTH_EMULATOR_HOST + fbApp, err := firebase.NewApp(ctx, nil) + if err != nil { + log.Fatalf("error initializing firebase: %v", err) + } + return &Authorizer{fbApp, repo} +} + +// middleware to parse authorization header, verify the ID token and extract uid. +func (a *Authorizer) ParseAuthHeader(next HandlerFuncAuthWithSdk) HandlerFuncWithSdk { + return func(w http.ResponseWriter, r *http.Request, sdk tob.Sdk) { + ctx := r.Context() + header := r.Header.Get("authorization") // returns "" if no header + if !strings.HasPrefix(header, BEARER_SCHEMA) { + log.Printf("Bad authorization header") + finalizeErrResponse(w, http.StatusUnauthorized, UNAUTHORIZED, "bad auth header") + return + } + + client, err := a.fbApp.Auth(ctx) + if err != nil { + log.Println("Failed to get auth client:", err) + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "auth client failed") + return + } + + tokenEncoded := header[len(BEARER_SCHEMA):] + token, err := client.VerifyIDTokenAndCheckRevoked(ctx, tokenEncoded) + if err != nil { + log.Println("Failed to verify token:", err) + finalizeErrResponse(w, http.StatusUnauthorized, UNAUTHORIZED, "failed to verify token") + return + } + + uid := token.UID + // store in tb_user + // TODO: implement IDToken caching in tb_user to 
optimize calls to Firebase API + if err = a.repo.SaveUser(ctx, uid); err != nil { + log.Println("Failed to store user info:", err) + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "failed to store user") + return + } + + next(w, r, sdk, uid) + } +} diff --git a/learning/tour-of-beam/backend/docker-compose.yml b/learning/tour-of-beam/backend/docker-compose.yml index 67a289f1ac3f..50823ab1bfef 100644 --- a/learning/tour-of-beam/backend/docker-compose.yml +++ b/learning/tour-of-beam/backend/docker-compose.yml @@ -17,12 +17,32 @@ version: "3" services: datastore: - build: internal/storage/image + build: integration_tests/emulators/datastore volumes: - ${DATASTORE_EMULATOR_DATADIR}:/opt/data environment: - - DATASTORE_PROJECT_ID=project-test + - DATASTORE_PROJECT_ID - DATASTORE_LISTEN_ADDRESS=0.0.0.0:8081 ports: - "8081:8081" command: --consistency=1.0 --store-on-disk + + firebase_auth: + build: integration_tests/emulators/firebase + environment: + - PROJECT_ID=${GOOGLE_CLOUD_PROJECT} + ports: + - "9099:9099" + + playground-router: + image: apache/beam_playground-backend-router + environment: + - GOOGLE_CLOUD_PROJECT + - DATASTORE_EMULATOR_HOST=datastore:8081 + - CACHE_TYPE=local + - SDK_CONFIG=/opt/playground/backend/sdks-emulator.yaml + - PROTOCOL_TYPE=TCP + ports: + - "8000:8080" + depends_on: + - datastore diff --git a/learning/tour-of-beam/backend/function.go b/learning/tour-of-beam/backend/function.go index 363c1585b928..2138ba19da0a 100644 --- a/learning/tour-of-beam/backend/function.go +++ b/learning/tour-of-beam/backend/function.go @@ -15,11 +15,17 @@ // specific language governing permissions and limitations // under the License. 
+//go:generate protoc -I ../../../playground/api/v1 --go_out=playground_api --go_opt=paths=source_relative api.proto +//go:generate protoc -I ../../../playground/api/v1 --go-grpc_out=playground_api --go-grpc_opt=paths=source_relative api.proto +//go:generate moq -rm -out playground_api/mock.go playground_api PlaygroundServiceClient + package tob import ( "context" "encoding/json" + "errors" + "fmt" "log" "net/http" "os" @@ -27,14 +33,18 @@ import ( tob "beam.apache.org/learning/tour-of-beam/backend/internal" "beam.apache.org/learning/tour-of-beam/backend/internal/service" "beam.apache.org/learning/tour-of-beam/backend/internal/storage" + pb "beam.apache.org/learning/tour-of-beam/backend/playground_api" "cloud.google.com/go/datastore" "github.com/GoogleCloudPlatform/functions-framework-go/functions" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + grpc_status "google.golang.org/grpc/status" ) -const ( - BAD_FORMAT = "BAD_FORMAT" - INTERNAL_ERROR = "INTERNAL_ERROR" - NOT_FOUND = "NOT_FOUND" +var ( + svc service.IContent + auth *Authorizer + pgClient pb.PlaygroundServiceClient ) // Helper to format http error messages. @@ -45,31 +55,66 @@ func finalizeErrResponse(w http.ResponseWriter, status int, code, message string _ = json.NewEncoder(w).Encode(resp) } -var svc service.IContent - -func init() { +func MakeRepo(ctx context.Context) storage.Iface { // dependencies // required: // * TOB_MOCK: respond with static samples // OR + // * GOOGLE_APPLICATION_CREDENTIALS: json file path to cloud credentials // * DATASTORE_PROJECT_ID: cloud project id // optional: // * DATASTORE_EMULATOR_HOST: emulator host/port (ex. 
0.0.0.0:8888) if os.Getenv("TOB_MOCK") > "" { - svc = &service.Mock{} + fmt.Println("Initialize mock storage") + return &storage.Mock{} } else { // consumes DATASTORE_* env variables - client, err := datastore.NewClient(context.Background(), "") + client, err := datastore.NewClient(ctx, "") if err != nil { log.Fatalf("new datastore client: %v", err) } - svc = &service.Svc{Repo: &storage.DatastoreDb{Client: client}} + + return &storage.DatastoreDb{Client: client} + } +} + +func MakePlaygroundClient(ctx context.Context) pb.PlaygroundServiceClient { + // dependencies + // required: + // * TOB_MOCK: use mock implementation + // OR + // * PLAYGROUND_ROUTER_HOST: playground API host/port + if os.Getenv("TOB_MOCK") > "" { + fmt.Println("Using mock playground client") + return pb.GetMockClient() + } else { + host := os.Getenv("PLAYGROUND_ROUTER_HOST") + cc, err := grpc.Dial(host, grpc.WithTransportCredentials(insecure.NewCredentials())) + if err != nil { + log.Fatalf("fail to dial playground: %v", err) + } + return pb.NewPlaygroundServiceClient(cc) } +} + +func init() { + ctx := context.Background() + + repo := MakeRepo(ctx) + pgClient = MakePlaygroundClient(ctx) + svc = &service.Svc{Repo: repo, PgClient: pgClient} + auth = MakeAuthorizer(ctx, repo) + commonGet := Common(http.MethodGet) + commonPost := Common(http.MethodPost) // functions framework - functions.HTTP("getSdkList", Common(getSdkList)) - functions.HTTP("getContentTree", Common(ParseSdkParam(getContentTree))) - functions.HTTP("getUnitContent", Common(ParseSdkParam(getUnitContent))) + functions.HTTP("getSdkList", commonGet(getSdkList)) + functions.HTTP("getContentTree", commonGet(ParseSdkParam(getContentTree))) + functions.HTTP("getUnitContent", commonGet(ParseSdkParam(getUnitContent))) + + functions.HTTP("getUserProgress", commonGet(ParseSdkParam(auth.ParseAuthHeader(getUserProgress)))) + functions.HTTP("postUnitComplete", commonPost(ParseSdkParam(auth.ParseAuthHeader(postUnitComplete)))) + 
functions.HTTP("postUserCode", commonPost(ParseSdkParam(auth.ParseAuthHeader(postUserCode)))) } // Get list of SDK names @@ -85,12 +130,10 @@ func getSdkList(w http.ResponseWriter, r *http.Request) { } } -// Get the content tree for a given SDK and user -// Merges info from the default tree and per-user information: -// user code snippets and progress +// Get the content tree for a given SDK // Required to be wrapped into ParseSdkParam middleware. func getContentTree(w http.ResponseWriter, r *http.Request, sdk tob.Sdk) { - tree, err := svc.GetContentTree(r.Context(), sdk, nil /*TODO userId*/) + tree, err := svc.GetContentTree(r.Context(), sdk) if err != nil { log.Println("Get content tree error:", err) finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "storage error") @@ -112,8 +155,8 @@ func getContentTree(w http.ResponseWriter, r *http.Request, sdk tob.Sdk) { func getUnitContent(w http.ResponseWriter, r *http.Request, sdk tob.Sdk) { unitId := r.URL.Query().Get("id") - unit, err := svc.GetUnitContent(r.Context(), sdk, unitId, nil /*TODO userId*/) - if err == service.ErrNoUnit { + unit, err := svc.GetUnitContent(r.Context(), sdk, unitId) + if errors.Is(err, tob.ErrNoUnit) { finalizeErrResponse(w, http.StatusNotFound, NOT_FOUND, "unit not found") return } @@ -130,3 +173,61 @@ func getUnitContent(w http.ResponseWriter, r *http.Request, sdk tob.Sdk) { return } } + +// Get user progress +func getUserProgress(w http.ResponseWriter, r *http.Request, sdk tob.Sdk, uid string) { + progress, err := svc.GetUserProgress(r.Context(), sdk, uid) + + if err != nil { + log.Println("Get user progress error:", err) + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "storage error") + return + } + + err = json.NewEncoder(w).Encode(progress) + if err != nil { + log.Println("Format user progress error:", err) + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "format user progress content") + return + } +} + +// Mark unit 
completed +func postUnitComplete(w http.ResponseWriter, r *http.Request, sdk tob.Sdk, uid string) { + unitId := r.URL.Query().Get("id") + + err := svc.SetUnitComplete(r.Context(), sdk, unitId, uid) + if err != nil { + log.Println("Set unit complete error:", err) + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, "storage error") + return + } + + fmt.Fprint(w, "{}") +} + +// Save user code for unit +func postUserCode(w http.ResponseWriter, r *http.Request, sdk tob.Sdk, uid string) { + unitId := r.URL.Query().Get("id") + + var userCodeRequest tob.UserCodeRequest + err := json.NewDecoder(r.Body).Decode(&userCodeRequest) + if err != nil { + log.Println("body decode error:", err) + finalizeErrResponse(w, http.StatusBadRequest, BAD_FORMAT, "bad request body") + return + } + + err = svc.SaveUserCode(r.Context(), sdk, unitId, uid, userCodeRequest) + if err != nil { + log.Println("Save user code error:", err) + message := "storage error" + if st, ok := grpc_status.FromError(err); ok { + message = fmt.Sprintf("playground api error: %s", st) + } + finalizeErrResponse(w, http.StatusInternalServerError, INTERNAL_ERROR, message) + return + } + + fmt.Fprint(w, "{}") +} diff --git a/learning/tour-of-beam/backend/go.mod b/learning/tour-of-beam/backend/go.mod index 6601abeee276..d796f60bc9d4 100644 --- a/learning/tour-of-beam/backend/go.mod +++ b/learning/tour-of-beam/backend/go.mod @@ -24,5 +24,9 @@ require ( require ( cloud.google.com/go/datastore v1.8.0 + cloud.google.com/go/firestore v1.7.0 // indirect + firebase.google.com/go/v4 v4.9.0 github.com/stretchr/testify v1.8.0 + google.golang.org/grpc v1.49.0 + google.golang.org/protobuf v1.28.1 ) diff --git a/learning/tour-of-beam/backend/go.sum b/learning/tour-of-beam/backend/go.sum index 45dd6a5e1909..ba9fb37df5d1 100644 --- a/learning/tour-of-beam/backend/go.sum +++ b/learning/tour-of-beam/backend/go.sum @@ -27,38 +27,104 @@ cloud.google.com/go v0.94.1/go.mod h1:qAlAugsXlC+JWO+Bke5vCtc9ONxjQT3drlTTnAplMW 
cloud.google.com/go v0.97.0/go.mod h1:GF7l59pYBVlXQIBLx3a761cZ41F9bBH3JUlihCt2Udc= cloud.google.com/go v0.99.0/go.mod h1:w0Xx2nLzqWJPuozYQX+hFfCSI8WioryfRDzkoI/Y2ZA= cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w99A= -cloud.google.com/go v0.102.1 h1:vpK6iQWv/2uUeFJth4/cBHsQAGjn1iIE6AAlxipRaA0= +cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= +cloud.google.com/go v0.104.0 h1:gSmWO7DY1vOm0MVU6DNXM11BWHHsTUmsC5cv1fuW5X8= +cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= +cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= +cloud.google.com/go/area120 v0.5.0/go.mod h1:DE/n4mp+iqVyvxHN41Vf1CR602GiHQjFPusMFW6bGR4= +cloud.google.com/go/artifactregistry v1.6.0/go.mod h1:IYt0oBPSAGYj/kprzsBjZ/4LnG/zOcHyFHjWPCi6SAQ= +cloud.google.com/go/asset v1.5.0/go.mod h1:5mfs8UvcM5wHhqtSv8J1CtxxaQq3AdBxxQi2jGW/K4o= +cloud.google.com/go/assuredworkloads v1.5.0/go.mod h1:n8HOZ6pff6re5KYfBXcFvSViQjDwxFkAkmUFffJRbbY= +cloud.google.com/go/automl v1.5.0/go.mod h1:34EjfoFGMZ5sgJ9EoLsRtdPSNZLcfflJR39VbVNS2M0= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= +cloud.google.com/go/billing v1.4.0/go.mod h1:g9IdKBEFlItS8bTtlrZdVLWSSdSyFUZKXNS02zKMOZY= +cloud.google.com/go/binaryauthorization v1.1.0/go.mod h1:xwnoWu3Y84jbuHa0zd526MJYmtnVXn0syOjaJgy4+dM= +cloud.google.com/go/cloudtasks 
v1.5.0/go.mod h1:fD92REy1x5woxkKEkLdvavGnPJGEn8Uic9nWuLzqCpY= cloud.google.com/go/compute v0.1.0/go.mod h1:GAesmwr110a34z04OlxYkATPBEfVhkymfTBXtfbBFow= cloud.google.com/go/compute v1.3.0/go.mod h1:cCZiE1NHEtai4wiufUhW8I8S1JKkAnhnQJWM7YD99wM= cloud.google.com/go/compute v1.5.0/go.mod h1:9SMHyhJlzhlkJqrPAc839t2BZFTSk6Jdj6mkzQJeu0M= cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz/FMzPu0s= -cloud.google.com/go/compute v1.6.1 h1:2sMmt8prCn7DPaG4Pmh0N3Inmc8cT8ae5k1M6VJ9Wqc= cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= +cloud.google.com/go/compute v1.7.0 h1:v/k9Eueb8aAJ0vZuxKMrgm6kPhCLZU9HxFU+AFDs9Uk= +cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= +cloud.google.com/go/datacatalog v1.5.0/go.mod h1:M7GPLNQeLfWqeIm3iuiruhPzkt65+Bx8dAKvScX8jvs= +cloud.google.com/go/dataflow v0.6.0/go.mod h1:9QwV89cGoxjjSR9/r7eFDqqjtvbKxAK2BaYU6PVk9UM= +cloud.google.com/go/dataform v0.3.0/go.mod h1:cj8uNliRlHpa6L3yVhDOBrUXH+BPAO1+KFMQQNSThKo= +cloud.google.com/go/datalabeling v0.5.0/go.mod h1:TGcJ0G2NzcsXSE/97yWjIZO0bXj0KbVlINXMG9ud42I= +cloud.google.com/go/dataqna v0.5.0/go.mod h1:90Hyk596ft3zUQ8NkFfvICSIfHFh1Bc7C4cK3vbhkeo= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/datastore v1.8.0 h1:2qo2G7hABSeqswa+5Ga3+QB8/ZwKOJmDsCISM9scmsU= cloud.google.com/go/datastore v1.8.0/go.mod h1:q1CpHVByTlXppdqTcu4LIhCsTn3fhtZ5R7+TajciO+M= -cloud.google.com/go/functions v1.0.0 h1:cOFEDJ3sgAFRjRULSUJ0Q8cw9qFa5JdpXIBWoNX5uDw= +cloud.google.com/go/datastream v1.2.0/go.mod h1:i/uTP8/fZwgATHS/XFu0TcNUhuA0twZxxQ3EyCUQMwo= +cloud.google.com/go/dialogflow v1.15.0/go.mod h1:HbHDWs33WOGJgn6rfzBW1Kv807BE3O1+xGbn59zZWI4= +cloud.google.com/go/documentai v1.7.0/go.mod h1:lJvftZB5NRiFSX4moiye1SMxHx0Bc3x1+p9e/RfXYiU= +cloud.google.com/go/domains 
v0.6.0/go.mod h1:T9Rz3GasrpYk6mEGHh4rymIhjlnIuB4ofT1wTxDeT4Y= +cloud.google.com/go/edgecontainer v0.1.0/go.mod h1:WgkZ9tp10bFxqO8BLPqv2LlfmQF1X8lZqwW4r1BTajk= +cloud.google.com/go/firestore v1.6.1/go.mod h1:asNXNOzBdyVQmEU+ggO8UPodTkEVFW5Qx+rwHnAz+EY= +cloud.google.com/go/firestore v1.7.0 h1:cNkQyruzd5v7FjmL6eeDqwqgX+FbPCjbHxz7vsMhGoo= +cloud.google.com/go/firestore v1.7.0/go.mod h1:0b8DxQkXhbg/PmsjhCUAg4EExIuifAvbHj5Z/iX3BYI= cloud.google.com/go/functions v1.0.0/go.mod h1:O9KS8UweFVo6GbbbCBKh5yEzbW08PVkg2spe3RfPMd4= +cloud.google.com/go/functions v1.6.0 h1:Oveqoadoi2f+yMpJtf1/OrwhTIzaR38l+6Q8/RPyM18= +cloud.google.com/go/functions v1.6.0/go.mod h1:3H1UA3qiIPRWD7PeZKLvHZ9SaQhR26XIJcC0A5GbvAk= +cloud.google.com/go/gaming v1.5.0/go.mod h1:ol7rGcxP/qHTRQE/RO4bxkXq+Fix0j6D4LFPzYTIrDM= +cloud.google.com/go/gkeconnect v0.5.0/go.mod h1:c5lsNAg5EwAy7fkqX/+goqFsU1Da/jQFqArp+wGNr/o= +cloud.google.com/go/gkehub v0.9.0/go.mod h1:WYHN6WG8w9bXU0hqNxt8rm5uxnk8IH+lPY9J2TV7BK0= +cloud.google.com/go/iam v0.3.0 h1:exkAomrVUuzx9kWFI1wm3KI0uoDeUFPB4kKGzx6x+Gc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= +cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= +cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= +cloud.google.com/go/mediatranslation v0.5.0/go.mod h1:jGPUhGTybqsPQn91pNXw0xVHfuJ3leR1wj37oU3y1f4= +cloud.google.com/go/memcache v1.4.0/go.mod h1:rTOfiGZtJX1AaFUrOgsMHX5kAzaTQ8azHiuDoTPzNsE= +cloud.google.com/go/metastore v1.5.0/go.mod h1:2ZNrDcQwghfdtCwJ33nM0+GrBGlVuh8rakL3vdPY3XY= +cloud.google.com/go/networkconnectivity v1.4.0/go.mod h1:nOl7YL8odKyAOtzNX73/M5/mGZgqqMeryi6UPZTk/rA= +cloud.google.com/go/networksecurity v0.5.0/go.mod h1:xS6fOCoqpVC5zx15Z/MqkfDwH4+m/61A3ODiDV1xmiQ= +cloud.google.com/go/notebooks v1.2.0/go.mod h1:9+wtppMfVPUeJ8fIWPOq1UnATHISkGXGqTkxeieQ6UY= +cloud.google.com/go/osconfig v1.7.0/go.mod 
h1:oVHeCeZELfJP7XLxcBGTMBvRO+1nQ5tFG9VQTmYS2Fs= +cloud.google.com/go/oslogin v1.4.0/go.mod h1:YdgMXWRaElXz/lDk1Na6Fh5orF7gvmJ0FGLIs9LId4E= +cloud.google.com/go/phishingprotection v0.5.0/go.mod h1:Y3HZknsK9bc9dMi+oE8Bim0lczMU6hrX0UpADuMefr0= +cloud.google.com/go/privatecatalog v0.5.0/go.mod h1:XgosMUvvPyxDjAVNDYxJ7wBW8//hLDDYmnsNcMGq1K0= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/recaptchaenterprise v1.3.1/go.mod h1:OdD+q+y4XGeAlxRaMn1Y7/GveP6zmq76byL6tjPE7d4= +cloud.google.com/go/recaptchaenterprise/v2 v2.1.0/go.mod h1:w9yVqajwroDNTfGuhmOjPDN//rZGySaf6PtFVcSCa7o= +cloud.google.com/go/recommendationengine v0.5.0/go.mod h1:E5756pJcVFeVgaQv3WNpImkFP8a+RptV6dDLGPILjvg= +cloud.google.com/go/recommender v1.5.0/go.mod h1:jdoeiBIVrJe9gQjwd759ecLJbxCDED4A6p+mqoqDvTg= +cloud.google.com/go/redis v1.7.0/go.mod h1:V3x5Jq1jzUcg+UNsRvdmsfuFnit1cfe3Z/PGyq/lm4Y= +cloud.google.com/go/retail v1.8.0/go.mod h1:QblKS8waDmNUhghY2TI9O3JLlFk8jybHeV4BF19FrE4= +cloud.google.com/go/scheduler v1.4.0/go.mod h1:drcJBmxF3aqZJRhmkHQ9b3uSSpQoltBPGPxGAWROx6s= +cloud.google.com/go/secretmanager v1.6.0/go.mod h1:awVa/OXF6IiyaU1wQ34inzQNc4ISIDIrId8qE5QGgKA= +cloud.google.com/go/security v1.5.0/go.mod h1:lgxGdyOKKjHL4YG3/YwIL2zLqMFCKs0UbQwgyZmfJl4= +cloud.google.com/go/security v1.7.0/go.mod h1:mZklORHl6Bg7CNnnjLH//0UlAlaXqiG7Lb9PsPXLfD0= +cloud.google.com/go/securitycenter v1.13.0/go.mod h1:cv5qNAqjY84FCN6Y9z28WlkKXyWsgLO832YiWwkCWcU= +cloud.google.com/go/servicedirectory v1.4.0/go.mod h1:gH1MUaZCgtP7qQiI+F+A+OpeKF/HQWgtAddhTbhL2bs= +cloud.google.com/go/speech v1.6.0/go.mod h1:79tcr4FHCimOp56lwC01xnt/WPJZc4v3gzyT7FoBkCM= cloud.google.com/go/storage v1.0.0/go.mod 
h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= cloud.google.com/go/storage v1.22.1/go.mod h1:S8N1cAStu7BOeFfE8KAQzmyyLkK8p/vmRq6kuBTW58Y= +cloud.google.com/go/storage v1.23.0/go.mod h1:vOEEDNFnciUMhBeT6hsJIn3ieU5cFRmzeLgDvXzfIXc= +cloud.google.com/go/storage v1.26.0 h1:lYAGjknyDJirSzfwUlkv4Nsnj7od7foxQNH/fqZqles= +cloud.google.com/go/storage v1.26.0/go.mod h1:mk/N7YwIKEWyTvXAWQCIeiCTdLoRH6Pd5xmSnolQLTI= +cloud.google.com/go/talent v1.1.0/go.mod h1:Vl4pt9jiHKvOgF9KoZo6Kob9oV4lwd/ZD5Cto54zDRw= +cloud.google.com/go/videointelligence v1.6.0/go.mod h1:w0DIDlVRKtwPCn/C4iwZIJdvC69yInhW0cfi+p546uU= +cloud.google.com/go/vision v1.2.0/go.mod h1:SmNwgObm5DpFBme2xpyOyasvBc1aPdjvMk2bBk0tKD0= +cloud.google.com/go/vision/v2 v2.2.0/go.mod h1:uCdV4PpN1S0jyCyq8sIM42v2Y6zOLkZs+4R9LrGYwFo= +cloud.google.com/go/webrisk v1.4.0/go.mod h1:Hn8X6Zr+ziE2aNd8SliSDWpEnSS1u4R9+xXZmFiHmGE= +cloud.google.com/go/workflows v1.6.0/go.mod h1:6t9F5h/unJz41YqfBmqSASJSXccBLtD1Vwf+KmJENM0= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +firebase.google.com/go/v4 v4.9.0 h1:VCagv+hYOxUGeuyu7J+o2rKJkDp5JQBbA3Bzlof+LMk= +firebase.google.com/go/v4 v4.9.0/go.mod h1:bHhRkM3VtGJx19rQdW7GDNLdnA8/T6SsnN5nXk/xdw8= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/GoogleCloudPlatform/functions-framework-go v1.5.3 h1:Xx8uWT4hjgbjuXexbpU6V0yawWOdrbcAzZVyMYJvX8Q= @@ -152,9 +218,11 @@ github.com/google/go-cmp v0.5.7/go.mod 
h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8 github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= github.com/google/martian/v3 v3.1.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= +github.com/google/martian/v3 v3.2.1 h1:d8MncMlErDFTwQGBK1xhv026j9kqhvw1Qv9IbWT1VLQ= github.com/google/martian/v3 v3.2.1/go.mod h1:oBOf6HBosgwRXnUGWUB05QECsc6uvmMiJ3+6W4l/CUk= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= @@ -172,18 +240,21 @@ github.com/google/pprof v0.0.0-20210609004039-a478d1d731e9/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/googleapis/enterprise-certificate-proxy v0.0.0-20220520183353-fd19c99a87aa h1:7MYGT2XEMam7Mtzv1yDUYXANedWvwk3HKkR3MyGowy8= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy 
v0.0.0-20220520183353-fd19c99a87aa/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= +github.com/googleapis/enterprise-certificate-proxy v0.1.0 h1:zO8WHNx/MYiAKJ3d5spxZXZE6KHmIQGQcAzwUzV7qQw= +github.com/googleapis/enterprise-certificate-proxy v0.1.0/go.mod h1:17drOmN3MwGY7t0e+Ei9b45FFGA3fBs3x36SsCg1hq8= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/googleapis/gax-go/v2 v2.1.0/go.mod h1:Q3nei7sK6ybPYH7twZdmQpAd1MKb7pfu6SK+H1/DsU0= github.com/googleapis/gax-go/v2 v2.1.1/go.mod h1:hddJymUZASv3XPyGkUpKj8pPO47Rmb0eJc8R6ouapiM= github.com/googleapis/gax-go/v2 v2.2.0/go.mod h1:as02EH8zWkzwUoLbBaFeQ+arQaj/OthfcblKl4IGNaM= github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99EXz9pXxye9YM= -github.com/googleapis/gax-go/v2 v2.4.0 h1:dS9eYAjhrE2RjmzYw2XAPvcXfmcQLtFEQWn0CR82awk= github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= +github.com/googleapis/gax-go/v2 v2.5.1 h1:kBRZU0PSuI7PspsSb/ChWoVResUcwNVIdpB049pKTiw= +github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= @@ -326,8 +397,12 @@ golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= -golang.org/x/net 
v0.0.0-20220607020251-c690dde0001d h1:4SFsTMi4UahlKoloni7L4eYzhFRifURQLw+yv0QDCx8= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220617184016-355a448f1bc9/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220708220712-1185a9018129/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591 h1:D0B/7al0LLrVC8aWF4+oxpv/m8bc7ViFfVS8/gXGdqI= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -343,12 +418,16 @@ golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20210628180205-a41e5a781914/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210805134026-6f1e6394065a/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20210819190943-2bc19b11175f/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= +golang.org/x/oauth2 v0.0.0-20211005180243-6b3c2da341f1/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220411215720-9780585627b5/go.mod 
h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= -golang.org/x/oauth2 v0.0.0-20220608161450-d0670ef3b1eb h1:8tDJ3aechhddbdPAxpycgXHJRMLpk/Ab+aa4OgdN5/g= golang.org/x/oauth2 v0.0.0-20220608161450-d0670ef3b1eb/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= +golang.org/x/oauth2 v0.0.0-20220622183110-fd043fe589d2/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= +golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -360,6 +439,7 @@ golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f h1:Ax0t5p6N38Ga0dThY21weqDEyz2oklo4IvDkpigvkD8= golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -407,6 +487,7 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys 
v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211124211545-fe61309f8881/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211210111614-af8b64212486/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -416,9 +497,13 @@ golang.org/x/sys v0.0.0-20220227234510-4e6760a101f9/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220328115105-d36c6a25d886/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220502124256-b6088ccd6cba/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220610221304-9f5ed59c137d h1:Zu/JngovGLVi6t2J3nmAf3AoTDwuzw85YZ3b9o4yU7s= golang.org/x/sys v0.0.0-20220610221304-9f5ed59c137d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220615213510-4f61da869c0c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220624220833-87e55d714810/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 h1:WIoqL4EROvwiPdUtaip4VcDdpZ4kha7wBWZrbVKCIZg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -434,6 +519,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20220609170525-579cf78fd858 h1:Dpdu/EMxGMFgq0CeYMh4fazTD2vtlZRYE7wyynxJb9U= +golang.org/x/time v0.0.0-20220609170525-579cf78fd858/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -490,6 +577,7 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220411194840-2f41105eb62f/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= golang.org/x/xerrors v0.0.0-20220609144429-65e65417b02f h1:uF6paiQQebLeSXkrTqHqz0MXhXXS1KgF41eUdBNvxK0= 
golang.org/x/xerrors v0.0.0-20220609144429-65e65417b02f/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= @@ -521,6 +609,7 @@ google.golang.org/api v0.54.0/go.mod h1:7C4bFFOvVDGXjfDTAsgGwDgAxRDeQ4X8NvUedIt6 google.golang.org/api v0.55.0/go.mod h1:38yMfeP1kfjsl8isn0tliTjIb1rJXcQi4UXlbqivdVE= google.golang.org/api v0.56.0/go.mod h1:38yMfeP1kfjsl8isn0tliTjIb1rJXcQi4UXlbqivdVE= google.golang.org/api v0.57.0/go.mod h1:dVPlbZyBo2/OjBpmvNdpn2GRm6rPy75jyU7bmhdrMgI= +google.golang.org/api v0.59.0/go.mod h1:sT2boj7M9YJxZzgeZqXogmhfmRWDtPzT31xkieUbuZU= google.golang.org/api v0.61.0/go.mod h1:xQRti5UdCmoCEqFxcz93fTl338AVqDgyaDRuOZ3hg9I= google.golang.org/api v0.63.0/go.mod h1:gs4ij2ffTRXwuzzgJl/56BdwJaA194ijkfn++9tDuPo= google.golang.org/api v0.67.0/go.mod h1:ShHKP8E60yPsKNw/w8w+VYaj9H6buA5UqDp8dhbQZ6g= @@ -528,9 +617,17 @@ google.golang.org/api v0.70.0/go.mod h1:Bs4ZM2HGifEvXwd50TtW70ovgJffJYw2oRCOFU/S google.golang.org/api v0.71.0/go.mod h1:4PyU6e6JogV1f9eA4voyrTY2batOLdgZ5qZ5HOCc4j8= google.golang.org/api v0.74.0/go.mod h1:ZpfMZOVRMywNyvJFeqL9HRWBgAuRfSjJFpe9QtRRyDs= google.golang.org/api v0.75.0/go.mod h1:pU9QmyHLnzlpar1Mjt4IbapUCy8J+6HD6GeELN69ljA= +google.golang.org/api v0.77.0/go.mod h1:pU9QmyHLnzlpar1Mjt4IbapUCy8J+6HD6GeELN69ljA= google.golang.org/api v0.78.0/go.mod h1:1Sg78yoMLOhlQTeF+ARBoytAcH1NNyyl390YMy6rKmw= -google.golang.org/api v0.84.0 h1:NMB9J4cCxs9xEm+1Z9QiO3eFvn7EnQj3Eo3hN6ugVlg= +google.golang.org/api v0.80.0/go.mod h1:xY3nI94gbvBrE0J6NHXhxOmW97HG7Khjkku6AFB3Hyg= google.golang.org/api v0.84.0/go.mod h1:NTsGnUFJMYROtiquksZHBWtHfeMC7iYthki7Eq3pa8o= +google.golang.org/api v0.85.0/go.mod h1:AqZf8Ep9uZ2pyTvgL+x0D3Zt0eoT9b5E8fmzfu6FO2g= +google.golang.org/api v0.90.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.93.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.94.0/go.mod 
h1:eADj+UBuxkh5zlrSntJghuNeg8HwQ1w5lTKkuqaETEI= +google.golang.org/api v0.95.0/go.mod h1:eADj+UBuxkh5zlrSntJghuNeg8HwQ1w5lTKkuqaETEI= +google.golang.org/api v0.96.0 h1:F60cuQPJq7K7FzsxMYHAUJSiXh2oKctHxBMbDygxhfM= +google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -539,6 +636,8 @@ google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCID google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c= google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/appengine/v2 v2.0.2 h1:MSqyWy2shDLwG7chbwBJ5uMyw6SNqJzhJHNDwYB0Akk= +google.golang.org/appengine/v2 v2.0.2/go.mod h1:PkgRUWz4o1XOvbqtWTkBtCitEJ5Tp4HoVEdMMYQR/8E= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= @@ -597,6 +696,8 @@ google.golang.org/genproto v0.0.0-20210903162649-d08c68adba83/go.mod h1:eFjDcFEc google.golang.org/genproto v0.0.0-20210909211513-a8c4777a87af/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= google.golang.org/genproto v0.0.0-20210921142501-181ce0d877f6/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20210924002016-3dee208752a0/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= +google.golang.org/genproto v0.0.0-20211008145708-270636b82663/go.mod 
h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= +google.golang.org/genproto v0.0.0-20211028162531-8db9c33dc351/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20211118181313-81c1377c94b1/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20211206160659-862468c7d6e0/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= google.golang.org/genproto v0.0.0-20211208223120-3a66f561d7aa/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= @@ -613,11 +714,30 @@ google.golang.org/genproto v0.0.0-20220413183235-5e96e2839df9/go.mod h1:8w6bsBMX google.golang.org/genproto v0.0.0-20220414192740-2d67ff6cf2b4/go.mod h1:8w6bsBMX6yCPbAVTeqQHvzxW0EIFigd5lZyahWgyfDo= google.golang.org/genproto v0.0.0-20220421151946-72621c1f0bd3/go.mod h1:8w6bsBMX6yCPbAVTeqQHvzxW0EIFigd5lZyahWgyfDo= google.golang.org/genproto v0.0.0-20220429170224-98d788798c3e/go.mod h1:8w6bsBMX6yCPbAVTeqQHvzxW0EIFigd5lZyahWgyfDo= +google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= google.golang.org/genproto v0.0.0-20220505152158-f39f71e6c8f3/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= google.golang.org/genproto v0.0.0-20220518221133-4f43b3371335/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= +google.golang.org/genproto v0.0.0-20220523171625-347a074981d8/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= google.golang.org/genproto v0.0.0-20220608133413-ed9918b62aac/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= -google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad h1:kqrS+lhvaMHCxul6sKQvKJ8nAAhlVItmZV822hYFH/U= +google.golang.org/genproto v0.0.0-20220616135557-88e70c0c3a90/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= +google.golang.org/genproto v0.0.0-20220624142145-8cd45d7dbd1f/go.mod 
h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= +google.golang.org/genproto v0.0.0-20220628213854-d9e0b6570c03/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= +google.golang.org/genproto v0.0.0-20220722212130-b98a9ff5e252/go.mod h1:GkXuJDJ6aQ7lnJcRF+SJVgFdQhypqgl3LB1C9vabdRE= +google.golang.org/genproto v0.0.0-20220801145646-83ce21fca29f/go.mod h1:iHe1svFLAZg9VWz891+QbRMwUv9O/1Ww+/mngYeThbc= +google.golang.org/genproto v0.0.0-20220810155839-1856144b1d9c/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220815135757-37a418bb8959/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220817144833-d7fd3f11b9b1/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220822174746-9e6da59bd2fc/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220829144015-23454907ede3/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220829175752-36a9c930ecbf/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220913154956-18f8339a66a5/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= +google.golang.org/genproto v0.0.0-20220914142337-ca0e39ece12f/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= +google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= +google.golang.org/genproto v0.0.0-20220916172020-2692e8806bfa/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= +google.golang.org/genproto v0.0.0-20220919141832-68c03719ef51/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= +google.golang.org/genproto v0.0.0-20220920201722-2b89144ce006 h1:mmbq5q8M1t7dhkLw320YK4PsOXm6jdnUAkErImaIqOg= +google.golang.org/genproto v0.0.0-20220920201722-2b89144ce006/go.mod h1:ht8XFiar2npT/g4vkk7O0WYS1sHOHbdujxbEp7CJWbw= google.golang.org/grpc v1.19.0/go.mod 
h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -647,8 +767,11 @@ google.golang.org/grpc v1.40.1/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9K google.golang.org/grpc v1.44.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= google.golang.org/grpc v1.46.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= -google.golang.org/grpc v1.47.0 h1:9n77onPX5F3qfFCqjy9dhn8PbNQsIKeVU04J9G7umt8= +google.golang.org/grpc v1.46.2/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= google.golang.org/grpc v1.47.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= +google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= +google.golang.org/grpc v1.49.0 h1:WTLtQzmQori5FUH25Pq4WT22oCsv8USpQ+F6rqtsmxw= +google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= @@ -663,8 +786,9 @@ google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlba google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= 
+google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= +google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f h1:BLraFXnmrev5lT+xlilqcH8XK9/i0At2xKjWk4p6zsU= diff --git a/learning/tour-of-beam/backend/integration_tests/api.go b/learning/tour-of-beam/backend/integration_tests/api.go index 4bb4f6743656..d8d684ca368f 100644 --- a/learning/tour-of-beam/backend/integration_tests/api.go +++ b/learning/tour-of-beam/backend/integration_tests/api.go @@ -67,3 +67,22 @@ type ErrorResponse struct { Code string `json:"code"` Message string `json:"message,omitempty"` } + +type UnitProgress struct { + Id string `json:"id"` + IsCompleted bool `json:"isCompleted"` + UserSnippetId string `json:"userSnippetId,omitempty"` +} +type SdkProgress struct { + Units []UnitProgress `json:"units"` +} + +type UserCodeFile struct { + Name string `json:"name"` + Content string `json:"content"` + IsMain bool `json:"isMain,omitempty"` +} +type UserCodeRequest struct { + Files []UserCodeFile `json:"files"` + PipelineOptions string `json:"pipelineOptions"` +} diff --git a/learning/tour-of-beam/backend/integration_tests/auth_emulator.go b/learning/tour-of-beam/backend/integration_tests/auth_emulator.go new file mode 100644 index 000000000000..9273153efcd8 --- /dev/null +++ b/learning/tour-of-beam/backend/integration_tests/auth_emulator.go @@ -0,0 +1,134 @@ +//go:build integration +// +build integration + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "bytes" + "encoding/json" + "io" + "log" + "net/http" + "os" + "time" +) + +const ( + TIMEOUT_HTTP = 10 * time.Second + TIMEOUT_STARTUP = 30 * time.Second +) + +type EmulatorClient struct { + host string + client *http.Client +} + +func makeEmulatorCiient() *EmulatorClient { + return &EmulatorClient{ + os.Getenv("FIREBASE_AUTH_EMULATOR_HOST"), + &http.Client{Timeout: TIMEOUT_HTTP}, + } +} + +func (e *EmulatorClient) waitApi() { + terminate := time.NewTimer(TIMEOUT_STARTUP) + tick := time.NewTicker(5 * time.Second) + for { + select { + case <-terminate.C: + log.Fatalf("timeout waiting for emulator") + case <-tick.C: + resp, err := e.do(http.MethodGet, "", nil) + if err != nil { + log.Println("emulator API:", err) + continue + } + parsed := struct { + AuthEmulator struct { + Ready bool `json:"ready"` + } `json:"authEmulator"` + }{} + err = json.Unmarshal(resp, &parsed) + if err != nil { + log.Println("emulator API bad response:", err) + continue + } + if parsed.AuthEmulator.Ready { + return + } + } + } +} + +func (e *EmulatorClient) do(method, endpoint string, jsonBody map[string]string) ([]byte, error) { + url := "http://" + e.host + if endpoint > "" { + url += "/" + endpoint + } + var buf []byte + // handle nil jsonBody as no body + if jsonBody != nil { + buf, _ = json.Marshal(jsonBody) + } + + req, err := http.NewRequest(method, url, bytes.NewBuffer(buf)) + if err != nil { + return nil, err + } + req.Header.Add("content-type", "application/json") + + response, err := e.client.Do(req) + if err != nil { + return nil, 
err + } + + // Close the connection to reuse it + defer response.Body.Close() + // show the response in stdout + tee := io.TeeReader(response.Body, os.Stdout) + defer os.Stdout.WriteString("\n") + + var out []byte + out, err = io.ReadAll(tee) + if err != nil { + return nil, err + } + + return out, nil +} + +// Get valid Firebase ID token +// Simulate Frontend client authorization logic +// Here, we use the simplest possible authorization: email/password +// Firebase Admin SDK lacks methods to create a user and get ID token +func (e *EmulatorClient) getIDToken() string { + // create a user (sign-up with dummy email/password) + endpoint := "identitytoolkit.googleapis.com/v1/accounts:signUp?key=anything_goes" + body := map[string]string{"email": "a@b.c", "password": "1q2w3e"} + resp, err := e.do(http.MethodPost, endpoint, body) + if err != nil { + log.Fatalf("emulator request error: %+v", err) + } + + var parsed struct { + IdToken string `json:"idToken"` + } + err = json.Unmarshal(resp, &parsed) + if err != nil { + log.Fatalf("failed to parse output: %+v", err) + } + + return parsed.IdToken +} diff --git a/learning/tour-of-beam/backend/integration_tests/auth_test.go b/learning/tour-of-beam/backend/integration_tests/auth_test.go new file mode 100644 index 000000000000..abfecd5e4fa6 --- /dev/null +++ b/learning/tour-of-beam/backend/integration_tests/auth_test.go @@ -0,0 +1,114 @@ +//go:build integration +// +build integration + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "flag" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" +) + +var emulator *EmulatorClient + +func TestMain(m *testing.M) { + // to parse go test * flags m.Run consumes + flag.Parse() + + emulator = makeEmulatorCiient() + emulator.waitApi() + + os.Exit(m.Run()) +} + +func TestSaveGetProgress(t *testing.T) { + idToken := emulator.getIDToken() + + t.Run("save_complete", func(t *testing.T) { + port := os.Getenv(PORT_POST_UNIT_COMPLETE) + if port == "" { + t.Fatal(PORT_POST_UNIT_COMPLETE, "env not set") + } + url := "http://localhost:" + port + + err := PostUnitComplete(url, "python", "unit_id_1", idToken) + if err != nil { + t.Fatal(err) + } + }) + t.Run("save_code", func(t *testing.T) { + port := os.Getenv(PORT_POST_USER_CODE) + if port == "" { + t.Fatal(PORT_POST_USER_CODE, "env not set") + } + url := "http://localhost:" + port + req := UserCodeRequest{ + Files: []UserCodeFile{ + {Name: "main.py", Content: "import sys; sys.exit(0)", IsMain: true}, + }, + PipelineOptions: "some opts", + } + + _, err := PostUserCode(url, "python", "unit_id_2", idToken, req) + if err != nil { + t.Fatal(err) + } + }) + t.Run("save_code_fail", func(t *testing.T) { + port := os.Getenv(PORT_POST_USER_CODE) + if port == "" { + t.Fatal(PORT_POST_USER_CODE, "env not set") + } + url := "http://localhost:" + port + req := UserCodeRequest{ + Files: []UserCodeFile{ + // empty content doesn't pass validation + {Name: "main.py", Content: "", IsMain: true}, + }, + PipelineOptions: "some opts", + } + + resp, err := PostUserCode(url, "python", "unit_id_1", idToken, req) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, "INTERNAL_ERROR", resp.Code) + msg := "playground api error" + assert.Equal(t, msg, resp.Message[:len(msg)]) + + }) + t.Run("get", func(t *testing.T) { + port := os.Getenv(PORT_GET_USER_PROGRESS) + if port == "" { + 
t.Fatal(PORT_GET_USER_PROGRESS, "env not set") + } + url := "http://localhost:" + port + + mock_path := filepath.Join("..", "samples", "api", "get_user_progress.json") + var exp SdkProgress + if err := loadJson(mock_path, &exp); err != nil { + t.Fatal(err) + } + + resp, err := GetUserProgress(url, "python", idToken) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, exp, resp) + }) +} diff --git a/learning/tour-of-beam/backend/integration_tests/client.go b/learning/tour-of-beam/backend/integration_tests/client.go index 5d43f454d495..4d4c23d52441 100644 --- a/learning/tour-of-beam/backend/integration_tests/client.go +++ b/learning/tour-of-beam/backend/integration_tests/client.go @@ -13,6 +13,7 @@ package main import ( + "bytes" "encoding/json" "fmt" "io" @@ -39,33 +40,67 @@ func verifyHeaders(header http.Header) error { func GetSdkList(url string) (SdkList, error) { var result SdkList - err := Get(&result, url, nil) + err := Get(&result, url, nil, nil) return result, err } func GetContentTree(url, sdk string) (ContentTree, error) { var result ContentTree - err := Get(&result, url, map[string]string{"sdk": sdk}) + err := Get(&result, url, map[string]string{"sdk": sdk}, nil) return result, err } func GetUnitContent(url, sdk, unitId string) (Unit, error) { var result Unit - err := Get(&result, url, map[string]string{"sdk": sdk, "id": unitId}) + err := Get(&result, url, map[string]string{"sdk": sdk, "id": unitId}, nil) return result, err } +func GetUserProgress(url, sdk, token string) (SdkProgress, error) { + var result SdkProgress + err := Get(&result, url, map[string]string{"sdk": sdk}, + map[string]string{"Authorization": "Bearer " + token}) + return result, err +} + +func PostUnitComplete(url, sdk, unitId, token string) error { + var result interface{} + err := Do(&result, http.MethodPost, url, map[string]string{"sdk": sdk, "id": unitId}, + map[string]string{"Authorization": "Bearer " + token}, nil) + return err +} + +func PostUserCode(url, sdk, unitId, token 
string, body UserCodeRequest) (ErrorResponse, error) { + raw, err := json.Marshal(body) + if err != nil { + return ErrorResponse{}, err + } + + var result ErrorResponse + err = Do(&result, http.MethodPost, url, map[string]string{"sdk": sdk, "id": unitId}, + map[string]string{"Authorization": "Bearer " + token}, bytes.NewReader(raw)) + return result, err +} + +func Get(dst interface{}, url string, queryParams, headers map[string]string) error { + return Do(dst, http.MethodGet, url, queryParams, headers, nil) +} + // Generic HTTP call wrapper // params: // * dst: response struct pointer // * url: request url // * query_params: url query params, as a map (we don't use multiple-valued params) -func Get(dst interface{}, url string, queryParams map[string]string) error { - req, err := http.NewRequest(http.MethodGet, url, nil) +func Do(dst interface{}, method, url string, queryParams, headers map[string]string, body io.Reader) error { + req, err := http.NewRequest(method, url, body) if err != nil { return err } req.Header.Add("Content-Type", "application/json") + for k, v := range headers { + req.Header.Add(k, v) + } + if len(queryParams) > 0 { q := req.URL.Query() for k, v := range queryParams { @@ -85,5 +120,6 @@ func Get(dst interface{}, url string, queryParams map[string]string) error { } tee := io.TeeReader(resp.Body, os.Stdout) + defer os.Stdout.WriteString("\n") return json.NewDecoder(tee).Decode(dst) } diff --git a/learning/tour-of-beam/backend/internal/storage/image/Dockerfile b/learning/tour-of-beam/backend/integration_tests/emulators/datastore/Dockerfile similarity index 100% rename from learning/tour-of-beam/backend/internal/storage/image/Dockerfile rename to learning/tour-of-beam/backend/integration_tests/emulators/datastore/Dockerfile diff --git a/learning/tour-of-beam/backend/internal/storage/image/start-datastore.sh b/learning/tour-of-beam/backend/integration_tests/emulators/datastore/start-datastore.sh similarity index 100% rename from 
learning/tour-of-beam/backend/internal/storage/image/start-datastore.sh rename to learning/tour-of-beam/backend/integration_tests/emulators/datastore/start-datastore.sh diff --git a/learning/tour-of-beam/backend/integration_tests/emulators/firebase/Dockerfile b/learning/tour-of-beam/backend/integration_tests/emulators/firebase/Dockerfile new file mode 100644 index 000000000000..7fb924ba6a82 --- /dev/null +++ b/learning/tour-of-beam/backend/integration_tests/emulators/firebase/Dockerfile @@ -0,0 +1,21 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +FROM alpine:3.16 + +RUN apk add openjdk11 npm bash + +RUN npm install -g firebase-tools + +COPY firebase.json / + +CMD firebase emulators:start --only auth --project $PROJECT_ID \ No newline at end of file diff --git a/learning/tour-of-beam/backend/integration_tests/emulators/firebase/firebase.json b/learning/tour-of-beam/backend/integration_tests/emulators/firebase/firebase.json new file mode 100644 index 000000000000..e4fda75c2d88 --- /dev/null +++ b/learning/tour-of-beam/backend/integration_tests/emulators/firebase/firebase.json @@ -0,0 +1,12 @@ +{ + "emulators": { + "auth": { + "host": "0.0.0.0", + "port": 9099 + }, + "ui": { + "enabled": false + }, + "singleProjectMode": true + } +} diff --git a/learning/tour-of-beam/backend/integration_tests/function_test.go b/learning/tour-of-beam/backend/integration_tests/function_test.go index 06ed66d2a7e3..fcda014ec33e 100644 --- a/learning/tour-of-beam/backend/integration_tests/function_test.go +++ b/learning/tour-of-beam/backend/integration_tests/function_test.go @@ -25,9 +25,12 @@ import ( ) const ( - PORT_SDK_LIST = "PORT_SDK_LIST" - PORT_GET_CONTENT_TREE = "PORT_GET_CONTENT_TREE" - PORT_GET_UNIT_CONTENT = "PORT_GET_UNIT_CONTENT" + PORT_SDK_LIST = "PORT_SDK_LIST" + PORT_GET_CONTENT_TREE = "PORT_GET_CONTENT_TREE" + PORT_GET_UNIT_CONTENT = "PORT_GET_UNIT_CONTENT" + PORT_GET_USER_PROGRESS = "PORT_GET_USER_PROGRESS" + PORT_POST_UNIT_COMPLETE = "PORT_POST_UNIT_COMPLETE" + PORT_POST_USER_CODE = "PORT_POST_USER_CODE" ) // scenarios: @@ -36,12 +39,9 @@ const ( // + Get content tree for non-existing SDK: 404 Not Found // + Get unit content for existing SDK, existing unitId // + Get unit content for non-existing SDK/unitId: 404 Not Found -// TODO: -// - Get content tree for a registered user -// - Get unit content for a registered user -// - Save user code/progress for a registered user -// - (negative) Save user code/progress w/o user token/bad token -// - (negative) Save user code/progress for non-existing SDK/unitId: 404 
Not Found +// + Save user code/progress for a registered user +// + (negative) Save user code/progress w/o user token/bad token +// + (negative) Save user code/progress for non-existing SDK/unitId: 404 Not Found func loadJson(path string, dst interface{}) error { fh, err := os.Open(path) @@ -115,24 +115,36 @@ func TestNegative(t *testing.T) { for i, params := range []struct { portEnvName string queryParams map[string]string + headers map[string]string expected ErrorResponse }{ - {PORT_GET_CONTENT_TREE, nil, + {PORT_GET_CONTENT_TREE, nil, nil, ErrorResponse{ Code: "BAD_FORMAT", Message: "unknown sdk", }, }, - {PORT_GET_CONTENT_TREE, map[string]string{"sdk": "scio"}, + {PORT_GET_CONTENT_TREE, map[string]string{"sdk": "scio"}, nil, // TODO: actually here should be a NOT_FOUND error ErrorResponse{Code: "INTERNAL_ERROR", Message: "storage error"}, }, - {PORT_GET_UNIT_CONTENT, map[string]string{"sdk": "python", "unitId": "unknown_unitId"}, + {PORT_GET_UNIT_CONTENT, map[string]string{"sdk": "python", "id": "unknown_unitId"}, + nil, ErrorResponse{ Code: "NOT_FOUND", Message: "unit not found", }, }, + // bad authorization header we can test w/o Firebase auth emulator + // for functional tests see auth_test.go + {PORT_GET_USER_PROGRESS, + map[string]string{"sdk": "python"}, + map[string]string{"authorization": "bad_header"}, + ErrorResponse{ + Code: "UNAUTHORIZED", + Message: "bad auth header", + }, + }, } { t.Log("Scenario", i) port := os.Getenv(params.portEnvName) @@ -142,7 +154,7 @@ func TestNegative(t *testing.T) { url := "http://localhost:" + port var resp ErrorResponse - err := Get(&resp, url, params.queryParams) + err := Get(&resp, url, params.queryParams, params.headers) if err != nil { t.Fatal(err) } diff --git a/learning/tour-of-beam/backend/integration_tests/local.sh b/learning/tour-of-beam/backend/integration_tests/local.sh index 6ebebd20f3e3..a28032ac0cbc 100644 --- a/learning/tour-of-beam/backend/integration_tests/local.sh +++ 
b/learning/tour-of-beam/backend/integration_tests/local.sh @@ -12,14 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -export DATASTORE_PROJECT_ID=test-proj +# demo- prefix makes firebase emulator thinking we're in a local-only environment +export GOOGLE_CLOUD_PROJECT=demo-test-proj +export FIREBASE_AUTH_EMULATOR_HOST=localhost:9099 + +# Enable TOB_MOCK to mock out datastore +#export TOB_MOCK=1 +export DATASTORE_PROJECT_ID=$GOOGLE_CLOUD_PROJECT export DATASTORE_EMULATOR_HOST=localhost:8081 export DATASTORE_EMULATOR_DATADIR=./datadir-$(date '+%H-%M-%S') +export PLAYGROUND_ROUTER_HOST=localhost:8000 + export TOB_LEARNING_ROOT=./samples/learning-content export PORT_SDK_LIST=8801 export PORT_GET_CONTENT_TREE=8802 export PORT_GET_UNIT_CONTENT=8803 +export PORT_GET_USER_PROGRESS=8804 +export PORT_POST_UNIT_COMPLETE=8805 +export PORT_POST_USER_CODE=8806 mkdir "$DATASTORE_EMULATOR_DATADIR" @@ -30,14 +41,17 @@ go build -o tob_function cmd/main.go PORT=$PORT_SDK_LIST FUNCTION_TARGET=getSdkList ./tob_function & PORT=$PORT_GET_CONTENT_TREE FUNCTION_TARGET=getContentTree ./tob_function & PORT=$PORT_GET_UNIT_CONTENT FUNCTION_TARGET=getUnitContent ./tob_function & +PORT=$PORT_GET_USER_PROGRESS FUNCTION_TARGET=getUserProgress ./tob_function & +PORT=$PORT_POST_UNIT_COMPLETE FUNCTION_TARGET=postUnitComplete ./tob_function & +PORT=$PORT_POST_USER_CODE FUNCTION_TARGET=postUserCode ./tob_function & sleep 5 go run cmd/ci_cd/ci_cd.go - -go test -v --tags integration ./integration_tests/... +# -count=1 is an idiomatic way to disable test caching +go test -v -count=1 --tags integration ./integration_tests/... 
pkill -P $$ diff --git a/learning/tour-of-beam/backend/internal/entity.go b/learning/tour-of-beam/backend/internal/entity.go index 45f5c15bdbb7..62c33722f930 100644 --- a/learning/tour-of-beam/backend/internal/entity.go +++ b/learning/tour-of-beam/backend/internal/entity.go @@ -15,6 +15,14 @@ package internal +import "errors" + +var ( + ErrNoUnit = errors.New("unit not found") + ErrNoUser = errors.New("user not found") + ErrPlayground = errors.New("playground error") +) + type SdkItem struct { Id string `json:"id"` Title string `json:"title"` @@ -75,3 +83,22 @@ type CodeMessage struct { Code string `json:"code"` Message string `json:"message,omitempty"` } + +type UnitProgress struct { + Id string `json:"id"` + IsCompleted bool `json:"isCompleted"` + UserSnippetId string `json:"userSnippetId,omitempty"` +} +type SdkProgress struct { + Units []UnitProgress `json:"units"` +} + +type UserCodeFile struct { + Name string `json:"name"` + Content string `json:"content"` + IsMain bool `json:"isMain,omitempty"` +} +type UserCodeRequest struct { + Files []UserCodeFile `json:"files"` + PipelineOptions string `json:"pipelineOptions"` +} diff --git a/learning/tour-of-beam/backend/internal/service/content.go b/learning/tour-of-beam/backend/internal/service/content.go index 2edf2e87048b..675addbf17fe 100644 --- a/learning/tour-of-beam/backend/internal/service/content.go +++ b/learning/tour-of-beam/backend/internal/service/content.go @@ -18,35 +18,67 @@ package service import ( "context" "errors" + "fmt" tob "beam.apache.org/learning/tour-of-beam/backend/internal" "beam.apache.org/learning/tour-of-beam/backend/internal/storage" + pb "beam.apache.org/learning/tour-of-beam/backend/playground_api" ) -var ErrNoUnit = errors.New("unit not found") - type IContent interface { - GetContentTree(ctx context.Context, sdk tob.Sdk, userId *string) (tob.ContentTree, error) - GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId string, userId *string) (tob.Unit, error) + GetContentTree(ctx 
context.Context, sdk tob.Sdk) (tob.ContentTree, error) + GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId string) (tob.Unit, error) + GetUserProgress(ctx context.Context, sdk tob.Sdk, userId string) (tob.SdkProgress, error) + SetUnitComplete(ctx context.Context, sdk tob.Sdk, unitId, uid string) error + SaveUserCode(ctx context.Context, sdk tob.Sdk, unitId, uid string, userRequest tob.UserCodeRequest) error } type Svc struct { - Repo storage.Iface + Repo storage.Iface + PgClient pb.PlaygroundServiceClient } -func (s *Svc) GetContentTree(ctx context.Context, sdk tob.Sdk, userId *string) (ct tob.ContentTree, err error) { - // TODO enrich tree with user-specific state (isCompleted) +func (s *Svc) GetContentTree(ctx context.Context, sdk tob.Sdk) (ct tob.ContentTree, err error) { return s.Repo.GetContentTree(ctx, sdk) } -func (s *Svc) GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId string, userId *string) (tob.Unit, error) { - // TODO enrich unit with user-specific state: isCompleted, userSnippetId +func (s *Svc) GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId string) (tob.Unit, error) { unit, err := s.Repo.GetUnitContent(ctx, sdk, unitId) if err != nil { return tob.Unit{}, err } if unit == nil { - return tob.Unit{}, ErrNoUnit + return tob.Unit{}, tob.ErrNoUnit } return *unit, nil } + +func (s *Svc) GetUserProgress(ctx context.Context, sdk tob.Sdk, userId string) (tob.SdkProgress, error) { + progress, err := s.Repo.GetUserProgress(ctx, sdk, userId) + if errors.Is(err, tob.ErrNoUser) { + // make an empty list a default response + return tob.SdkProgress{Units: make([]tob.UnitProgress, 0)}, nil + } + if err != nil { + return tob.SdkProgress{}, err + } + if progress == nil { + panic("progress is nil, no err") + } + + return *progress, nil +} + +func (s *Svc) SetUnitComplete(ctx context.Context, sdk tob.Sdk, unitId, uid string) error { + return s.Repo.SetUnitComplete(ctx, sdk, unitId, uid) +} + +func (s *Svc) SaveUserCode(ctx context.Context, sdk 
tob.Sdk, unitId, uid string, userRequest tob.UserCodeRequest) error { + req := MakePgSaveRequest(userRequest, sdk) + resp, err := s.PgClient.SaveSnippet(ctx, &req) + if err != nil { + return err + } + fmt.Println("SaveSnippet response:", resp) + return s.Repo.SaveUserSnippetId(ctx, sdk, unitId, uid, resp.GetId()) +} diff --git a/learning/tour-of-beam/backend/internal/service/pg_adapter.go b/learning/tour-of-beam/backend/internal/service/pg_adapter.go new file mode 100644 index 000000000000..ca28b260ebd9 --- /dev/null +++ b/learning/tour-of-beam/backend/internal/service/pg_adapter.go @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package service + +import ( + "log" + + tob "beam.apache.org/learning/tour-of-beam/backend/internal" + pb "beam.apache.org/learning/tour-of-beam/backend/playground_api" +) + +func MakePgSaveRequest(userRequest tob.UserCodeRequest, sdk tob.Sdk) pb.SaveSnippetRequest { + filesProto := make([]*pb.SnippetFile, 0) + for _, file := range userRequest.Files { + filesProto = append(filesProto, + &pb.SnippetFile{ + Name: file.Name, + Content: file.Content, + IsMain: file.IsMain, + }) + } + sdkIdx, ok := pb.Sdk_value[sdk.StorageID()] + if !ok { + log.Panicf("Playground SDK undefined for: %v", sdk) + } + return pb.SaveSnippetRequest{ + Sdk: pb.Sdk(sdkIdx), + Files: filesProto, + PipelineOptions: userRequest.PipelineOptions, + } +} diff --git a/learning/tour-of-beam/backend/internal/storage/adapter.go b/learning/tour-of-beam/backend/internal/storage/adapter.go index 28240dcc09a7..51c1be10f539 100644 --- a/learning/tour-of-beam/backend/internal/storage/adapter.go +++ b/learning/tour-of-beam/backend/internal/storage/adapter.go @@ -35,6 +35,10 @@ func datastoreKey(kind string, sdk tob.Sdk, id string, parent *datastore.Key) *d return pgNameKey(kind, name, parent) } +func rootSdkKey(sdk tob.Sdk) *datastore.Key { + return pgNameKey(PgSdksKind, sdk.StorageID(), nil) +} + func MakeUnitNode(unit *tob.Unit, order, level int) *TbLearningNode { if unit == nil { return nil @@ -129,3 +133,11 @@ func MakeDatastoreModule(mod *tob.Module, order int) *TbLearningModule { Order: order, } } + +func FromDatastoreUserProgress(tbUP TbUnitProgress) tob.UnitProgress { + return tob.UnitProgress{ + Id: tbUP.UnitID, + IsCompleted: tbUP.IsCompleted, + UserSnippetId: tbUP.SnippetId, + } +} diff --git a/learning/tour-of-beam/backend/internal/storage/datastore.go b/learning/tour-of-beam/backend/internal/storage/datastore.go index 840983a9dc91..477dc368012e 100644 --- a/learning/tour-of-beam/backend/internal/storage/datastore.go +++ b/learning/tour-of-beam/backend/internal/storage/datastore.go @@ -17,8 
+17,10 @@ package storage import ( "context" + "errors" "fmt" "log" + "time" tob "beam.apache.org/learning/tour-of-beam/backend/internal" "cloud.google.com/go/datastore" @@ -256,5 +258,82 @@ func (d *DatastoreDb) GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId st return node.Unit, nil } +func (d *DatastoreDb) SaveUser(ctx context.Context, uid string) error { + userKey := pgNameKey(TbUserKind, uid, nil) + + _, err := d.Client.Put(ctx, userKey, &TbUser{UID: uid, LastVisitAt: time.Now()}) + if err != nil { + return fmt.Errorf("failed to create tb_user: %w", err) + } + + return nil +} + +func (d *DatastoreDb) GetUserProgress(ctx context.Context, sdk tob.Sdk, uid string) (*tob.SdkProgress, error) { + userKey := pgNameKey(TbUserKind, uid, nil) + err := d.Client.Get(ctx, userKey, &TbUser{}) + if errors.Is(err, datastore.ErrNoSuchEntity) { + return nil, tob.ErrNoUser + } + if err != nil { + return nil, fmt.Errorf("failed to get user: %w", err) + } + + var tbUnits []TbUnitProgress + query := datastore.NewQuery(TbUserProgressKind). + Namespace(PgNamespace). + Ancestor(userKey). 
+ FilterField("sdk", "=", rootSdkKey(sdk)) + + _, err = d.Client.GetAll(ctx, query, &tbUnits) + if err != nil { + return nil, fmt.Errorf("query progress failed: %w", err) + } + + sdkProgress := &tob.SdkProgress{Units: make([]tob.UnitProgress, 0)} + for _, up := range tbUnits { + sdkProgress.Units = append(sdkProgress.Units, FromDatastoreUserProgress(up)) + } + + return sdkProgress, nil +} + +func (d *DatastoreDb) upsertUnitProgress(ctx context.Context, sdk tob.Sdk, unitId, uid string, applyChanges func(*TbUnitProgress)) error { + userKey := pgNameKey(TbUserKind, uid, nil) + progressKey := datastoreKey(TbUserProgressKind, sdk, unitId, userKey) + + _, err := d.Client.RunInTransaction(ctx, func(tx *datastore.Transaction) error { + // default entity values + progress := TbUnitProgress{ + Sdk: rootSdkKey(sdk), + UnitID: unitId, + } + if err := tx.Get(progressKey, &progress); err != nil && err != datastore.ErrNoSuchEntity { + return err + } + applyChanges(&progress) + if _, err := tx.Put(progressKey, &progress); err != nil { + return err + } + return nil + }) + if err != nil { + return fmt.Errorf("failed to upsert tb_user_progress: %w", err) + } + return nil +} + +func (d *DatastoreDb) SetUnitComplete(ctx context.Context, sdk tob.Sdk, unitId, uid string) error { + return d.upsertUnitProgress(ctx, sdk, unitId, uid, func(p *TbUnitProgress) { + p.IsCompleted = true + }) +} + +func (d *DatastoreDb) SaveUserSnippetId(ctx context.Context, sdk tob.Sdk, unitId, uid, snippetId string) error { + return d.upsertUnitProgress(ctx, sdk, unitId, uid, func(p *TbUnitProgress) { + p.SnippetId = snippetId + }) +} + // check if the interface is implemented. 
var _ Iface = &DatastoreDb{} diff --git a/learning/tour-of-beam/backend/internal/storage/iface.go b/learning/tour-of-beam/backend/internal/storage/iface.go index f81a28e4ce52..c18e093ca7aa 100644 --- a/learning/tour-of-beam/backend/internal/storage/iface.go +++ b/learning/tour-of-beam/backend/internal/storage/iface.go @@ -26,4 +26,9 @@ type Iface interface { SaveContentTrees(ctx context.Context, trees []tob.ContentTree) error GetUnitContent(ctx context.Context, sdk tob.Sdk, unitId string) (*tob.Unit, error) + + SaveUser(ctx context.Context, uid string) error + GetUserProgress(ctx context.Context, sdk tob.Sdk, uid string) (*tob.SdkProgress, error) + SetUnitComplete(ctx context.Context, sdk tob.Sdk, unitId, uid string) error + SaveUserSnippetId(ctx context.Context, sdk tob.Sdk, unitId, uid, snippetId string) error } diff --git a/learning/tour-of-beam/backend/internal/service/mock.go b/learning/tour-of-beam/backend/internal/storage/mock.go similarity index 58% rename from learning/tour-of-beam/backend/internal/service/mock.go rename to learning/tour-of-beam/backend/internal/storage/mock.go index dd9fac6cc958..1a8d4193bd11 100644 --- a/learning/tour-of-beam/backend/internal/service/mock.go +++ b/learning/tour-of-beam/backend/internal/storage/mock.go @@ -13,14 +13,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -package service +package storage import ( "context" "encoding/json" + "errors" "io/ioutil" "path" "runtime" + "strings" tob "beam.apache.org/learning/tour-of-beam/backend/internal" ) @@ -33,16 +35,45 @@ func getSamplesPath() string { type Mock struct{} // check if the interface is implemented. 
-var _ IContent = &Mock{} +var _ Iface = &Mock{} -func (d *Mock) GetContentTree(_ context.Context, sdk tob.Sdk, userId *string) (ct tob.ContentTree, err error) { +func (d *Mock) GetContentTree(_ context.Context, sdk tob.Sdk) (ct tob.ContentTree, err error) { + // this sdk is special: we use it as an empty learning path + if sdk == tob.SDK_SCIO { + return ct, errors.New("empty sdk tree") + } content, _ := ioutil.ReadFile(path.Join(getSamplesPath(), "get_content_tree.json")) _ = json.Unmarshal(content, &ct) return ct, nil } -func (d *Mock) GetUnitContent(_ context.Context, sdk tob.Sdk, unitId string, userId *string) (u tob.Unit, err error) { +func (d *Mock) SaveContentTrees(_ context.Context, _ []tob.ContentTree) error { + return nil +} + +func (d *Mock) GetUnitContent(_ context.Context, sdk tob.Sdk, unitId string) (u *tob.Unit, err error) { + if strings.HasPrefix(unitId, "unknown_") { + return u, tob.ErrNoUnit + } content, _ := ioutil.ReadFile(path.Join(getSamplesPath(), "get_unit_content.json")) err = json.Unmarshal(content, &u) return u, err } + +func (d *Mock) SaveUser(ctx context.Context, uid string) error { + return nil +} + +func (d *Mock) GetUserProgress(_ context.Context, sdk tob.Sdk, userId string) (sp *tob.SdkProgress, err error) { + content, _ := ioutil.ReadFile(path.Join(getSamplesPath(), "get_user_progress.json")) + _ = json.Unmarshal(content, &sp) + return sp, nil +} + +func (d *Mock) SetUnitComplete(ctx context.Context, sdk tob.Sdk, unitId, uid string) error { + return nil +} + +func (d *Mock) SaveUserSnippetId(ctx context.Context, sdk tob.Sdk, unitId, uid, snippetId string) error { + return nil +} diff --git a/learning/tour-of-beam/backend/internal/storage/schema.go b/learning/tour-of-beam/backend/internal/storage/schema.go index 5e36c86f4907..e0d05f0ab4f9 100644 --- a/learning/tour-of-beam/backend/internal/storage/schema.go +++ b/learning/tour-of-beam/backend/internal/storage/schema.go @@ -16,6 +16,8 @@ package storage import ( + "time" + tob 
"beam.apache.org/learning/tour-of-beam/backend/internal" "cloud.google.com/go/datastore" ) @@ -33,6 +35,8 @@ const ( TbLearningPathKind = "tb_learning_path" TbLearningModuleKind = "tb_learning_module" TbLearningNodeKind = "tb_learning_node" + TbUserKind = "tb_user" + TbUserProgressKind = "tb_user_progress" PgSnippetsKind = "pg_snippets" PgSdksKind = "pg_sdks" @@ -95,6 +99,21 @@ type TbLearningNode struct { Level int `datastore:"level"` } +type TbUser struct { + Key *datastore.Key `datastore:"__key__"` + UID string `datastore:"uid"` + LastVisitAt time.Time `datastore:"lastVisitAt"` +} + +type TbUnitProgress struct { + Key *datastore.Key `datastore:"__key__"` + Sdk *datastore.Key `datastore:"sdk"` + + UnitID string `datastore:"unitId"` + IsCompleted bool `datastore:"isCompleted"` + SnippetId string `datastore:"snippetId"` +} + type PgSnippets struct { Key *datastore.Key `datastore:"__key__"` Origin string `datastore:"origin"` diff --git a/learning/tour-of-beam/backend/middleware.go b/learning/tour-of-beam/backend/middleware.go index 87c98bd6e14d..b328fbe89ea8 100644 --- a/learning/tour-of-beam/backend/middleware.go +++ b/learning/tour-of-beam/backend/middleware.go @@ -24,6 +24,13 @@ import ( tob "beam.apache.org/learning/tour-of-beam/backend/internal" ) +const ( + BAD_FORMAT = "BAD_FORMAT" + INTERNAL_ERROR = "INTERNAL_ERROR" + NOT_FOUND = "NOT_FOUND" + UNAUTHORIZED = "UNAUTHORIZED" +) + // Middleware-maker for setting a header // We also make this less generic: it works with HandlerFunc's // so that to be convertible to func(w http ResponseWriter, r *http.Request) @@ -51,12 +58,14 @@ func EnsureMethod(method string) func(http.HandlerFunc) http.HandlerFunc { } // Helper common AIO middleware -func Common(next http.HandlerFunc) http.HandlerFunc { - addContentType := AddHeader("Content-Type", "application/json") - addCORS := AddHeader("Access-Control-Allow-Origin", "*") - ensureGet := EnsureMethod(http.MethodGet) +func Common(method string) func(http.HandlerFunc) 
http.HandlerFunc { + return func(next http.HandlerFunc) http.HandlerFunc { + addContentType := AddHeader("Content-Type", "application/json") + addCORS := AddHeader("Access-Control-Allow-Origin", "*") + ensureGet := EnsureMethod(method) - return ensureGet(addCORS(addContentType(next))) + return ensureGet(addCORS(addContentType(next))) + } } // HandleFunc enriched with sdk. diff --git a/learning/tour-of-beam/backend/playground_api/api.pb.go b/learning/tour-of-beam/backend/playground_api/api.pb.go new file mode 100644 index 000000000000..f9f402704b5b --- /dev/null +++ b/learning/tour-of-beam/backend/playground_api/api.pb.go @@ -0,0 +1,3507 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.28.1 +// protoc v3.12.4 +// source: api.proto + +package playground + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. 
+ _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type Sdk int32 + +const ( + Sdk_SDK_UNSPECIFIED Sdk = 0 + Sdk_SDK_JAVA Sdk = 1 + Sdk_SDK_GO Sdk = 2 + Sdk_SDK_PYTHON Sdk = 3 + Sdk_SDK_SCIO Sdk = 4 +) + +// Enum value maps for Sdk. +var ( + Sdk_name = map[int32]string{ + 0: "SDK_UNSPECIFIED", + 1: "SDK_JAVA", + 2: "SDK_GO", + 3: "SDK_PYTHON", + 4: "SDK_SCIO", + } + Sdk_value = map[string]int32{ + "SDK_UNSPECIFIED": 0, + "SDK_JAVA": 1, + "SDK_GO": 2, + "SDK_PYTHON": 3, + "SDK_SCIO": 4, + } +) + +func (x Sdk) Enum() *Sdk { + p := new(Sdk) + *p = x + return p +} + +func (x Sdk) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (Sdk) Descriptor() protoreflect.EnumDescriptor { + return file_api_proto_enumTypes[0].Descriptor() +} + +func (Sdk) Type() protoreflect.EnumType { + return &file_api_proto_enumTypes[0] +} + +func (x Sdk) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use Sdk.Descriptor instead. +func (Sdk) EnumDescriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{0} +} + +type Status int32 + +const ( + Status_STATUS_UNSPECIFIED Status = 0 + Status_STATUS_VALIDATING Status = 1 + Status_STATUS_VALIDATION_ERROR Status = 2 + Status_STATUS_PREPARING Status = 3 + Status_STATUS_PREPARATION_ERROR Status = 4 + Status_STATUS_COMPILING Status = 5 + Status_STATUS_COMPILE_ERROR Status = 6 + Status_STATUS_EXECUTING Status = 7 + Status_STATUS_FINISHED Status = 8 + Status_STATUS_RUN_ERROR Status = 9 + Status_STATUS_ERROR Status = 10 + Status_STATUS_RUN_TIMEOUT Status = 11 + Status_STATUS_CANCELED Status = 12 +) + +// Enum value maps for Status. 
+var ( + Status_name = map[int32]string{ + 0: "STATUS_UNSPECIFIED", + 1: "STATUS_VALIDATING", + 2: "STATUS_VALIDATION_ERROR", + 3: "STATUS_PREPARING", + 4: "STATUS_PREPARATION_ERROR", + 5: "STATUS_COMPILING", + 6: "STATUS_COMPILE_ERROR", + 7: "STATUS_EXECUTING", + 8: "STATUS_FINISHED", + 9: "STATUS_RUN_ERROR", + 10: "STATUS_ERROR", + 11: "STATUS_RUN_TIMEOUT", + 12: "STATUS_CANCELED", + } + Status_value = map[string]int32{ + "STATUS_UNSPECIFIED": 0, + "STATUS_VALIDATING": 1, + "STATUS_VALIDATION_ERROR": 2, + "STATUS_PREPARING": 3, + "STATUS_PREPARATION_ERROR": 4, + "STATUS_COMPILING": 5, + "STATUS_COMPILE_ERROR": 6, + "STATUS_EXECUTING": 7, + "STATUS_FINISHED": 8, + "STATUS_RUN_ERROR": 9, + "STATUS_ERROR": 10, + "STATUS_RUN_TIMEOUT": 11, + "STATUS_CANCELED": 12, + } +) + +func (x Status) Enum() *Status { + p := new(Status) + *p = x + return p +} + +func (x Status) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (Status) Descriptor() protoreflect.EnumDescriptor { + return file_api_proto_enumTypes[1].Descriptor() +} + +func (Status) Type() protoreflect.EnumType { + return &file_api_proto_enumTypes[1] +} + +func (x Status) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use Status.Descriptor instead. +func (Status) EnumDescriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{1} +} + +type PrecompiledObjectType int32 + +const ( + PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_UNSPECIFIED PrecompiledObjectType = 0 + PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_EXAMPLE PrecompiledObjectType = 1 + PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_KATA PrecompiledObjectType = 2 + PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_UNIT_TEST PrecompiledObjectType = 3 +) + +// Enum value maps for PrecompiledObjectType. 
+var ( + PrecompiledObjectType_name = map[int32]string{ + 0: "PRECOMPILED_OBJECT_TYPE_UNSPECIFIED", + 1: "PRECOMPILED_OBJECT_TYPE_EXAMPLE", + 2: "PRECOMPILED_OBJECT_TYPE_KATA", + 3: "PRECOMPILED_OBJECT_TYPE_UNIT_TEST", + } + PrecompiledObjectType_value = map[string]int32{ + "PRECOMPILED_OBJECT_TYPE_UNSPECIFIED": 0, + "PRECOMPILED_OBJECT_TYPE_EXAMPLE": 1, + "PRECOMPILED_OBJECT_TYPE_KATA": 2, + "PRECOMPILED_OBJECT_TYPE_UNIT_TEST": 3, + } +) + +func (x PrecompiledObjectType) Enum() *PrecompiledObjectType { + p := new(PrecompiledObjectType) + *p = x + return p +} + +func (x PrecompiledObjectType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (PrecompiledObjectType) Descriptor() protoreflect.EnumDescriptor { + return file_api_proto_enumTypes[2].Descriptor() +} + +func (PrecompiledObjectType) Type() protoreflect.EnumType { + return &file_api_proto_enumTypes[2] +} + +func (x PrecompiledObjectType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use PrecompiledObjectType.Descriptor instead. +func (PrecompiledObjectType) EnumDescriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{2} +} + +type Complexity int32 + +const ( + Complexity_COMPLEXITY_UNSPECIFIED Complexity = 0 + Complexity_COMPLEXITY_BASIC Complexity = 1 + Complexity_COMPLEXITY_MEDIUM Complexity = 2 + Complexity_COMPLEXITY_ADVANCED Complexity = 3 +) + +// Enum value maps for Complexity. 
+var ( + Complexity_name = map[int32]string{ + 0: "COMPLEXITY_UNSPECIFIED", + 1: "COMPLEXITY_BASIC", + 2: "COMPLEXITY_MEDIUM", + 3: "COMPLEXITY_ADVANCED", + } + Complexity_value = map[string]int32{ + "COMPLEXITY_UNSPECIFIED": 0, + "COMPLEXITY_BASIC": 1, + "COMPLEXITY_MEDIUM": 2, + "COMPLEXITY_ADVANCED": 3, + } +) + +func (x Complexity) Enum() *Complexity { + p := new(Complexity) + *p = x + return p +} + +func (x Complexity) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (Complexity) Descriptor() protoreflect.EnumDescriptor { + return file_api_proto_enumTypes[3].Descriptor() +} + +func (Complexity) Type() protoreflect.EnumType { + return &file_api_proto_enumTypes[3] +} + +func (x Complexity) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use Complexity.Descriptor instead. +func (Complexity) EnumDescriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{3} +} + +// RunCodeRequest represents a code text and options of SDK which executes the code. +type RunCodeRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Code string `protobuf:"bytes,1,opt,name=code,proto3" json:"code,omitempty"` + Sdk Sdk `protobuf:"varint,2,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + // The pipeline options as they would be passed to the program (e.g. 
"--option1 value1 --option2 value2") + PipelineOptions string `protobuf:"bytes,3,opt,name=pipeline_options,json=pipelineOptions,proto3" json:"pipeline_options,omitempty"` +} + +func (x *RunCodeRequest) Reset() { + *x = RunCodeRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *RunCodeRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RunCodeRequest) ProtoMessage() {} + +func (x *RunCodeRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RunCodeRequest.ProtoReflect.Descriptor instead. +func (*RunCodeRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{0} +} + +func (x *RunCodeRequest) GetCode() string { + if x != nil { + return x.Code + } + return "" +} + +func (x *RunCodeRequest) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +func (x *RunCodeRequest) GetPipelineOptions() string { + if x != nil { + return x.PipelineOptions + } + return "" +} + +// RunCodeResponse contains information of the pipeline uuid. 
+type RunCodeResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *RunCodeResponse) Reset() { + *x = RunCodeResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *RunCodeResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*RunCodeResponse) ProtoMessage() {} + +func (x *RunCodeResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use RunCodeResponse.ProtoReflect.Descriptor instead. +func (*RunCodeResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{1} +} + +func (x *RunCodeResponse) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// CheckStatusRequest contains information of the pipeline uuid. 
+type CheckStatusRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *CheckStatusRequest) Reset() { + *x = CheckStatusRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CheckStatusRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CheckStatusRequest) ProtoMessage() {} + +func (x *CheckStatusRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CheckStatusRequest.ProtoReflect.Descriptor instead. +func (*CheckStatusRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{2} +} + +func (x *CheckStatusRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// StatusInfo contains information about the status of the code execution. 
+type CheckStatusResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Status Status `protobuf:"varint,1,opt,name=status,proto3,enum=api.v1.Status" json:"status,omitempty"` +} + +func (x *CheckStatusResponse) Reset() { + *x = CheckStatusResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CheckStatusResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CheckStatusResponse) ProtoMessage() {} + +func (x *CheckStatusResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CheckStatusResponse.ProtoReflect.Descriptor instead. +func (*CheckStatusResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{3} +} + +func (x *CheckStatusResponse) GetStatus() Status { + if x != nil { + return x.Status + } + return Status_STATUS_UNSPECIFIED +} + +// GetValidationOutputRequest contains information of the pipeline uuid. 
+type GetValidationOutputRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetValidationOutputRequest) Reset() { + *x = GetValidationOutputRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetValidationOutputRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetValidationOutputRequest) ProtoMessage() {} + +func (x *GetValidationOutputRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetValidationOutputRequest.ProtoReflect.Descriptor instead. +func (*GetValidationOutputRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{4} +} + +func (x *GetValidationOutputRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetValidationOutputResponse represents the result of the code validation. 
+type GetValidationOutputResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetValidationOutputResponse) Reset() { + *x = GetValidationOutputResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetValidationOutputResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetValidationOutputResponse) ProtoMessage() {} + +func (x *GetValidationOutputResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetValidationOutputResponse.ProtoReflect.Descriptor instead. +func (*GetValidationOutputResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{5} +} + +func (x *GetValidationOutputResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetPreparationOutputRequest contains information of the pipeline uuid. 
+type GetPreparationOutputRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetPreparationOutputRequest) Reset() { + *x = GetPreparationOutputRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPreparationOutputRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPreparationOutputRequest) ProtoMessage() {} + +func (x *GetPreparationOutputRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPreparationOutputRequest.ProtoReflect.Descriptor instead. +func (*GetPreparationOutputRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{6} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetPreparationOutputRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetPreparationOutputResponse represents the result of the code preparation. 
+type GetPreparationOutputResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetPreparationOutputResponse) Reset() { + *x = GetPreparationOutputResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPreparationOutputResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPreparationOutputResponse) ProtoMessage() {} + +func (x *GetPreparationOutputResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[7] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPreparationOutputResponse.ProtoReflect.Descriptor instead. +func (*GetPreparationOutputResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{7} +} + +// GetOutput returns the Output field, or the empty string when x is nil. +func (x *GetPreparationOutputResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetCompileOutputRequest contains information of the pipeline uuid. 
+type GetCompileOutputRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetCompileOutputRequest) Reset() { + *x = GetCompileOutputRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetCompileOutputRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetCompileOutputRequest) ProtoMessage() {} + +func (x *GetCompileOutputRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[8] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetCompileOutputRequest.ProtoReflect.Descriptor instead. +func (*GetCompileOutputRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{8} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetCompileOutputRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetCompileOutputResponse represents the result of the compiled code. 
+type GetCompileOutputResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetCompileOutputResponse) Reset() { + *x = GetCompileOutputResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[9] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetCompileOutputResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetCompileOutputResponse) ProtoMessage() {} + +func (x *GetCompileOutputResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[9] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetCompileOutputResponse.ProtoReflect.Descriptor instead. +func (*GetCompileOutputResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{9} +} + +// GetOutput returns the Output field, or the empty string when x is nil. +func (x *GetCompileOutputResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetRunOutputRequest contains information of the pipeline uuid. 
+type GetRunOutputRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetRunOutputRequest) Reset() { + *x = GetRunOutputRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[10] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetRunOutputRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRunOutputRequest) ProtoMessage() {} + +func (x *GetRunOutputRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[10] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRunOutputRequest.ProtoReflect.Descriptor instead. +func (*GetRunOutputRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{10} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetRunOutputRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetRunOutputResponse represents the result of the executed code. 
+type GetRunOutputResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetRunOutputResponse) Reset() { + *x = GetRunOutputResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetRunOutputResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRunOutputResponse) ProtoMessage() {} + +func (x *GetRunOutputResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[11] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRunOutputResponse.ProtoReflect.Descriptor instead. +func (*GetRunOutputResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{11} +} + +// GetOutput returns the Output field, or the empty string when x is nil. +func (x *GetRunOutputResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetRunErrorRequest contains information of the pipeline uuid. 
+type GetRunErrorRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetRunErrorRequest) Reset() { + *x = GetRunErrorRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[12] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetRunErrorRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRunErrorRequest) ProtoMessage() {} + +func (x *GetRunErrorRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[12] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRunErrorRequest.ProtoReflect.Descriptor instead. +func (*GetRunErrorRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{12} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetRunErrorRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetRunErrorResponse represents the error of the executed code. 
+type GetRunErrorResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetRunErrorResponse) Reset() { + *x = GetRunErrorResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[13] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetRunErrorResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetRunErrorResponse) ProtoMessage() {} + +func (x *GetRunErrorResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[13] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetRunErrorResponse.ProtoReflect.Descriptor instead. +func (*GetRunErrorResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{13} +} + +// GetOutput returns the Output field, or the empty string when x is nil. +func (x *GetRunErrorResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetLogsRequest contains information of the pipeline uuid. 
+type GetLogsRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetLogsRequest) Reset() { + *x = GetLogsRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetLogsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetLogsRequest) ProtoMessage() {} + +func (x *GetLogsRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[14] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetLogsRequest.ProtoReflect.Descriptor instead. +func (*GetLogsRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{14} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetLogsRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetLogsResponse represents the logs of the executed code. 
+type GetLogsResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetLogsResponse) Reset() { + *x = GetLogsResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[15] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetLogsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetLogsResponse) ProtoMessage() {} + +func (x *GetLogsResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[15] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetLogsResponse.ProtoReflect.Descriptor instead. +func (*GetLogsResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{15} +} + +// GetOutput returns the Output field, or the empty string when x is nil. +func (x *GetLogsResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetGraphRequest contains information of the pipeline uuid. 
+type GetGraphRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *GetGraphRequest) Reset() { + *x = GetGraphRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[16] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetGraphRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetGraphRequest) ProtoMessage() {} + +func (x *GetGraphRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[16] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetGraphRequest.ProtoReflect.Descriptor instead. +func (*GetGraphRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{16} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *GetGraphRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// GetGraphResponse represents the string representation of pipeline execution graph in DOT format. 
+type GetGraphResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Graph string `protobuf:"bytes,1,opt,name=graph,proto3" json:"graph,omitempty"` +} + +func (x *GetGraphResponse) Reset() { + *x = GetGraphResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[17] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetGraphResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetGraphResponse) ProtoMessage() {} + +func (x *GetGraphResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[17] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetGraphResponse.ProtoReflect.Descriptor instead. +func (*GetGraphResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{17} +} + +// GetGraph returns the Graph field, or the empty string when x is nil. +func (x *GetGraphResponse) GetGraph() string { + if x != nil { + return x.Graph + } + return "" +} + +// CancelRequest is a request to cancel code processing +type CancelRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PipelineUuid string `protobuf:"bytes,1,opt,name=pipeline_uuid,json=pipelineUuid,proto3" json:"pipeline_uuid,omitempty"` +} + +func (x *CancelRequest) Reset() { + *x = CancelRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[18] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CancelRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CancelRequest) ProtoMessage() {} + +func (x *CancelRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[18] + if protoimpl.UnsafeEnabled && x != nil { + ms := 
protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CancelRequest.ProtoReflect.Descriptor instead. +func (*CancelRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{18} +} + +// GetPipelineUuid returns the PipelineUuid field, or the empty string when x is nil. +func (x *CancelRequest) GetPipelineUuid() string { + if x != nil { + return x.PipelineUuid + } + return "" +} + +// CancelResponse is the response to a cancel request +type CancelResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields +} + +func (x *CancelResponse) Reset() { + *x = CancelResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[19] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CancelResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CancelResponse) ProtoMessage() {} + +func (x *CancelResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[19] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CancelResponse.ProtoReflect.Descriptor instead. 
+func (*CancelResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{19} +} + +// PrecompiledObject represents one PrecompiledObject with its information +type PrecompiledObject struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` + Name string `protobuf:"bytes,2,opt,name=name,proto3" json:"name,omitempty"` + Description string `protobuf:"bytes,3,opt,name=description,proto3" json:"description,omitempty"` + Type PrecompiledObjectType `protobuf:"varint,4,opt,name=type,proto3,enum=api.v1.PrecompiledObjectType" json:"type,omitempty"` + PipelineOptions string `protobuf:"bytes,5,opt,name=pipeline_options,json=pipelineOptions,proto3" json:"pipeline_options,omitempty"` + // Link to the example in the Beam repository + Link string `protobuf:"bytes,6,opt,name=link,proto3" json:"link,omitempty"` + Multifile bool `protobuf:"varint,7,opt,name=multifile,proto3" json:"multifile,omitempty"` + ContextLine int32 `protobuf:"varint,8,opt,name=context_line,json=contextLine,proto3" json:"context_line,omitempty"` + DefaultExample bool `protobuf:"varint,9,opt,name=default_example,json=defaultExample,proto3" json:"default_example,omitempty"` + Sdk Sdk `protobuf:"varint,10,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + Complexity Complexity `protobuf:"varint,11,opt,name=complexity,proto3,enum=api.v1.Complexity" json:"complexity,omitempty"` + Tags []string `protobuf:"bytes,12,rep,name=tags,proto3" json:"tags,omitempty"` +} + +func (x *PrecompiledObject) Reset() { + *x = PrecompiledObject{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[20] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *PrecompiledObject) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PrecompiledObject) ProtoMessage() 
{} + +func (x *PrecompiledObject) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[20] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PrecompiledObject.ProtoReflect.Descriptor instead. +func (*PrecompiledObject) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{20} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *PrecompiledObject) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetName returns the Name field, or the empty string when x is nil. +func (x *PrecompiledObject) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +// GetDescription returns the Description field, or the empty string when x is nil. +func (x *PrecompiledObject) GetDescription() string { + if x != nil { + return x.Description + } + return "" +} + +// GetType returns the Type field, or PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_UNSPECIFIED when x is nil. +func (x *PrecompiledObject) GetType() PrecompiledObjectType { + if x != nil { + return x.Type + } + return PrecompiledObjectType_PRECOMPILED_OBJECT_TYPE_UNSPECIFIED +} + +// GetPipelineOptions returns the PipelineOptions field, or the empty string when x is nil. +func (x *PrecompiledObject) GetPipelineOptions() string { + if x != nil { + return x.PipelineOptions + } + return "" +} + +// GetLink returns the Link field, or the empty string when x is nil. +func (x *PrecompiledObject) GetLink() string { + if x != nil { + return x.Link + } + return "" +} + +// GetMultifile returns the Multifile field, or false when x is nil. +func (x *PrecompiledObject) GetMultifile() bool { + if x != nil { + return x.Multifile + } + return false +} + +// GetContextLine returns the ContextLine field, or 0 when x is nil. +func (x *PrecompiledObject) GetContextLine() int32 { + if x != nil { + return x.ContextLine + } + return 0 +} + +// GetDefaultExample returns the DefaultExample field, or false when x is nil. +func (x *PrecompiledObject) GetDefaultExample() bool { + if x != nil { + return x.DefaultExample + } + return false +} + +// GetSdk returns the Sdk field, or Sdk_SDK_UNSPECIFIED when x is nil. +func (x *PrecompiledObject) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +// GetComplexity returns the Complexity field, or Complexity_COMPLEXITY_UNSPECIFIED when x is nil. +func (x *PrecompiledObject) GetComplexity() Complexity { + if x != nil { + return x.Complexity + } + return Complexity_COMPLEXITY_UNSPECIFIED +} + +// GetTags returns the Tags field, or nil when x is nil. +func (x *PrecompiledObject) GetTags() []string { + if x != nil { + return x.Tags + } + return nil +} + +// Categories represents the array of messages with sdk and 
categories at this sdk +type Categories struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Sdk Sdk `protobuf:"varint,1,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + Categories []*Categories_Category `protobuf:"bytes,2,rep,name=categories,proto3" json:"categories,omitempty"` +} + +func (x *Categories) Reset() { + *x = Categories{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[21] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Categories) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Categories) ProtoMessage() {} + +func (x *Categories) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[21] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Categories.ProtoReflect.Descriptor instead. +func (*Categories) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{21} +} + +// GetSdk returns the Sdk field, or Sdk_SDK_UNSPECIFIED when x is nil. +func (x *Categories) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +// GetCategories returns the Categories field, or nil when x is nil. +func (x *Categories) GetCategories() []*Categories_Category { + if x != nil { + return x.Categories + } + return nil +} + +// GetPrecompiledObjectsRequest contains information of the needed PrecompiledObjects sdk and categories. 
+type GetPrecompiledObjectsRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Sdk Sdk `protobuf:"varint,1,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + Category string `protobuf:"bytes,2,opt,name=category,proto3" json:"category,omitempty"` +} + +func (x *GetPrecompiledObjectsRequest) Reset() { + *x = GetPrecompiledObjectsRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[22] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectsRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectsRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[22] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectsRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectsRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{22} +} + +// GetSdk returns the Sdk field, or Sdk_SDK_UNSPECIFIED when x is nil. +func (x *GetPrecompiledObjectsRequest) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +// GetCategory returns the Category field, or the empty string when x is nil. +func (x *GetPrecompiledObjectsRequest) GetCategory() string { + if x != nil { + return x.Category + } + return "" +} + +// GetPrecompiledObjectRequest contains information of the needed PrecompiledObject. 
+type GetPrecompiledObjectRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` +} + +func (x *GetPrecompiledObjectRequest) Reset() { + *x = GetPrecompiledObjectRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[23] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[23] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{23} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *GetPrecompiledObjectRequest) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetPrecompiledObjectCodeRequest contains the cloud path of the PrecompiledObject. 
+type GetPrecompiledObjectCodeRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` +} + +func (x *GetPrecompiledObjectCodeRequest) Reset() { + *x = GetPrecompiledObjectCodeRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[24] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectCodeRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectCodeRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectCodeRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[24] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectCodeRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectCodeRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{24} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *GetPrecompiledObjectCodeRequest) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetPrecompiledObjectOutputRequest contains the cloud path of the PrecompiledObject. 
+type GetPrecompiledObjectOutputRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` +} + +func (x *GetPrecompiledObjectOutputRequest) Reset() { + *x = GetPrecompiledObjectOutputRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[25] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectOutputRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectOutputRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectOutputRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[25] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectOutputRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectOutputRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{25} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *GetPrecompiledObjectOutputRequest) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetPrecompiledObjectLogsRequest contains the cloud path of the PrecompiledObject. 
+type GetPrecompiledObjectLogsRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` +} + +func (x *GetPrecompiledObjectLogsRequest) Reset() { + *x = GetPrecompiledObjectLogsRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[26] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectLogsRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectLogsRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectLogsRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[26] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectLogsRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectLogsRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{26} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *GetPrecompiledObjectLogsRequest) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetPrecompiledObjectGraphRequest contains information of the PrecompiledObject cloud path. 
+type GetPrecompiledObjectGraphRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CloudPath string `protobuf:"bytes,1,opt,name=cloud_path,json=cloudPath,proto3" json:"cloud_path,omitempty"` +} + +func (x *GetPrecompiledObjectGraphRequest) Reset() { + *x = GetPrecompiledObjectGraphRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[27] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectGraphRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectGraphRequest) ProtoMessage() {} + +func (x *GetPrecompiledObjectGraphRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[27] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectGraphRequest.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectGraphRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{27} +} + +// GetCloudPath returns the CloudPath field, or the empty string when x is nil. +func (x *GetPrecompiledObjectGraphRequest) GetCloudPath() string { + if x != nil { + return x.CloudPath + } + return "" +} + +// GetDefaultPrecompiledObjectRequest contains information of the needed PrecompiledObject sdk. 
+type GetDefaultPrecompiledObjectRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Sdk Sdk `protobuf:"varint,1,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` +} + +func (x *GetDefaultPrecompiledObjectRequest) Reset() { + *x = GetDefaultPrecompiledObjectRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[28] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetDefaultPrecompiledObjectRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetDefaultPrecompiledObjectRequest) ProtoMessage() {} + +func (x *GetDefaultPrecompiledObjectRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[28] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetDefaultPrecompiledObjectRequest.ProtoReflect.Descriptor instead. +func (*GetDefaultPrecompiledObjectRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{28} +} + +func (x *GetDefaultPrecompiledObjectRequest) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +// GetPrecompiledObjectsResponse represent the map between sdk and categories for the sdk. 
+type GetPrecompiledObjectsResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + SdkCategories []*Categories `protobuf:"bytes,1,rep,name=sdk_categories,json=sdkCategories,proto3" json:"sdk_categories,omitempty"` +} + +func (x *GetPrecompiledObjectsResponse) Reset() { + *x = GetPrecompiledObjectsResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[29] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectsResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectsResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[29] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectsResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectsResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{29} +} + +func (x *GetPrecompiledObjectsResponse) GetSdkCategories() []*Categories { + if x != nil { + return x.SdkCategories + } + return nil +} + +// GetPrecompiledObjectResponse represent the PrecompiledObject. 
+type GetPrecompiledObjectResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PrecompiledObject *PrecompiledObject `protobuf:"bytes,1,opt,name=precompiled_object,json=precompiledObject,proto3" json:"precompiled_object,omitempty"` +} + +func (x *GetPrecompiledObjectResponse) Reset() { + *x = GetPrecompiledObjectResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[30] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[30] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{30} +} + +func (x *GetPrecompiledObjectResponse) GetPrecompiledObject() *PrecompiledObject { + if x != nil { + return x.PrecompiledObject + } + return nil +} + +// GetPrecompiledObjectResponse represents the source code of the PrecompiledObject. 
+type GetPrecompiledObjectCodeResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Code string `protobuf:"bytes,1,opt,name=code,proto3" json:"code,omitempty"` +} + +func (x *GetPrecompiledObjectCodeResponse) Reset() { + *x = GetPrecompiledObjectCodeResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[31] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectCodeResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectCodeResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectCodeResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[31] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectCodeResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectCodeResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{31} +} + +func (x *GetPrecompiledObjectCodeResponse) GetCode() string { + if x != nil { + return x.Code + } + return "" +} + +// GetPrecompiledObjectOutputResponse represents the result of the executed code. 
+type GetPrecompiledObjectOutputResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetPrecompiledObjectOutputResponse) Reset() { + *x = GetPrecompiledObjectOutputResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[32] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectOutputResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectOutputResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectOutputResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[32] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectOutputResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectOutputResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{32} +} + +func (x *GetPrecompiledObjectOutputResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetPrecompiledObjectLogsResponse represents the result of the executed code. 
+type GetPrecompiledObjectLogsResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Output string `protobuf:"bytes,1,opt,name=output,proto3" json:"output,omitempty"` +} + +func (x *GetPrecompiledObjectLogsResponse) Reset() { + *x = GetPrecompiledObjectLogsResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[33] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectLogsResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectLogsResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectLogsResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[33] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectLogsResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectLogsResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{33} +} + +func (x *GetPrecompiledObjectLogsResponse) GetOutput() string { + if x != nil { + return x.Output + } + return "" +} + +// GetPrecompiledObjectGraphResponse represents the string representation of the executed code graph in DOT format. 
+type GetPrecompiledObjectGraphResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Graph string `protobuf:"bytes,1,opt,name=graph,proto3" json:"graph,omitempty"` +} + +func (x *GetPrecompiledObjectGraphResponse) Reset() { + *x = GetPrecompiledObjectGraphResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[34] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetPrecompiledObjectGraphResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetPrecompiledObjectGraphResponse) ProtoMessage() {} + +func (x *GetPrecompiledObjectGraphResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[34] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetPrecompiledObjectGraphResponse.ProtoReflect.Descriptor instead. +func (*GetPrecompiledObjectGraphResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{34} +} + +func (x *GetPrecompiledObjectGraphResponse) GetGraph() string { + if x != nil { + return x.Graph + } + return "" +} + +// GetDefaultPrecompiledObjectResponse represents the default PrecompiledObject and his category for the sdk. 
+type GetDefaultPrecompiledObjectResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + PrecompiledObject *PrecompiledObject `protobuf:"bytes,1,opt,name=precompiled_object,json=precompiledObject,proto3" json:"precompiled_object,omitempty"` +} + +func (x *GetDefaultPrecompiledObjectResponse) Reset() { + *x = GetDefaultPrecompiledObjectResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[35] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetDefaultPrecompiledObjectResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetDefaultPrecompiledObjectResponse) ProtoMessage() {} + +func (x *GetDefaultPrecompiledObjectResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[35] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetDefaultPrecompiledObjectResponse.ProtoReflect.Descriptor instead. +func (*GetDefaultPrecompiledObjectResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{35} +} + +func (x *GetDefaultPrecompiledObjectResponse) GetPrecompiledObject() *PrecompiledObject { + if x != nil { + return x.PrecompiledObject + } + return nil +} + +// SnippetFile represents the snippet file content and its name to save. 
+type SnippetFile struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Content string `protobuf:"bytes,2,opt,name=content,proto3" json:"content,omitempty"` + IsMain bool `protobuf:"varint,3,opt,name=is_main,json=isMain,proto3" json:"is_main,omitempty"` +} + +func (x *SnippetFile) Reset() { + *x = SnippetFile{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[36] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SnippetFile) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SnippetFile) ProtoMessage() {} + +func (x *SnippetFile) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[36] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SnippetFile.ProtoReflect.Descriptor instead. +func (*SnippetFile) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{36} +} + +func (x *SnippetFile) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *SnippetFile) GetContent() string { + if x != nil { + return x.Content + } + return "" +} + +func (x *SnippetFile) GetIsMain() bool { + if x != nil { + return x.IsMain + } + return false +} + +// SaveSnippetRequest represents a snippet content and options of SDK which executes the snippet. 
+type SaveSnippetRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Files []*SnippetFile `protobuf:"bytes,1,rep,name=files,proto3" json:"files,omitempty"` + Sdk Sdk `protobuf:"varint,2,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + // The pipeline options as they would be passed to the program (e.g. "--option1 value1 --option2 value2") + PipelineOptions string `protobuf:"bytes,3,opt,name=pipeline_options,json=pipelineOptions,proto3" json:"pipeline_options,omitempty"` + Complexity Complexity `protobuf:"varint,4,opt,name=complexity,proto3,enum=api.v1.Complexity" json:"complexity,omitempty"` +} + +func (x *SaveSnippetRequest) Reset() { + *x = SaveSnippetRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[37] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SaveSnippetRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SaveSnippetRequest) ProtoMessage() {} + +func (x *SaveSnippetRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[37] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SaveSnippetRequest.ProtoReflect.Descriptor instead. 
+func (*SaveSnippetRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{37} +} + +func (x *SaveSnippetRequest) GetFiles() []*SnippetFile { + if x != nil { + return x.Files + } + return nil +} + +func (x *SaveSnippetRequest) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +func (x *SaveSnippetRequest) GetPipelineOptions() string { + if x != nil { + return x.PipelineOptions + } + return "" +} + +func (x *SaveSnippetRequest) GetComplexity() Complexity { + if x != nil { + return x.Complexity + } + return Complexity_COMPLEXITY_UNSPECIFIED +} + +// SaveSnippetResponse contains information of the generated ID. +type SaveSnippetResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` +} + +func (x *SaveSnippetResponse) Reset() { + *x = SaveSnippetResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[38] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SaveSnippetResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SaveSnippetResponse) ProtoMessage() {} + +func (x *SaveSnippetResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[38] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SaveSnippetResponse.ProtoReflect.Descriptor instead. +func (*SaveSnippetResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{38} +} + +func (x *SaveSnippetResponse) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +// GetSnippetRequest represents the generated ID. 
+type GetSnippetRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Id string `protobuf:"bytes,1,opt,name=id,proto3" json:"id,omitempty"` +} + +func (x *GetSnippetRequest) Reset() { + *x = GetSnippetRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[39] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetSnippetRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetSnippetRequest) ProtoMessage() {} + +func (x *GetSnippetRequest) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[39] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetSnippetRequest.ProtoReflect.Descriptor instead. +func (*GetSnippetRequest) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{39} +} + +func (x *GetSnippetRequest) GetId() string { + if x != nil { + return x.Id + } + return "" +} + +// GetSnippetResponse contains information of a snippet content and options of SDK which executes the snippet. +type GetSnippetResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Files []*SnippetFile `protobuf:"bytes,1,rep,name=files,proto3" json:"files,omitempty"` + Sdk Sdk `protobuf:"varint,2,opt,name=sdk,proto3,enum=api.v1.Sdk" json:"sdk,omitempty"` + // The pipeline options as they would be passed to the program (e.g. 
"--option1 value1 --option2 value2") + PipelineOptions string `protobuf:"bytes,3,opt,name=pipeline_options,json=pipelineOptions,proto3" json:"pipeline_options,omitempty"` + Complexity Complexity `protobuf:"varint,4,opt,name=complexity,proto3,enum=api.v1.Complexity" json:"complexity,omitempty"` +} + +func (x *GetSnippetResponse) Reset() { + *x = GetSnippetResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[40] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *GetSnippetResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*GetSnippetResponse) ProtoMessage() {} + +func (x *GetSnippetResponse) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[40] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use GetSnippetResponse.ProtoReflect.Descriptor instead. 
+func (*GetSnippetResponse) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{40} +} + +func (x *GetSnippetResponse) GetFiles() []*SnippetFile { + if x != nil { + return x.Files + } + return nil +} + +func (x *GetSnippetResponse) GetSdk() Sdk { + if x != nil { + return x.Sdk + } + return Sdk_SDK_UNSPECIFIED +} + +func (x *GetSnippetResponse) GetPipelineOptions() string { + if x != nil { + return x.PipelineOptions + } + return "" +} + +func (x *GetSnippetResponse) GetComplexity() Complexity { + if x != nil { + return x.Complexity + } + return Complexity_COMPLEXITY_UNSPECIFIED +} + +type Categories_Category struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + CategoryName string `protobuf:"bytes,1,opt,name=category_name,json=categoryName,proto3" json:"category_name,omitempty"` + PrecompiledObjects []*PrecompiledObject `protobuf:"bytes,2,rep,name=precompiled_objects,json=precompiledObjects,proto3" json:"precompiled_objects,omitempty"` +} + +func (x *Categories_Category) Reset() { + *x = Categories_Category{} + if protoimpl.UnsafeEnabled { + mi := &file_api_proto_msgTypes[41] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Categories_Category) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Categories_Category) ProtoMessage() {} + +func (x *Categories_Category) ProtoReflect() protoreflect.Message { + mi := &file_api_proto_msgTypes[41] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Categories_Category.ProtoReflect.Descriptor instead. 
+func (*Categories_Category) Descriptor() ([]byte, []int) { + return file_api_proto_rawDescGZIP(), []int{21, 0} +} + +func (x *Categories_Category) GetCategoryName() string { + if x != nil { + return x.CategoryName + } + return "" +} + +func (x *Categories_Category) GetPrecompiledObjects() []*PrecompiledObject { + if x != nil { + return x.PrecompiledObjects + } + return nil +} + +var File_api_proto protoreflect.FileDescriptor + +var file_api_proto_rawDesc = []byte{ + 0x0a, 0x09, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x06, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x22, 0x6e, 0x0a, 0x0e, 0x52, 0x75, 0x6e, 0x43, 0x6f, 0x64, 0x65, 0x52, 0x65, + 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x63, 0x6f, 0x64, 0x65, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x04, 0x63, 0x6f, 0x64, 0x65, 0x12, 0x1d, 0x0a, 0x03, 0x73, 0x64, 0x6b, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x53, 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 0x29, 0x0a, 0x10, 0x70, 0x69, 0x70, 0x65, + 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x03, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x0f, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x4f, 0x70, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x22, 0x36, 0x0a, 0x0f, 0x52, 0x75, 0x6e, 0x43, 0x6f, 0x64, 0x65, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x39, 0x0a, 0x12, 0x43, + 0x68, 0x65, 0x63, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, + 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x3d, 0x0a, 
0x13, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x53, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x26, 0x0a, + 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0e, 0x2e, + 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x06, 0x73, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0x41, 0x0a, 0x1a, 0x47, 0x65, 0x74, 0x56, 0x61, 0x6c, 0x69, + 0x64, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, + 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, + 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x35, 0x0a, 0x1b, 0x47, 0x65, 0x74, 0x56, + 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, + 0x42, 0x0a, 0x1b, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x23, + 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, + 0x75, 0x69, 0x64, 0x22, 0x36, 0x0a, 0x1c, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x70, 0x61, 0x72, + 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x3e, 0x0a, 0x17, 0x47, + 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 
0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x32, 0x0a, 0x18, 0x47, + 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, + 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, + 0x3a, 0x0a, 0x13, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x2e, 0x0a, 0x14, 0x47, + 0x65, 0x74, 0x52, 0x75, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x39, 0x0a, 0x12, 0x47, + 0x65, 0x74, 0x52, 0x75, 0x6e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, + 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, + 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x2d, 0x0a, 0x13, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, + 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, + 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x35, 0x0a, 0x0e, 0x47, 0x65, 0x74, 0x4c, 0x6f, 0x67, 0x73, + 0x52, 0x65, 0x71, 0x75, 0x65, 
0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, 0x70, 0x65, 0x6c, + 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, + 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, 0x29, 0x0a, 0x0f, + 0x47, 0x65, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, + 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x36, 0x0a, 0x0f, 0x47, 0x65, 0x74, 0x47, 0x72, + 0x61, 0x70, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, + 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, + 0x28, 0x0a, 0x10, 0x47, 0x65, 0x74, 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, + 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x22, 0x34, 0x0a, 0x0d, 0x43, 0x61, 0x6e, + 0x63, 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x23, 0x0a, 0x0d, 0x70, 0x69, + 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x75, 0x75, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x0c, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x55, 0x75, 0x69, 0x64, 0x22, + 0x10, 0x0a, 0x0e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x22, 0xab, 0x03, 0x0a, 0x11, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, + 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x6f, 0x75, 0x64, + 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x6f, + 0x75, 0x64, 0x50, 0x61, 0x74, 0x68, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 
0x20, 0x0a, 0x0b, 0x64, 0x65, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x0b, 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x12, 0x31, 0x0a, 0x04, + 0x74, 0x79, 0x70, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x1d, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x54, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, + 0x29, 0x0a, 0x10, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x6f, 0x70, 0x74, 0x69, + 0x6f, 0x6e, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x70, 0x69, 0x70, 0x65, 0x6c, + 0x69, 0x6e, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x6c, 0x69, + 0x6e, 0x6b, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6c, 0x69, 0x6e, 0x6b, 0x12, 0x1c, + 0x0a, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x07, 0x20, 0x01, 0x28, + 0x08, 0x52, 0x09, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x21, 0x0a, 0x0c, + 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x5f, 0x6c, 0x69, 0x6e, 0x65, 0x18, 0x08, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x0b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x12, + 0x27, 0x0a, 0x0f, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x65, 0x78, 0x61, 0x6d, 0x70, + 0x6c, 0x65, 0x18, 0x09, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x1d, 0x0a, 0x03, 0x73, 0x64, 0x6b, 0x18, + 0x0a, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, + 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 0x32, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x70, 0x6c, + 0x65, 0x78, 0x69, 0x74, 0x79, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x12, 0x2e, 0x61, 0x70, + 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x52, + 0x0a, 
0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x12, 0x12, 0x0a, 0x04, 0x74, + 0x61, 0x67, 0x73, 0x18, 0x0c, 0x20, 0x03, 0x28, 0x09, 0x52, 0x04, 0x74, 0x61, 0x67, 0x73, 0x22, + 0xe5, 0x01, 0x0a, 0x0a, 0x43, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x69, 0x65, 0x73, 0x12, 0x1d, + 0x0a, 0x03, 0x73, 0x64, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, + 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 0x3b, 0x0a, + 0x0a, 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x69, 0x65, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x61, 0x74, 0x65, 0x67, + 0x6f, 0x72, 0x69, 0x65, 0x73, 0x2e, 0x43, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x79, 0x52, 0x0a, + 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x69, 0x65, 0x73, 0x1a, 0x7b, 0x0a, 0x08, 0x43, 0x61, + 0x74, 0x65, 0x67, 0x6f, 0x72, 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, + 0x72, 0x79, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x63, + 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x79, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x4a, 0x0a, 0x13, 0x70, + 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x6f, 0x62, 0x6a, 0x65, 0x63, + 0x74, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, + 0x31, 0x2e, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, + 0x65, 0x63, 0x74, 0x52, 0x12, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x22, 0x59, 0x0a, 0x1c, 0x47, 0x65, 0x74, 0x50, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1d, 0x0a, 0x03, 0x73, 0x64, 0x6b, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x64, + 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 
0x1a, 0x0a, 0x08, 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, + 0x72, 0x79, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, + 0x72, 0x79, 0x22, 0x3c, 0x0a, 0x1b, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, + 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x50, 0x61, 0x74, 0x68, + 0x22, 0x40, 0x0a, 0x1f, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, + 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x5f, 0x70, 0x61, 0x74, + 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x50, 0x61, + 0x74, 0x68, 0x22, 0x42, 0x0a, 0x21, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, + 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x6f, 0x75, 0x64, + 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, 0x6c, 0x6f, + 0x75, 0x64, 0x50, 0x61, 0x74, 0x68, 0x22, 0x40, 0x0a, 0x1f, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, + 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4c, 0x6f, + 0x67, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1d, 0x0a, 0x0a, 0x63, 0x6c, 0x6f, + 0x75, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x63, + 0x6c, 0x6f, 0x75, 0x64, 0x50, 0x61, 0x74, 0x68, 0x22, 0x41, 0x0a, 0x20, 0x47, 0x65, 0x74, 0x50, + 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, + 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 
0x1d, 0x0a, 0x0a, + 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x09, 0x63, 0x6c, 0x6f, 0x75, 0x64, 0x50, 0x61, 0x74, 0x68, 0x22, 0x43, 0x0a, 0x22, 0x47, + 0x65, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, + 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x12, 0x1d, 0x0a, 0x03, 0x73, 0x64, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, + 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, + 0x22, 0x5a, 0x0a, 0x1d, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, + 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x12, 0x39, 0x0a, 0x0e, 0x73, 0x64, 0x6b, 0x5f, 0x63, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, + 0x69, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x61, 0x70, 0x69, 0x2e, + 0x76, 0x31, 0x2e, 0x43, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x69, 0x65, 0x73, 0x52, 0x0d, 0x73, + 0x64, 0x6b, 0x43, 0x61, 0x74, 0x65, 0x67, 0x6f, 0x72, 0x69, 0x65, 0x73, 0x22, 0x68, 0x0a, 0x1c, + 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x48, 0x0a, 0x12, + 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x6f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, + 0x31, 0x2e, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, + 0x65, 0x63, 0x74, 0x52, 0x11, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x22, 0x36, 0x0a, 0x20, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, + 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x43, 0x6f, + 0x64, 0x65, 0x52, 
0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x63, 0x6f, + 0x64, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x63, 0x6f, 0x64, 0x65, 0x22, 0x3c, + 0x0a, 0x22, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x3a, 0x0a, 0x20, + 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, + 0x12, 0x16, 0x0a, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x06, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x22, 0x39, 0x0a, 0x21, 0x47, 0x65, 0x74, 0x50, + 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, + 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x14, 0x0a, + 0x05, 0x67, 0x72, 0x61, 0x70, 0x68, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x67, 0x72, + 0x61, 0x70, 0x68, 0x22, 0x6f, 0x0a, 0x23, 0x47, 0x65, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, + 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x48, 0x0a, 0x12, 0x70, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, + 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x19, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, + 0x74, 0x52, 0x11, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x22, 0x54, 0x0a, 0x0b, 0x53, 
0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x46, + 0x69, 0x6c, 0x65, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x63, 0x6f, 0x6e, 0x74, 0x65, + 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x12, 0x17, 0x0a, 0x07, 0x69, 0x73, 0x5f, 0x6d, 0x61, 0x69, 0x6e, 0x18, 0x03, 0x20, 0x01, + 0x28, 0x08, 0x52, 0x06, 0x69, 0x73, 0x4d, 0x61, 0x69, 0x6e, 0x22, 0xbd, 0x01, 0x0a, 0x12, 0x53, + 0x61, 0x76, 0x65, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x12, 0x29, 0x0a, 0x05, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x13, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, + 0x74, 0x46, 0x69, 0x6c, 0x65, 0x52, 0x05, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x12, 0x1d, 0x0a, 0x03, + 0x73, 0x64, 0x6b, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, 0x69, 0x2e, + 0x76, 0x31, 0x2e, 0x53, 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 0x29, 0x0a, 0x10, 0x70, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x5f, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, + 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x4f, + 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x32, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x78, 0x69, 0x74, 0x79, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x12, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x52, 0x0a, + 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x22, 0x25, 0x0a, 0x13, 0x53, 0x61, + 0x76, 0x65, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x69, + 0x64, 0x22, 0x23, 0x0a, 0x11, 0x47, 0x65, 0x74, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 
0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x0e, 0x0a, 0x02, 0x69, 0x64, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x02, 0x69, 0x64, 0x22, 0xbd, 0x01, 0x0a, 0x12, 0x47, 0x65, 0x74, 0x53, 0x6e, + 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x29, 0x0a, + 0x05, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x13, 0x2e, 0x61, + 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x46, 0x69, 0x6c, + 0x65, 0x52, 0x05, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x12, 0x1d, 0x0a, 0x03, 0x73, 0x64, 0x6b, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, + 0x64, 0x6b, 0x52, 0x03, 0x73, 0x64, 0x6b, 0x12, 0x29, 0x0a, 0x10, 0x70, 0x69, 0x70, 0x65, 0x6c, + 0x69, 0x6e, 0x65, 0x5f, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x0f, 0x70, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x4f, 0x70, 0x74, 0x69, 0x6f, + 0x6e, 0x73, 0x12, 0x32, 0x0a, 0x0a, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, + 0x18, 0x04, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x12, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x52, 0x0a, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x78, 0x69, 0x74, 0x79, 0x2a, 0x52, 0x0a, 0x03, 0x53, 0x64, 0x6b, 0x12, 0x13, 0x0a, + 0x0f, 0x53, 0x44, 0x4b, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, + 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, 0x53, 0x44, 0x4b, 0x5f, 0x4a, 0x41, 0x56, 0x41, 0x10, 0x01, + 0x12, 0x0a, 0x0a, 0x06, 0x53, 0x44, 0x4b, 0x5f, 0x47, 0x4f, 0x10, 0x02, 0x12, 0x0e, 0x0a, 0x0a, + 0x53, 0x44, 0x4b, 0x5f, 0x50, 0x59, 0x54, 0x48, 0x4f, 0x4e, 0x10, 0x03, 0x12, 0x0c, 0x0a, 0x08, + 0x53, 0x44, 0x4b, 0x5f, 0x53, 0x43, 0x49, 0x4f, 0x10, 0x04, 0x2a, 0xb8, 0x02, 0x0a, 0x06, 0x53, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x16, 0x0a, 0x12, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, + 0x55, 0x4e, 0x53, 0x50, 0x45, 
0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x15, 0x0a, + 0x11, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x41, 0x54, 0x49, + 0x4e, 0x47, 0x10, 0x01, 0x12, 0x1b, 0x0a, 0x17, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x56, + 0x41, 0x4c, 0x49, 0x44, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, + 0x02, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x50, 0x52, 0x45, 0x50, + 0x41, 0x52, 0x49, 0x4e, 0x47, 0x10, 0x03, 0x12, 0x1c, 0x0a, 0x18, 0x53, 0x54, 0x41, 0x54, 0x55, + 0x53, 0x5f, 0x50, 0x52, 0x45, 0x50, 0x41, 0x52, 0x41, 0x54, 0x49, 0x4f, 0x4e, 0x5f, 0x45, 0x52, + 0x52, 0x4f, 0x52, 0x10, 0x04, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, + 0x43, 0x4f, 0x4d, 0x50, 0x49, 0x4c, 0x49, 0x4e, 0x47, 0x10, 0x05, 0x12, 0x18, 0x0a, 0x14, 0x53, + 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x43, 0x4f, 0x4d, 0x50, 0x49, 0x4c, 0x45, 0x5f, 0x45, 0x52, + 0x52, 0x4f, 0x52, 0x10, 0x06, 0x12, 0x14, 0x0a, 0x10, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, + 0x45, 0x58, 0x45, 0x43, 0x55, 0x54, 0x49, 0x4e, 0x47, 0x10, 0x07, 0x12, 0x13, 0x0a, 0x0f, 0x53, + 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x46, 0x49, 0x4e, 0x49, 0x53, 0x48, 0x45, 0x44, 0x10, 0x08, + 0x12, 0x14, 0x0a, 0x10, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x52, 0x55, 0x4e, 0x5f, 0x45, + 0x52, 0x52, 0x4f, 0x52, 0x10, 0x09, 0x12, 0x10, 0x0a, 0x0c, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, + 0x5f, 0x45, 0x52, 0x52, 0x4f, 0x52, 0x10, 0x0a, 0x12, 0x16, 0x0a, 0x12, 0x53, 0x54, 0x41, 0x54, + 0x55, 0x53, 0x5f, 0x52, 0x55, 0x4e, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x4f, 0x55, 0x54, 0x10, 0x0b, + 0x12, 0x13, 0x0a, 0x0f, 0x53, 0x54, 0x41, 0x54, 0x55, 0x53, 0x5f, 0x43, 0x41, 0x4e, 0x43, 0x45, + 0x4c, 0x45, 0x44, 0x10, 0x0c, 0x2a, 0xae, 0x01, 0x0a, 0x15, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, + 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x27, 0x0a, 0x23, 0x50, 0x52, 0x45, 0x43, 0x4f, 0x4d, 0x50, 0x49, 
0x4c, 0x45, 0x44, 0x5f, 0x4f, + 0x42, 0x4a, 0x45, 0x43, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, + 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, 0x12, 0x23, 0x0a, 0x1f, 0x50, 0x52, 0x45, 0x43, + 0x4f, 0x4d, 0x50, 0x49, 0x4c, 0x45, 0x44, 0x5f, 0x4f, 0x42, 0x4a, 0x45, 0x43, 0x54, 0x5f, 0x54, + 0x59, 0x50, 0x45, 0x5f, 0x45, 0x58, 0x41, 0x4d, 0x50, 0x4c, 0x45, 0x10, 0x01, 0x12, 0x20, 0x0a, + 0x1c, 0x50, 0x52, 0x45, 0x43, 0x4f, 0x4d, 0x50, 0x49, 0x4c, 0x45, 0x44, 0x5f, 0x4f, 0x42, 0x4a, + 0x45, 0x43, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x4b, 0x41, 0x54, 0x41, 0x10, 0x02, 0x12, + 0x25, 0x0a, 0x21, 0x50, 0x52, 0x45, 0x43, 0x4f, 0x4d, 0x50, 0x49, 0x4c, 0x45, 0x44, 0x5f, 0x4f, + 0x42, 0x4a, 0x45, 0x43, 0x54, 0x5f, 0x54, 0x59, 0x50, 0x45, 0x5f, 0x55, 0x4e, 0x49, 0x54, 0x5f, + 0x54, 0x45, 0x53, 0x54, 0x10, 0x03, 0x2a, 0x6e, 0x0a, 0x0a, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, + 0x78, 0x69, 0x74, 0x79, 0x12, 0x1a, 0x0a, 0x16, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, 0x58, 0x49, + 0x54, 0x59, 0x5f, 0x55, 0x4e, 0x53, 0x50, 0x45, 0x43, 0x49, 0x46, 0x49, 0x45, 0x44, 0x10, 0x00, + 0x12, 0x14, 0x0a, 0x10, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, 0x58, 0x49, 0x54, 0x59, 0x5f, 0x42, + 0x41, 0x53, 0x49, 0x43, 0x10, 0x01, 0x12, 0x15, 0x0a, 0x11, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, + 0x58, 0x49, 0x54, 0x59, 0x5f, 0x4d, 0x45, 0x44, 0x49, 0x55, 0x4d, 0x10, 0x02, 0x12, 0x17, 0x0a, + 0x13, 0x43, 0x4f, 0x4d, 0x50, 0x4c, 0x45, 0x58, 0x49, 0x54, 0x59, 0x5f, 0x41, 0x44, 0x56, 0x41, + 0x4e, 0x43, 0x45, 0x44, 0x10, 0x03, 0x32, 0x8b, 0x0d, 0x0a, 0x11, 0x50, 0x6c, 0x61, 0x79, 0x67, + 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x3a, 0x0a, 0x07, + 0x52, 0x75, 0x6e, 0x43, 0x6f, 0x64, 0x65, 0x12, 0x16, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, + 0x2e, 0x52, 0x75, 0x6e, 0x43, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x17, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x52, 0x75, 0x6e, 0x43, 0x6f, 0x64, 0x65, + 0x52, 
0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x46, 0x0a, 0x0b, 0x43, 0x68, 0x65, 0x63, + 0x6b, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x1a, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, + 0x2e, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x1a, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 0x68, 0x65, + 0x63, 0x6b, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, + 0x12, 0x49, 0x0a, 0x0c, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x12, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, + 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1c, 0x2e, + 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, 0x4f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3a, 0x0a, 0x07, 0x47, + 0x65, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x12, 0x16, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x47, 0x65, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x17, + 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x3d, 0x0a, 0x08, 0x47, 0x65, 0x74, 0x47, 0x72, + 0x61, 0x70, 0x68, 0x12, 0x17, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, + 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x18, 0x2e, 0x61, + 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x46, 0x0a, 0x0b, 0x47, 0x65, 0x74, 0x52, 0x75, 0x6e, + 0x45, 0x72, 0x72, 0x6f, 0x72, 0x12, 0x1a, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, + 0x65, 0x74, 0x52, 0x75, 0x6e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x1a, 0x1b, 0x2e, 0x61, 0x70, 0x69, 
0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x52, 0x75, + 0x6e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x5e, + 0x0a, 0x13, 0x47, 0x65, 0x74, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x12, 0x22, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, + 0x65, 0x74, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, 0x75, 0x74, 0x70, + 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x23, 0x2e, 0x61, 0x70, 0x69, 0x2e, + 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x61, + 0x0a, 0x14, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, + 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x12, 0x23, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x4f, 0x75, + 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x61, 0x70, + 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x70, 0x61, 0x72, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x12, 0x55, 0x0a, 0x10, 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x4f, + 0x75, 0x74, 0x70, 0x75, 0x74, 0x12, 0x1f, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, + 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, + 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x20, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x47, 0x65, 0x74, 0x43, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x37, 0x0a, 0x06, 0x43, 0x61, 0x6e, 0x63, + 0x65, 0x6c, 0x12, 0x15, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x43, 
0x61, 0x6e, 0x63, + 0x65, 0x6c, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x16, 0x2e, 0x61, 0x70, 0x69, 0x2e, + 0x76, 0x31, 0x2e, 0x43, 0x61, 0x6e, 0x63, 0x65, 0x6c, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, + 0x65, 0x12, 0x64, 0x0a, 0x15, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, + 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x12, 0x24, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, + 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, + 0x1a, 0x25, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, + 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x73, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x61, 0x0a, 0x14, 0x47, 0x65, 0x74, 0x50, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x12, + 0x23, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, + 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x1a, 0x24, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, + 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x6d, 0x0a, 0x18, 0x47, 0x65, + 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x43, 0x6f, 0x64, 0x65, 0x12, 0x27, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, + 0x6a, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, + 0x28, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, + 0x6f, 0x6d, 0x70, 
0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x43, 0x6f, 0x64, + 0x65, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x73, 0x0a, 0x1a, 0x47, 0x65, 0x74, + 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, + 0x74, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x12, 0x29, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, + 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x1a, 0x2a, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, + 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, + 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x6d, + 0x0a, 0x18, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, + 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x12, 0x27, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, + 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x1a, 0x28, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, + 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, + 0x74, 0x4c, 0x6f, 0x67, 0x73, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x70, 0x0a, + 0x19, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, + 0x62, 0x6a, 0x65, 0x63, 0x74, 0x47, 0x72, 0x61, 0x70, 0x68, 0x12, 0x28, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, + 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x71, + 0x75, 0x65, 0x73, 0x74, 0x1a, 0x29, 0x2e, 0x61, 0x70, 
0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, + 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, + 0x63, 0x74, 0x47, 0x72, 0x61, 0x70, 0x68, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, + 0x76, 0x0a, 0x1b, 0x47, 0x65, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, 0x65, + 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x12, 0x2a, + 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x50, 0x72, 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, + 0x65, 0x63, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x2b, 0x2e, 0x61, 0x70, 0x69, + 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x52, + 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x46, 0x0a, 0x0b, 0x53, 0x61, 0x76, 0x65, 0x53, + 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x12, 0x1a, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, + 0x53, 0x61, 0x76, 0x65, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, + 0x73, 0x74, 0x1a, 0x1b, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x61, 0x76, 0x65, + 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, + 0x43, 0x0a, 0x0a, 0x47, 0x65, 0x74, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x12, 0x19, 0x2e, + 0x61, 0x70, 0x69, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, + 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1a, 0x2e, 0x61, 0x70, 0x69, 0x2e, 0x76, + 0x31, 0x2e, 0x47, 0x65, 0x74, 0x53, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x52, 0x65, 0x73, 0x70, + 0x6f, 0x6e, 0x73, 0x65, 0x42, 0x38, 0x5a, 0x36, 0x62, 0x65, 0x61, 0x6d, 0x2e, 0x61, 0x70, 0x61, + 0x63, 0x68, 0x65, 0x2e, 0x6f, 0x72, 0x67, 0x2f, 0x70, 0x6c, 0x61, 0x79, 0x67, 0x72, 0x6f, 
0x75, + 0x6e, 0x64, 0x2f, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2f, 0x69, 0x6e, 0x74, 0x65, 0x72, + 0x6e, 0x61, 0x6c, 0x3b, 0x70, 0x6c, 0x61, 0x79, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x62, 0x06, + 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_api_proto_rawDescOnce sync.Once + file_api_proto_rawDescData = file_api_proto_rawDesc +) + +func file_api_proto_rawDescGZIP() []byte { + file_api_proto_rawDescOnce.Do(func() { + file_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_api_proto_rawDescData) + }) + return file_api_proto_rawDescData +} + +var file_api_proto_enumTypes = make([]protoimpl.EnumInfo, 4) +var file_api_proto_msgTypes = make([]protoimpl.MessageInfo, 42) +var file_api_proto_goTypes = []interface{}{ + (Sdk)(0), // 0: api.v1.Sdk + (Status)(0), // 1: api.v1.Status + (PrecompiledObjectType)(0), // 2: api.v1.PrecompiledObjectType + (Complexity)(0), // 3: api.v1.Complexity + (*RunCodeRequest)(nil), // 4: api.v1.RunCodeRequest + (*RunCodeResponse)(nil), // 5: api.v1.RunCodeResponse + (*CheckStatusRequest)(nil), // 6: api.v1.CheckStatusRequest + (*CheckStatusResponse)(nil), // 7: api.v1.CheckStatusResponse + (*GetValidationOutputRequest)(nil), // 8: api.v1.GetValidationOutputRequest + (*GetValidationOutputResponse)(nil), // 9: api.v1.GetValidationOutputResponse + (*GetPreparationOutputRequest)(nil), // 10: api.v1.GetPreparationOutputRequest + (*GetPreparationOutputResponse)(nil), // 11: api.v1.GetPreparationOutputResponse + (*GetCompileOutputRequest)(nil), // 12: api.v1.GetCompileOutputRequest + (*GetCompileOutputResponse)(nil), // 13: api.v1.GetCompileOutputResponse + (*GetRunOutputRequest)(nil), // 14: api.v1.GetRunOutputRequest + (*GetRunOutputResponse)(nil), // 15: api.v1.GetRunOutputResponse + (*GetRunErrorRequest)(nil), // 16: api.v1.GetRunErrorRequest + (*GetRunErrorResponse)(nil), // 17: api.v1.GetRunErrorResponse + (*GetLogsRequest)(nil), // 18: api.v1.GetLogsRequest + (*GetLogsResponse)(nil), // 19: api.v1.GetLogsResponse + 
(*GetGraphRequest)(nil), // 20: api.v1.GetGraphRequest + (*GetGraphResponse)(nil), // 21: api.v1.GetGraphResponse + (*CancelRequest)(nil), // 22: api.v1.CancelRequest + (*CancelResponse)(nil), // 23: api.v1.CancelResponse + (*PrecompiledObject)(nil), // 24: api.v1.PrecompiledObject + (*Categories)(nil), // 25: api.v1.Categories + (*GetPrecompiledObjectsRequest)(nil), // 26: api.v1.GetPrecompiledObjectsRequest + (*GetPrecompiledObjectRequest)(nil), // 27: api.v1.GetPrecompiledObjectRequest + (*GetPrecompiledObjectCodeRequest)(nil), // 28: api.v1.GetPrecompiledObjectCodeRequest + (*GetPrecompiledObjectOutputRequest)(nil), // 29: api.v1.GetPrecompiledObjectOutputRequest + (*GetPrecompiledObjectLogsRequest)(nil), // 30: api.v1.GetPrecompiledObjectLogsRequest + (*GetPrecompiledObjectGraphRequest)(nil), // 31: api.v1.GetPrecompiledObjectGraphRequest + (*GetDefaultPrecompiledObjectRequest)(nil), // 32: api.v1.GetDefaultPrecompiledObjectRequest + (*GetPrecompiledObjectsResponse)(nil), // 33: api.v1.GetPrecompiledObjectsResponse + (*GetPrecompiledObjectResponse)(nil), // 34: api.v1.GetPrecompiledObjectResponse + (*GetPrecompiledObjectCodeResponse)(nil), // 35: api.v1.GetPrecompiledObjectCodeResponse + (*GetPrecompiledObjectOutputResponse)(nil), // 36: api.v1.GetPrecompiledObjectOutputResponse + (*GetPrecompiledObjectLogsResponse)(nil), // 37: api.v1.GetPrecompiledObjectLogsResponse + (*GetPrecompiledObjectGraphResponse)(nil), // 38: api.v1.GetPrecompiledObjectGraphResponse + (*GetDefaultPrecompiledObjectResponse)(nil), // 39: api.v1.GetDefaultPrecompiledObjectResponse + (*SnippetFile)(nil), // 40: api.v1.SnippetFile + (*SaveSnippetRequest)(nil), // 41: api.v1.SaveSnippetRequest + (*SaveSnippetResponse)(nil), // 42: api.v1.SaveSnippetResponse + (*GetSnippetRequest)(nil), // 43: api.v1.GetSnippetRequest + (*GetSnippetResponse)(nil), // 44: api.v1.GetSnippetResponse + (*Categories_Category)(nil), // 45: api.v1.Categories.Category +} +var file_api_proto_depIdxs = []int32{ + 0, 
// 0: api.v1.RunCodeRequest.sdk:type_name -> api.v1.Sdk + 1, // 1: api.v1.CheckStatusResponse.status:type_name -> api.v1.Status + 2, // 2: api.v1.PrecompiledObject.type:type_name -> api.v1.PrecompiledObjectType + 0, // 3: api.v1.PrecompiledObject.sdk:type_name -> api.v1.Sdk + 3, // 4: api.v1.PrecompiledObject.complexity:type_name -> api.v1.Complexity + 0, // 5: api.v1.Categories.sdk:type_name -> api.v1.Sdk + 45, // 6: api.v1.Categories.categories:type_name -> api.v1.Categories.Category + 0, // 7: api.v1.GetPrecompiledObjectsRequest.sdk:type_name -> api.v1.Sdk + 0, // 8: api.v1.GetDefaultPrecompiledObjectRequest.sdk:type_name -> api.v1.Sdk + 25, // 9: api.v1.GetPrecompiledObjectsResponse.sdk_categories:type_name -> api.v1.Categories + 24, // 10: api.v1.GetPrecompiledObjectResponse.precompiled_object:type_name -> api.v1.PrecompiledObject + 24, // 11: api.v1.GetDefaultPrecompiledObjectResponse.precompiled_object:type_name -> api.v1.PrecompiledObject + 40, // 12: api.v1.SaveSnippetRequest.files:type_name -> api.v1.SnippetFile + 0, // 13: api.v1.SaveSnippetRequest.sdk:type_name -> api.v1.Sdk + 3, // 14: api.v1.SaveSnippetRequest.complexity:type_name -> api.v1.Complexity + 40, // 15: api.v1.GetSnippetResponse.files:type_name -> api.v1.SnippetFile + 0, // 16: api.v1.GetSnippetResponse.sdk:type_name -> api.v1.Sdk + 3, // 17: api.v1.GetSnippetResponse.complexity:type_name -> api.v1.Complexity + 24, // 18: api.v1.Categories.Category.precompiled_objects:type_name -> api.v1.PrecompiledObject + 4, // 19: api.v1.PlaygroundService.RunCode:input_type -> api.v1.RunCodeRequest + 6, // 20: api.v1.PlaygroundService.CheckStatus:input_type -> api.v1.CheckStatusRequest + 14, // 21: api.v1.PlaygroundService.GetRunOutput:input_type -> api.v1.GetRunOutputRequest + 18, // 22: api.v1.PlaygroundService.GetLogs:input_type -> api.v1.GetLogsRequest + 20, // 23: api.v1.PlaygroundService.GetGraph:input_type -> api.v1.GetGraphRequest + 16, // 24: api.v1.PlaygroundService.GetRunError:input_type -> 
api.v1.GetRunErrorRequest + 8, // 25: api.v1.PlaygroundService.GetValidationOutput:input_type -> api.v1.GetValidationOutputRequest + 10, // 26: api.v1.PlaygroundService.GetPreparationOutput:input_type -> api.v1.GetPreparationOutputRequest + 12, // 27: api.v1.PlaygroundService.GetCompileOutput:input_type -> api.v1.GetCompileOutputRequest + 22, // 28: api.v1.PlaygroundService.Cancel:input_type -> api.v1.CancelRequest + 26, // 29: api.v1.PlaygroundService.GetPrecompiledObjects:input_type -> api.v1.GetPrecompiledObjectsRequest + 27, // 30: api.v1.PlaygroundService.GetPrecompiledObject:input_type -> api.v1.GetPrecompiledObjectRequest + 28, // 31: api.v1.PlaygroundService.GetPrecompiledObjectCode:input_type -> api.v1.GetPrecompiledObjectCodeRequest + 29, // 32: api.v1.PlaygroundService.GetPrecompiledObjectOutput:input_type -> api.v1.GetPrecompiledObjectOutputRequest + 30, // 33: api.v1.PlaygroundService.GetPrecompiledObjectLogs:input_type -> api.v1.GetPrecompiledObjectLogsRequest + 31, // 34: api.v1.PlaygroundService.GetPrecompiledObjectGraph:input_type -> api.v1.GetPrecompiledObjectGraphRequest + 32, // 35: api.v1.PlaygroundService.GetDefaultPrecompiledObject:input_type -> api.v1.GetDefaultPrecompiledObjectRequest + 41, // 36: api.v1.PlaygroundService.SaveSnippet:input_type -> api.v1.SaveSnippetRequest + 43, // 37: api.v1.PlaygroundService.GetSnippet:input_type -> api.v1.GetSnippetRequest + 5, // 38: api.v1.PlaygroundService.RunCode:output_type -> api.v1.RunCodeResponse + 7, // 39: api.v1.PlaygroundService.CheckStatus:output_type -> api.v1.CheckStatusResponse + 15, // 40: api.v1.PlaygroundService.GetRunOutput:output_type -> api.v1.GetRunOutputResponse + 19, // 41: api.v1.PlaygroundService.GetLogs:output_type -> api.v1.GetLogsResponse + 21, // 42: api.v1.PlaygroundService.GetGraph:output_type -> api.v1.GetGraphResponse + 17, // 43: api.v1.PlaygroundService.GetRunError:output_type -> api.v1.GetRunErrorResponse + 9, // 44: 
api.v1.PlaygroundService.GetValidationOutput:output_type -> api.v1.GetValidationOutputResponse + 11, // 45: api.v1.PlaygroundService.GetPreparationOutput:output_type -> api.v1.GetPreparationOutputResponse + 13, // 46: api.v1.PlaygroundService.GetCompileOutput:output_type -> api.v1.GetCompileOutputResponse + 23, // 47: api.v1.PlaygroundService.Cancel:output_type -> api.v1.CancelResponse + 33, // 48: api.v1.PlaygroundService.GetPrecompiledObjects:output_type -> api.v1.GetPrecompiledObjectsResponse + 34, // 49: api.v1.PlaygroundService.GetPrecompiledObject:output_type -> api.v1.GetPrecompiledObjectResponse + 35, // 50: api.v1.PlaygroundService.GetPrecompiledObjectCode:output_type -> api.v1.GetPrecompiledObjectCodeResponse + 36, // 51: api.v1.PlaygroundService.GetPrecompiledObjectOutput:output_type -> api.v1.GetPrecompiledObjectOutputResponse + 37, // 52: api.v1.PlaygroundService.GetPrecompiledObjectLogs:output_type -> api.v1.GetPrecompiledObjectLogsResponse + 38, // 53: api.v1.PlaygroundService.GetPrecompiledObjectGraph:output_type -> api.v1.GetPrecompiledObjectGraphResponse + 39, // 54: api.v1.PlaygroundService.GetDefaultPrecompiledObject:output_type -> api.v1.GetDefaultPrecompiledObjectResponse + 42, // 55: api.v1.PlaygroundService.SaveSnippet:output_type -> api.v1.SaveSnippetResponse + 44, // 56: api.v1.PlaygroundService.GetSnippet:output_type -> api.v1.GetSnippetResponse + 38, // [38:57] is the sub-list for method output_type + 19, // [19:38] is the sub-list for method input_type + 19, // [19:19] is the sub-list for extension type_name + 19, // [19:19] is the sub-list for extension extendee + 0, // [0:19] is the sub-list for field type_name +} + +func init() { file_api_proto_init() } +func file_api_proto_init() { + if File_api_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*RunCodeRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + 
case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*RunCodeResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CheckStatusRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CheckStatusResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetValidationOutputRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetValidationOutputResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPreparationOutputRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPreparationOutputResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetCompileOutputRequest); i { + case 
0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[9].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetCompileOutputResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[10].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetRunOutputRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetRunOutputResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[12].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetRunErrorRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[13].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetRunErrorResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[14].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetLogsRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[15].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetLogsResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[16].Exporter = func(v interface{}, i int) interface{} { + switch v := 
v.(*GetGraphRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[17].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetGraphResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[18].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CancelRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[19].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CancelResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[20].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*PrecompiledObject); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[21].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Categories); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[22].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectsRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[23].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[24].Exporter = func(v interface{}, i int) 
interface{} { + switch v := v.(*GetPrecompiledObjectCodeRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[25].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectOutputRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[26].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectLogsRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[27].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectGraphRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[28].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetDefaultPrecompiledObjectRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[29].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectsResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[30].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[31].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectCodeResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + 
case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[32].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectOutputResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[33].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectLogsResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[34].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetPrecompiledObjectGraphResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[35].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetDefaultPrecompiledObjectResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[36].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SnippetFile); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[37].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SaveSnippetRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[38].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SaveSnippetResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[39].Exporter = func(v interface{}, i int) interface{} { + switch v := 
v.(*GetSnippetRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[40].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*GetSnippetResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_api_proto_msgTypes[41].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*Categories_Category); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_api_proto_rawDesc, + NumEnums: 4, + NumMessages: 42, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_api_proto_goTypes, + DependencyIndexes: file_api_proto_depIdxs, + EnumInfos: file_api_proto_enumTypes, + MessageInfos: file_api_proto_msgTypes, + }.Build() + File_api_proto = out.File + file_api_proto_rawDesc = nil + file_api_proto_goTypes = nil + file_api_proto_depIdxs = nil +} diff --git a/learning/tour-of-beam/backend/playground_api/api_grpc.pb.go b/learning/tour-of-beam/backend/playground_api/api_grpc.pb.go new file mode 100644 index 000000000000..e50d4961adee --- /dev/null +++ b/learning/tour-of-beam/backend/playground_api/api_grpc.pb.go @@ -0,0 +1,791 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.2.0 +// - protoc v3.12.4 +// source: api.proto + +package playground + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.32.0 or later. 
+const _ = grpc.SupportPackageIsVersion7 + +// PlaygroundServiceClient is the client API for PlaygroundService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type PlaygroundServiceClient interface { + // Submit the job for an execution and get the pipeline uuid. + RunCode(ctx context.Context, in *RunCodeRequest, opts ...grpc.CallOption) (*RunCodeResponse, error) + // Get the status of pipeline execution. + CheckStatus(ctx context.Context, in *CheckStatusRequest, opts ...grpc.CallOption) (*CheckStatusResponse, error) + // Get the result of pipeline execution. + GetRunOutput(ctx context.Context, in *GetRunOutputRequest, opts ...grpc.CallOption) (*GetRunOutputResponse, error) + // Get the logs of pipeline execution. + GetLogs(ctx context.Context, in *GetLogsRequest, opts ...grpc.CallOption) (*GetLogsResponse, error) + // Get the string representation of the pipeline execution graph in DOT format. + GetGraph(ctx context.Context, in *GetGraphRequest, opts ...grpc.CallOption) (*GetGraphResponse, error) + // Get the error of pipeline execution. + GetRunError(ctx context.Context, in *GetRunErrorRequest, opts ...grpc.CallOption) (*GetRunErrorResponse, error) + // Get the result of pipeline validation. + GetValidationOutput(ctx context.Context, in *GetValidationOutputRequest, opts ...grpc.CallOption) (*GetValidationOutputResponse, error) + // Get the result of pipeline preparation. + GetPreparationOutput(ctx context.Context, in *GetPreparationOutputRequest, opts ...grpc.CallOption) (*GetPreparationOutputResponse, error) + // Get the result of pipeline compilation. 
+ GetCompileOutput(ctx context.Context, in *GetCompileOutputRequest, opts ...grpc.CallOption) (*GetCompileOutputResponse, error) + // Cancel code processing + Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) + // Get all precompiled objects from the cloud datastore. + GetPrecompiledObjects(ctx context.Context, in *GetPrecompiledObjectsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectsResponse, error) + // Get precompiled object from the cloud datastore. + GetPrecompiledObject(ctx context.Context, in *GetPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectResponse, error) + // Get the code of an PrecompiledObject. + GetPrecompiledObjectCode(ctx context.Context, in *GetPrecompiledObjectCodeRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectCodeResponse, error) + // Get the precompiled details of an PrecompiledObject. + GetPrecompiledObjectOutput(ctx context.Context, in *GetPrecompiledObjectOutputRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectOutputResponse, error) + // Get the logs of an PrecompiledObject. + GetPrecompiledObjectLogs(ctx context.Context, in *GetPrecompiledObjectLogsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectLogsResponse, error) + // Get the graph of an PrecompiledObject. + GetPrecompiledObjectGraph(ctx context.Context, in *GetPrecompiledObjectGraphRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectGraphResponse, error) + // Get the default precompile object for the sdk. + GetDefaultPrecompiledObject(ctx context.Context, in *GetDefaultPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetDefaultPrecompiledObjectResponse, error) + // Save the snippet required for the sharing. + SaveSnippet(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) + // Get the snippet of playground. 
+ GetSnippet(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) +} + +type playgroundServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewPlaygroundServiceClient(cc grpc.ClientConnInterface) PlaygroundServiceClient { + return &playgroundServiceClient{cc} +} + +func (c *playgroundServiceClient) RunCode(ctx context.Context, in *RunCodeRequest, opts ...grpc.CallOption) (*RunCodeResponse, error) { + out := new(RunCodeResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/RunCode", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) CheckStatus(ctx context.Context, in *CheckStatusRequest, opts ...grpc.CallOption) (*CheckStatusResponse, error) { + out := new(CheckStatusResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/CheckStatus", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetRunOutput(ctx context.Context, in *GetRunOutputRequest, opts ...grpc.CallOption) (*GetRunOutputResponse, error) { + out := new(GetRunOutputResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetRunOutput", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetLogs(ctx context.Context, in *GetLogsRequest, opts ...grpc.CallOption) (*GetLogsResponse, error) { + out := new(GetLogsResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetLogs", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetGraph(ctx context.Context, in *GetGraphRequest, opts ...grpc.CallOption) (*GetGraphResponse, error) { + out := new(GetGraphResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetGraph", in, out, opts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetRunError(ctx context.Context, in *GetRunErrorRequest, opts ...grpc.CallOption) (*GetRunErrorResponse, error) { + out := new(GetRunErrorResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetRunError", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetValidationOutput(ctx context.Context, in *GetValidationOutputRequest, opts ...grpc.CallOption) (*GetValidationOutputResponse, error) { + out := new(GetValidationOutputResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetValidationOutput", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPreparationOutput(ctx context.Context, in *GetPreparationOutputRequest, opts ...grpc.CallOption) (*GetPreparationOutputResponse, error) { + out := new(GetPreparationOutputResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPreparationOutput", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetCompileOutput(ctx context.Context, in *GetCompileOutputRequest, opts ...grpc.CallOption) (*GetCompileOutputResponse, error) { + out := new(GetCompileOutputResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetCompileOutput", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { + out := new(CancelResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/Cancel", in, out, opts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObjects(ctx context.Context, in *GetPrecompiledObjectsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectsResponse, error) { + out := new(GetPrecompiledObjectsResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObjects", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObject(ctx context.Context, in *GetPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectResponse, error) { + out := new(GetPrecompiledObjectResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObject", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObjectCode(ctx context.Context, in *GetPrecompiledObjectCodeRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectCodeResponse, error) { + out := new(GetPrecompiledObjectCodeResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObjectCode", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObjectOutput(ctx context.Context, in *GetPrecompiledObjectOutputRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectOutputResponse, error) { + out := new(GetPrecompiledObjectOutputResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObjectOutput", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObjectLogs(ctx context.Context, in *GetPrecompiledObjectLogsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectLogsResponse, error) { + out := new(GetPrecompiledObjectLogsResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObjectLogs", in, out, opts...) 
+ if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetPrecompiledObjectGraph(ctx context.Context, in *GetPrecompiledObjectGraphRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectGraphResponse, error) { + out := new(GetPrecompiledObjectGraphResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetPrecompiledObjectGraph", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetDefaultPrecompiledObject(ctx context.Context, in *GetDefaultPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetDefaultPrecompiledObjectResponse, error) { + out := new(GetDefaultPrecompiledObjectResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetDefaultPrecompiledObject", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) SaveSnippet(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) { + out := new(SaveSnippetResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/SaveSnippet", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +func (c *playgroundServiceClient) GetSnippet(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) { + out := new(GetSnippetResponse) + err := c.cc.Invoke(ctx, "/api.v1.PlaygroundService/GetSnippet", in, out, opts...) + if err != nil { + return nil, err + } + return out, nil +} + +// PlaygroundServiceServer is the server API for PlaygroundService service. +// All implementations must embed UnimplementedPlaygroundServiceServer +// for forward compatibility +type PlaygroundServiceServer interface { + // Submit the job for an execution and get the pipeline uuid. + RunCode(context.Context, *RunCodeRequest) (*RunCodeResponse, error) + // Get the status of pipeline execution. 
+ CheckStatus(context.Context, *CheckStatusRequest) (*CheckStatusResponse, error) + // Get the result of pipeline execution. + GetRunOutput(context.Context, *GetRunOutputRequest) (*GetRunOutputResponse, error) + // Get the logs of pipeline execution. + GetLogs(context.Context, *GetLogsRequest) (*GetLogsResponse, error) + // Get the string representation of the pipeline execution graph in DOT format. + GetGraph(context.Context, *GetGraphRequest) (*GetGraphResponse, error) + // Get the error of pipeline execution. + GetRunError(context.Context, *GetRunErrorRequest) (*GetRunErrorResponse, error) + // Get the result of pipeline validation. + GetValidationOutput(context.Context, *GetValidationOutputRequest) (*GetValidationOutputResponse, error) + // Get the result of pipeline preparation. + GetPreparationOutput(context.Context, *GetPreparationOutputRequest) (*GetPreparationOutputResponse, error) + // Get the result of pipeline compilation. + GetCompileOutput(context.Context, *GetCompileOutputRequest) (*GetCompileOutputResponse, error) + // Cancel code processing + Cancel(context.Context, *CancelRequest) (*CancelResponse, error) + // Get all precompiled objects from the cloud datastore. + GetPrecompiledObjects(context.Context, *GetPrecompiledObjectsRequest) (*GetPrecompiledObjectsResponse, error) + // Get precompiled object from the cloud datastore. + GetPrecompiledObject(context.Context, *GetPrecompiledObjectRequest) (*GetPrecompiledObjectResponse, error) + // Get the code of an PrecompiledObject. + GetPrecompiledObjectCode(context.Context, *GetPrecompiledObjectCodeRequest) (*GetPrecompiledObjectCodeResponse, error) + // Get the precompiled details of an PrecompiledObject. + GetPrecompiledObjectOutput(context.Context, *GetPrecompiledObjectOutputRequest) (*GetPrecompiledObjectOutputResponse, error) + // Get the logs of an PrecompiledObject. 
+ GetPrecompiledObjectLogs(context.Context, *GetPrecompiledObjectLogsRequest) (*GetPrecompiledObjectLogsResponse, error) + // Get the graph of an PrecompiledObject. + GetPrecompiledObjectGraph(context.Context, *GetPrecompiledObjectGraphRequest) (*GetPrecompiledObjectGraphResponse, error) + // Get the default precompile object for the sdk. + GetDefaultPrecompiledObject(context.Context, *GetDefaultPrecompiledObjectRequest) (*GetDefaultPrecompiledObjectResponse, error) + // Save the snippet required for the sharing. + SaveSnippet(context.Context, *SaveSnippetRequest) (*SaveSnippetResponse, error) + // Get the snippet of playground. + GetSnippet(context.Context, *GetSnippetRequest) (*GetSnippetResponse, error) + mustEmbedUnimplementedPlaygroundServiceServer() +} + +// UnimplementedPlaygroundServiceServer must be embedded to have forward compatible implementations. +type UnimplementedPlaygroundServiceServer struct { +} + +func (UnimplementedPlaygroundServiceServer) RunCode(context.Context, *RunCodeRequest) (*RunCodeResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method RunCode not implemented") +} +func (UnimplementedPlaygroundServiceServer) CheckStatus(context.Context, *CheckStatusRequest) (*CheckStatusResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method CheckStatus not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetRunOutput(context.Context, *GetRunOutputRequest) (*GetRunOutputResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetRunOutput not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetLogs(context.Context, *GetLogsRequest) (*GetLogsResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetLogs not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetGraph(context.Context, *GetGraphRequest) (*GetGraphResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetGraph not implemented") +} +func 
(UnimplementedPlaygroundServiceServer) GetRunError(context.Context, *GetRunErrorRequest) (*GetRunErrorResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetRunError not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetValidationOutput(context.Context, *GetValidationOutputRequest) (*GetValidationOutputResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetValidationOutput not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPreparationOutput(context.Context, *GetPreparationOutputRequest) (*GetPreparationOutputResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPreparationOutput not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetCompileOutput(context.Context, *GetCompileOutputRequest) (*GetCompileOutputResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetCompileOutput not implemented") +} +func (UnimplementedPlaygroundServiceServer) Cancel(context.Context, *CancelRequest) (*CancelResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method Cancel not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPrecompiledObjects(context.Context, *GetPrecompiledObjectsRequest) (*GetPrecompiledObjectsResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObjects not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPrecompiledObject(context.Context, *GetPrecompiledObjectRequest) (*GetPrecompiledObjectResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObject not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPrecompiledObjectCode(context.Context, *GetPrecompiledObjectCodeRequest) (*GetPrecompiledObjectCodeResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObjectCode not implemented") +} +func (UnimplementedPlaygroundServiceServer) 
GetPrecompiledObjectOutput(context.Context, *GetPrecompiledObjectOutputRequest) (*GetPrecompiledObjectOutputResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObjectOutput not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPrecompiledObjectLogs(context.Context, *GetPrecompiledObjectLogsRequest) (*GetPrecompiledObjectLogsResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObjectLogs not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetPrecompiledObjectGraph(context.Context, *GetPrecompiledObjectGraphRequest) (*GetPrecompiledObjectGraphResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetPrecompiledObjectGraph not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetDefaultPrecompiledObject(context.Context, *GetDefaultPrecompiledObjectRequest) (*GetDefaultPrecompiledObjectResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetDefaultPrecompiledObject not implemented") +} +func (UnimplementedPlaygroundServiceServer) SaveSnippet(context.Context, *SaveSnippetRequest) (*SaveSnippetResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method SaveSnippet not implemented") +} +func (UnimplementedPlaygroundServiceServer) GetSnippet(context.Context, *GetSnippetRequest) (*GetSnippetResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method GetSnippet not implemented") +} +func (UnimplementedPlaygroundServiceServer) mustEmbedUnimplementedPlaygroundServiceServer() {} + +// UnsafePlaygroundServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to PlaygroundServiceServer will +// result in compilation errors. 
+type UnsafePlaygroundServiceServer interface { + mustEmbedUnimplementedPlaygroundServiceServer() +} + +func RegisterPlaygroundServiceServer(s grpc.ServiceRegistrar, srv PlaygroundServiceServer) { + s.RegisterService(&PlaygroundService_ServiceDesc, srv) +} + +func _PlaygroundService_RunCode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(RunCodeRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).RunCode(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/RunCode", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).RunCode(ctx, req.(*RunCodeRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_CheckStatus_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(CheckStatusRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).CheckStatus(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/CheckStatus", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).CheckStatus(ctx, req.(*CheckStatusRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetRunOutput_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetRunOutputRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetRunOutput(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + 
FullMethod: "/api.v1.PlaygroundService/GetRunOutput", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetRunOutput(ctx, req.(*GetRunOutputRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetLogs_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetLogsRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetLogs(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetLogs", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetLogs(ctx, req.(*GetLogsRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetGraph_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetGraphRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetGraph(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetGraph", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetGraph(ctx, req.(*GetGraphRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetRunError_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetRunErrorRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetRunError(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + 
FullMethod: "/api.v1.PlaygroundService/GetRunError", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetRunError(ctx, req.(*GetRunErrorRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetValidationOutput_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetValidationOutputRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetValidationOutput(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetValidationOutput", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetValidationOutput(ctx, req.(*GetValidationOutputRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPreparationOutput_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPreparationOutputRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPreparationOutput(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPreparationOutput", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPreparationOutput(ctx, req.(*GetPreparationOutputRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetCompileOutput_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetCompileOutputRequest) + if err := dec(in); err != nil { + return nil, err 
+ } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetCompileOutput(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetCompileOutput", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetCompileOutput(ctx, req.(*GetCompileOutputRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_Cancel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(CancelRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).Cancel(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/Cancel", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).Cancel(ctx, req.(*CancelRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObjects_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPrecompiledObjectsRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObjects(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObjects", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObjects(ctx, req.(*GetPrecompiledObjectsRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObject_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, 
error) { + in := new(GetPrecompiledObjectRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObject(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObject", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObject(ctx, req.(*GetPrecompiledObjectRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObjectCode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPrecompiledObjectCodeRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectCode(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObjectCode", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectCode(ctx, req.(*GetPrecompiledObjectCodeRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObjectOutput_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPrecompiledObjectOutputRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectOutput(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObjectOutput", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectOutput(ctx, 
req.(*GetPrecompiledObjectOutputRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObjectLogs_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPrecompiledObjectLogsRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectLogs(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObjectLogs", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectLogs(ctx, req.(*GetPrecompiledObjectLogsRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetPrecompiledObjectGraph_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetPrecompiledObjectGraphRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectGraph(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetPrecompiledObjectGraph", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetPrecompiledObjectGraph(ctx, req.(*GetPrecompiledObjectGraphRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetDefaultPrecompiledObject_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetDefaultPrecompiledObjectRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return 
srv.(PlaygroundServiceServer).GetDefaultPrecompiledObject(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetDefaultPrecompiledObject", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetDefaultPrecompiledObject(ctx, req.(*GetDefaultPrecompiledObjectRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_SaveSnippet_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(SaveSnippetRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).SaveSnippet(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/SaveSnippet", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).SaveSnippet(ctx, req.(*SaveSnippetRequest)) + } + return interceptor(ctx, in, info, handler) +} + +func _PlaygroundService_GetSnippet_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(GetSnippetRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(PlaygroundServiceServer).GetSnippet(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: "/api.v1.PlaygroundService/GetSnippet", + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(PlaygroundServiceServer).GetSnippet(ctx, req.(*GetSnippetRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// PlaygroundService_ServiceDesc is the grpc.ServiceDesc for PlaygroundService service. 
+// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var PlaygroundService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "api.v1.PlaygroundService", + HandlerType: (*PlaygroundServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "RunCode", + Handler: _PlaygroundService_RunCode_Handler, + }, + { + MethodName: "CheckStatus", + Handler: _PlaygroundService_CheckStatus_Handler, + }, + { + MethodName: "GetRunOutput", + Handler: _PlaygroundService_GetRunOutput_Handler, + }, + { + MethodName: "GetLogs", + Handler: _PlaygroundService_GetLogs_Handler, + }, + { + MethodName: "GetGraph", + Handler: _PlaygroundService_GetGraph_Handler, + }, + { + MethodName: "GetRunError", + Handler: _PlaygroundService_GetRunError_Handler, + }, + { + MethodName: "GetValidationOutput", + Handler: _PlaygroundService_GetValidationOutput_Handler, + }, + { + MethodName: "GetPreparationOutput", + Handler: _PlaygroundService_GetPreparationOutput_Handler, + }, + { + MethodName: "GetCompileOutput", + Handler: _PlaygroundService_GetCompileOutput_Handler, + }, + { + MethodName: "Cancel", + Handler: _PlaygroundService_Cancel_Handler, + }, + { + MethodName: "GetPrecompiledObjects", + Handler: _PlaygroundService_GetPrecompiledObjects_Handler, + }, + { + MethodName: "GetPrecompiledObject", + Handler: _PlaygroundService_GetPrecompiledObject_Handler, + }, + { + MethodName: "GetPrecompiledObjectCode", + Handler: _PlaygroundService_GetPrecompiledObjectCode_Handler, + }, + { + MethodName: "GetPrecompiledObjectOutput", + Handler: _PlaygroundService_GetPrecompiledObjectOutput_Handler, + }, + { + MethodName: "GetPrecompiledObjectLogs", + Handler: _PlaygroundService_GetPrecompiledObjectLogs_Handler, + }, + { + MethodName: "GetPrecompiledObjectGraph", + Handler: _PlaygroundService_GetPrecompiledObjectGraph_Handler, + }, + { + MethodName: "GetDefaultPrecompiledObject", + Handler: 
_PlaygroundService_GetDefaultPrecompiledObject_Handler, + }, + { + MethodName: "SaveSnippet", + Handler: _PlaygroundService_SaveSnippet_Handler, + }, + { + MethodName: "GetSnippet", + Handler: _PlaygroundService_GetSnippet_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "api.proto", +} diff --git a/learning/tour-of-beam/backend/playground_api/helper.go b/learning/tour-of-beam/backend/playground_api/helper.go new file mode 100644 index 000000000000..896d3ecc5cf0 --- /dev/null +++ b/learning/tour-of-beam/backend/playground_api/helper.go @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package playground + +import ( + context "context" + + grpc "google.golang.org/grpc" +) + +func GetMockClient() PlaygroundServiceClient { + + return &PlaygroundServiceClientMock{ + SaveSnippetFunc: func(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) { + return &SaveSnippetResponse{Id: "snippet_id_1"}, nil + }, + GetSnippetFunc: func(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) { + return &GetSnippetResponse{ + Files: []*SnippetFile{ + {Name: "main.py", Content: "import sys; sys.exit(0)", IsMain: true}, + }, + Sdk: Sdk_SDK_PYTHON, + PipelineOptions: "some opts", + }, nil + }, + } +} diff --git a/learning/tour-of-beam/backend/playground_api/mock.go b/learning/tour-of-beam/backend/playground_api/mock.go new file mode 100644 index 000000000000..c4dc009c1ff3 --- /dev/null +++ b/learning/tour-of-beam/backend/playground_api/mock.go @@ -0,0 +1,1077 @@ +// Code generated by moq; DO NOT EDIT. +// github.com/matryer/moq + +package playground + +import ( + context "context" + grpc "google.golang.org/grpc" + sync "sync" +) + +// Ensure, that PlaygroundServiceClientMock does implement PlaygroundServiceClient. +// If this is not the case, regenerate this file with moq. +var _ PlaygroundServiceClient = &PlaygroundServiceClientMock{} + +// PlaygroundServiceClientMock is a mock implementation of PlaygroundServiceClient. 
+// +// func TestSomethingThatUsesPlaygroundServiceClient(t *testing.T) { +// +// // make and configure a mocked PlaygroundServiceClient +// mockedPlaygroundServiceClient := &PlaygroundServiceClientMock{ +// CancelFunc: func(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { +// panic("mock out the Cancel method") +// }, +// CheckStatusFunc: func(ctx context.Context, in *CheckStatusRequest, opts ...grpc.CallOption) (*CheckStatusResponse, error) { +// panic("mock out the CheckStatus method") +// }, +// GetCompileOutputFunc: func(ctx context.Context, in *GetCompileOutputRequest, opts ...grpc.CallOption) (*GetCompileOutputResponse, error) { +// panic("mock out the GetCompileOutput method") +// }, +// GetDefaultPrecompiledObjectFunc: func(ctx context.Context, in *GetDefaultPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetDefaultPrecompiledObjectResponse, error) { +// panic("mock out the GetDefaultPrecompiledObject method") +// }, +// GetGraphFunc: func(ctx context.Context, in *GetGraphRequest, opts ...grpc.CallOption) (*GetGraphResponse, error) { +// panic("mock out the GetGraph method") +// }, +// GetLogsFunc: func(ctx context.Context, in *GetLogsRequest, opts ...grpc.CallOption) (*GetLogsResponse, error) { +// panic("mock out the GetLogs method") +// }, +// GetPrecompiledObjectFunc: func(ctx context.Context, in *GetPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectResponse, error) { +// panic("mock out the GetPrecompiledObject method") +// }, +// GetPrecompiledObjectCodeFunc: func(ctx context.Context, in *GetPrecompiledObjectCodeRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectCodeResponse, error) { +// panic("mock out the GetPrecompiledObjectCode method") +// }, +// GetPrecompiledObjectGraphFunc: func(ctx context.Context, in *GetPrecompiledObjectGraphRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectGraphResponse, error) { +// panic("mock out the GetPrecompiledObjectGraph method") 
+// }, +// GetPrecompiledObjectLogsFunc: func(ctx context.Context, in *GetPrecompiledObjectLogsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectLogsResponse, error) { +// panic("mock out the GetPrecompiledObjectLogs method") +// }, +// GetPrecompiledObjectOutputFunc: func(ctx context.Context, in *GetPrecompiledObjectOutputRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectOutputResponse, error) { +// panic("mock out the GetPrecompiledObjectOutput method") +// }, +// GetPrecompiledObjectsFunc: func(ctx context.Context, in *GetPrecompiledObjectsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectsResponse, error) { +// panic("mock out the GetPrecompiledObjects method") +// }, +// GetPreparationOutputFunc: func(ctx context.Context, in *GetPreparationOutputRequest, opts ...grpc.CallOption) (*GetPreparationOutputResponse, error) { +// panic("mock out the GetPreparationOutput method") +// }, +// GetRunErrorFunc: func(ctx context.Context, in *GetRunErrorRequest, opts ...grpc.CallOption) (*GetRunErrorResponse, error) { +// panic("mock out the GetRunError method") +// }, +// GetRunOutputFunc: func(ctx context.Context, in *GetRunOutputRequest, opts ...grpc.CallOption) (*GetRunOutputResponse, error) { +// panic("mock out the GetRunOutput method") +// }, +// GetSnippetFunc: func(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) { +// panic("mock out the GetSnippet method") +// }, +// GetValidationOutputFunc: func(ctx context.Context, in *GetValidationOutputRequest, opts ...grpc.CallOption) (*GetValidationOutputResponse, error) { +// panic("mock out the GetValidationOutput method") +// }, +// RunCodeFunc: func(ctx context.Context, in *RunCodeRequest, opts ...grpc.CallOption) (*RunCodeResponse, error) { +// panic("mock out the RunCode method") +// }, +// SaveSnippetFunc: func(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) { +// panic("mock out the SaveSnippet 
method") +// }, +// } +// +// // use mockedPlaygroundServiceClient in code that requires PlaygroundServiceClient +// // and then make assertions. +// +// } +type PlaygroundServiceClientMock struct { + // CancelFunc mocks the Cancel method. + CancelFunc func(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) + + // CheckStatusFunc mocks the CheckStatus method. + CheckStatusFunc func(ctx context.Context, in *CheckStatusRequest, opts ...grpc.CallOption) (*CheckStatusResponse, error) + + // GetCompileOutputFunc mocks the GetCompileOutput method. + GetCompileOutputFunc func(ctx context.Context, in *GetCompileOutputRequest, opts ...grpc.CallOption) (*GetCompileOutputResponse, error) + + // GetDefaultPrecompiledObjectFunc mocks the GetDefaultPrecompiledObject method. + GetDefaultPrecompiledObjectFunc func(ctx context.Context, in *GetDefaultPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetDefaultPrecompiledObjectResponse, error) + + // GetGraphFunc mocks the GetGraph method. + GetGraphFunc func(ctx context.Context, in *GetGraphRequest, opts ...grpc.CallOption) (*GetGraphResponse, error) + + // GetLogsFunc mocks the GetLogs method. + GetLogsFunc func(ctx context.Context, in *GetLogsRequest, opts ...grpc.CallOption) (*GetLogsResponse, error) + + // GetPrecompiledObjectFunc mocks the GetPrecompiledObject method. + GetPrecompiledObjectFunc func(ctx context.Context, in *GetPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectResponse, error) + + // GetPrecompiledObjectCodeFunc mocks the GetPrecompiledObjectCode method. + GetPrecompiledObjectCodeFunc func(ctx context.Context, in *GetPrecompiledObjectCodeRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectCodeResponse, error) + + // GetPrecompiledObjectGraphFunc mocks the GetPrecompiledObjectGraph method. 
+ GetPrecompiledObjectGraphFunc func(ctx context.Context, in *GetPrecompiledObjectGraphRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectGraphResponse, error) + + // GetPrecompiledObjectLogsFunc mocks the GetPrecompiledObjectLogs method. + GetPrecompiledObjectLogsFunc func(ctx context.Context, in *GetPrecompiledObjectLogsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectLogsResponse, error) + + // GetPrecompiledObjectOutputFunc mocks the GetPrecompiledObjectOutput method. + GetPrecompiledObjectOutputFunc func(ctx context.Context, in *GetPrecompiledObjectOutputRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectOutputResponse, error) + + // GetPrecompiledObjectsFunc mocks the GetPrecompiledObjects method. + GetPrecompiledObjectsFunc func(ctx context.Context, in *GetPrecompiledObjectsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectsResponse, error) + + // GetPreparationOutputFunc mocks the GetPreparationOutput method. + GetPreparationOutputFunc func(ctx context.Context, in *GetPreparationOutputRequest, opts ...grpc.CallOption) (*GetPreparationOutputResponse, error) + + // GetRunErrorFunc mocks the GetRunError method. + GetRunErrorFunc func(ctx context.Context, in *GetRunErrorRequest, opts ...grpc.CallOption) (*GetRunErrorResponse, error) + + // GetRunOutputFunc mocks the GetRunOutput method. + GetRunOutputFunc func(ctx context.Context, in *GetRunOutputRequest, opts ...grpc.CallOption) (*GetRunOutputResponse, error) + + // GetSnippetFunc mocks the GetSnippet method. + GetSnippetFunc func(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) + + // GetValidationOutputFunc mocks the GetValidationOutput method. + GetValidationOutputFunc func(ctx context.Context, in *GetValidationOutputRequest, opts ...grpc.CallOption) (*GetValidationOutputResponse, error) + + // RunCodeFunc mocks the RunCode method. 
+ RunCodeFunc func(ctx context.Context, in *RunCodeRequest, opts ...grpc.CallOption) (*RunCodeResponse, error) + + // SaveSnippetFunc mocks the SaveSnippet method. + SaveSnippetFunc func(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) + + // calls tracks calls to the methods. + calls struct { + // Cancel holds details about calls to the Cancel method. + Cancel []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *CancelRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // CheckStatus holds details about calls to the CheckStatus method. + CheckStatus []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *CheckStatusRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetCompileOutput holds details about calls to the GetCompileOutput method. + GetCompileOutput []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetCompileOutputRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetDefaultPrecompiledObject holds details about calls to the GetDefaultPrecompiledObject method. + GetDefaultPrecompiledObject []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetDefaultPrecompiledObjectRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetGraph holds details about calls to the GetGraph method. + GetGraph []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetGraphRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetLogs holds details about calls to the GetLogs method. + GetLogs []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. 
+ In *GetLogsRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObject holds details about calls to the GetPrecompiledObject method. + GetPrecompiledObject []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPrecompiledObjectRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObjectCode holds details about calls to the GetPrecompiledObjectCode method. + GetPrecompiledObjectCode []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPrecompiledObjectCodeRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObjectGraph holds details about calls to the GetPrecompiledObjectGraph method. + GetPrecompiledObjectGraph []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPrecompiledObjectGraphRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObjectLogs holds details about calls to the GetPrecompiledObjectLogs method. + GetPrecompiledObjectLogs []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPrecompiledObjectLogsRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObjectOutput holds details about calls to the GetPrecompiledObjectOutput method. + GetPrecompiledObjectOutput []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPrecompiledObjectOutputRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPrecompiledObjects holds details about calls to the GetPrecompiledObjects method. + GetPrecompiledObjects []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. 
+ In *GetPrecompiledObjectsRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetPreparationOutput holds details about calls to the GetPreparationOutput method. + GetPreparationOutput []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetPreparationOutputRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetRunError holds details about calls to the GetRunError method. + GetRunError []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetRunErrorRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetRunOutput holds details about calls to the GetRunOutput method. + GetRunOutput []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetRunOutputRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetSnippet holds details about calls to the GetSnippet method. + GetSnippet []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetSnippetRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // GetValidationOutput holds details about calls to the GetValidationOutput method. + GetValidationOutput []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *GetValidationOutputRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // RunCode holds details about calls to the RunCode method. + RunCode []struct { + // Ctx is the ctx argument value. + Ctx context.Context + // In is the in argument value. + In *RunCodeRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + // SaveSnippet holds details about calls to the SaveSnippet method. + SaveSnippet []struct { + // Ctx is the ctx argument value. 
+ Ctx context.Context + // In is the in argument value. + In *SaveSnippetRequest + // Opts is the opts argument value. + Opts []grpc.CallOption + } + } + lockCancel sync.RWMutex + lockCheckStatus sync.RWMutex + lockGetCompileOutput sync.RWMutex + lockGetDefaultPrecompiledObject sync.RWMutex + lockGetGraph sync.RWMutex + lockGetLogs sync.RWMutex + lockGetPrecompiledObject sync.RWMutex + lockGetPrecompiledObjectCode sync.RWMutex + lockGetPrecompiledObjectGraph sync.RWMutex + lockGetPrecompiledObjectLogs sync.RWMutex + lockGetPrecompiledObjectOutput sync.RWMutex + lockGetPrecompiledObjects sync.RWMutex + lockGetPreparationOutput sync.RWMutex + lockGetRunError sync.RWMutex + lockGetRunOutput sync.RWMutex + lockGetSnippet sync.RWMutex + lockGetValidationOutput sync.RWMutex + lockRunCode sync.RWMutex + lockSaveSnippet sync.RWMutex +} + +// Cancel calls CancelFunc. +func (mock *PlaygroundServiceClientMock) Cancel(ctx context.Context, in *CancelRequest, opts ...grpc.CallOption) (*CancelResponse, error) { + if mock.CancelFunc == nil { + panic("PlaygroundServiceClientMock.CancelFunc: method is nil but PlaygroundServiceClient.Cancel was just called") + } + callInfo := struct { + Ctx context.Context + In *CancelRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockCancel.Lock() + mock.calls.Cancel = append(mock.calls.Cancel, callInfo) + mock.lockCancel.Unlock() + return mock.CancelFunc(ctx, in, opts...) +} + +// CancelCalls gets all the calls that were made to Cancel. +// Check the length with: +// len(mockedPlaygroundServiceClient.CancelCalls()) +func (mock *PlaygroundServiceClientMock) CancelCalls() []struct { + Ctx context.Context + In *CancelRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *CancelRequest + Opts []grpc.CallOption + } + mock.lockCancel.RLock() + calls = mock.calls.Cancel + mock.lockCancel.RUnlock() + return calls +} + +// CheckStatus calls CheckStatusFunc. 
+func (mock *PlaygroundServiceClientMock) CheckStatus(ctx context.Context, in *CheckStatusRequest, opts ...grpc.CallOption) (*CheckStatusResponse, error) { + if mock.CheckStatusFunc == nil { + panic("PlaygroundServiceClientMock.CheckStatusFunc: method is nil but PlaygroundServiceClient.CheckStatus was just called") + } + callInfo := struct { + Ctx context.Context + In *CheckStatusRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockCheckStatus.Lock() + mock.calls.CheckStatus = append(mock.calls.CheckStatus, callInfo) + mock.lockCheckStatus.Unlock() + return mock.CheckStatusFunc(ctx, in, opts...) +} + +// CheckStatusCalls gets all the calls that were made to CheckStatus. +// Check the length with: +// len(mockedPlaygroundServiceClient.CheckStatusCalls()) +func (mock *PlaygroundServiceClientMock) CheckStatusCalls() []struct { + Ctx context.Context + In *CheckStatusRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *CheckStatusRequest + Opts []grpc.CallOption + } + mock.lockCheckStatus.RLock() + calls = mock.calls.CheckStatus + mock.lockCheckStatus.RUnlock() + return calls +} + +// GetCompileOutput calls GetCompileOutputFunc. +func (mock *PlaygroundServiceClientMock) GetCompileOutput(ctx context.Context, in *GetCompileOutputRequest, opts ...grpc.CallOption) (*GetCompileOutputResponse, error) { + if mock.GetCompileOutputFunc == nil { + panic("PlaygroundServiceClientMock.GetCompileOutputFunc: method is nil but PlaygroundServiceClient.GetCompileOutput was just called") + } + callInfo := struct { + Ctx context.Context + In *GetCompileOutputRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetCompileOutput.Lock() + mock.calls.GetCompileOutput = append(mock.calls.GetCompileOutput, callInfo) + mock.lockGetCompileOutput.Unlock() + return mock.GetCompileOutputFunc(ctx, in, opts...) +} + +// GetCompileOutputCalls gets all the calls that were made to GetCompileOutput. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetCompileOutputCalls()) +func (mock *PlaygroundServiceClientMock) GetCompileOutputCalls() []struct { + Ctx context.Context + In *GetCompileOutputRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetCompileOutputRequest + Opts []grpc.CallOption + } + mock.lockGetCompileOutput.RLock() + calls = mock.calls.GetCompileOutput + mock.lockGetCompileOutput.RUnlock() + return calls +} + +// GetDefaultPrecompiledObject calls GetDefaultPrecompiledObjectFunc. +func (mock *PlaygroundServiceClientMock) GetDefaultPrecompiledObject(ctx context.Context, in *GetDefaultPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetDefaultPrecompiledObjectResponse, error) { + if mock.GetDefaultPrecompiledObjectFunc == nil { + panic("PlaygroundServiceClientMock.GetDefaultPrecompiledObjectFunc: method is nil but PlaygroundServiceClient.GetDefaultPrecompiledObject was just called") + } + callInfo := struct { + Ctx context.Context + In *GetDefaultPrecompiledObjectRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetDefaultPrecompiledObject.Lock() + mock.calls.GetDefaultPrecompiledObject = append(mock.calls.GetDefaultPrecompiledObject, callInfo) + mock.lockGetDefaultPrecompiledObject.Unlock() + return mock.GetDefaultPrecompiledObjectFunc(ctx, in, opts...) +} + +// GetDefaultPrecompiledObjectCalls gets all the calls that were made to GetDefaultPrecompiledObject. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetDefaultPrecompiledObjectCalls()) +func (mock *PlaygroundServiceClientMock) GetDefaultPrecompiledObjectCalls() []struct { + Ctx context.Context + In *GetDefaultPrecompiledObjectRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetDefaultPrecompiledObjectRequest + Opts []grpc.CallOption + } + mock.lockGetDefaultPrecompiledObject.RLock() + calls = mock.calls.GetDefaultPrecompiledObject + mock.lockGetDefaultPrecompiledObject.RUnlock() + return calls +} + +// GetGraph calls GetGraphFunc. +func (mock *PlaygroundServiceClientMock) GetGraph(ctx context.Context, in *GetGraphRequest, opts ...grpc.CallOption) (*GetGraphResponse, error) { + if mock.GetGraphFunc == nil { + panic("PlaygroundServiceClientMock.GetGraphFunc: method is nil but PlaygroundServiceClient.GetGraph was just called") + } + callInfo := struct { + Ctx context.Context + In *GetGraphRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetGraph.Lock() + mock.calls.GetGraph = append(mock.calls.GetGraph, callInfo) + mock.lockGetGraph.Unlock() + return mock.GetGraphFunc(ctx, in, opts...) +} + +// GetGraphCalls gets all the calls that were made to GetGraph. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetGraphCalls()) +func (mock *PlaygroundServiceClientMock) GetGraphCalls() []struct { + Ctx context.Context + In *GetGraphRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetGraphRequest + Opts []grpc.CallOption + } + mock.lockGetGraph.RLock() + calls = mock.calls.GetGraph + mock.lockGetGraph.RUnlock() + return calls +} + +// GetLogs calls GetLogsFunc. 
+func (mock *PlaygroundServiceClientMock) GetLogs(ctx context.Context, in *GetLogsRequest, opts ...grpc.CallOption) (*GetLogsResponse, error) { + if mock.GetLogsFunc == nil { + panic("PlaygroundServiceClientMock.GetLogsFunc: method is nil but PlaygroundServiceClient.GetLogs was just called") + } + callInfo := struct { + Ctx context.Context + In *GetLogsRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetLogs.Lock() + mock.calls.GetLogs = append(mock.calls.GetLogs, callInfo) + mock.lockGetLogs.Unlock() + return mock.GetLogsFunc(ctx, in, opts...) +} + +// GetLogsCalls gets all the calls that were made to GetLogs. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetLogsCalls()) +func (mock *PlaygroundServiceClientMock) GetLogsCalls() []struct { + Ctx context.Context + In *GetLogsRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetLogsRequest + Opts []grpc.CallOption + } + mock.lockGetLogs.RLock() + calls = mock.calls.GetLogs + mock.lockGetLogs.RUnlock() + return calls +} + +// GetPrecompiledObject calls GetPrecompiledObjectFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObject(ctx context.Context, in *GetPrecompiledObjectRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectResponse, error) { + if mock.GetPrecompiledObjectFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObject was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObject.Lock() + mock.calls.GetPrecompiledObject = append(mock.calls.GetPrecompiledObject, callInfo) + mock.lockGetPrecompiledObject.Unlock() + return mock.GetPrecompiledObjectFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectCalls gets all the calls that were made to GetPrecompiledObject. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObject.RLock() + calls = mock.calls.GetPrecompiledObject + mock.lockGetPrecompiledObject.RUnlock() + return calls +} + +// GetPrecompiledObjectCode calls GetPrecompiledObjectCodeFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectCode(ctx context.Context, in *GetPrecompiledObjectCodeRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectCodeResponse, error) { + if mock.GetPrecompiledObjectCodeFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectCodeFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObjectCode was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectCodeRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObjectCode.Lock() + mock.calls.GetPrecompiledObjectCode = append(mock.calls.GetPrecompiledObjectCode, callInfo) + mock.lockGetPrecompiledObjectCode.Unlock() + return mock.GetPrecompiledObjectCodeFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectCodeCalls gets all the calls that were made to GetPrecompiledObjectCode. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectCodeCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectCodeCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectCodeRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectCodeRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObjectCode.RLock() + calls = mock.calls.GetPrecompiledObjectCode + mock.lockGetPrecompiledObjectCode.RUnlock() + return calls +} + +// GetPrecompiledObjectGraph calls GetPrecompiledObjectGraphFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectGraph(ctx context.Context, in *GetPrecompiledObjectGraphRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectGraphResponse, error) { + if mock.GetPrecompiledObjectGraphFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectGraphFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObjectGraph was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectGraphRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObjectGraph.Lock() + mock.calls.GetPrecompiledObjectGraph = append(mock.calls.GetPrecompiledObjectGraph, callInfo) + mock.lockGetPrecompiledObjectGraph.Unlock() + return mock.GetPrecompiledObjectGraphFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectGraphCalls gets all the calls that were made to GetPrecompiledObjectGraph. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectGraphCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectGraphCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectGraphRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectGraphRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObjectGraph.RLock() + calls = mock.calls.GetPrecompiledObjectGraph + mock.lockGetPrecompiledObjectGraph.RUnlock() + return calls +} + +// GetPrecompiledObjectLogs calls GetPrecompiledObjectLogsFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectLogs(ctx context.Context, in *GetPrecompiledObjectLogsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectLogsResponse, error) { + if mock.GetPrecompiledObjectLogsFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectLogsFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObjectLogs was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectLogsRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObjectLogs.Lock() + mock.calls.GetPrecompiledObjectLogs = append(mock.calls.GetPrecompiledObjectLogs, callInfo) + mock.lockGetPrecompiledObjectLogs.Unlock() + return mock.GetPrecompiledObjectLogsFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectLogsCalls gets all the calls that were made to GetPrecompiledObjectLogs. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectLogsCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectLogsCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectLogsRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectLogsRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObjectLogs.RLock() + calls = mock.calls.GetPrecompiledObjectLogs + mock.lockGetPrecompiledObjectLogs.RUnlock() + return calls +} + +// GetPrecompiledObjectOutput calls GetPrecompiledObjectOutputFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectOutput(ctx context.Context, in *GetPrecompiledObjectOutputRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectOutputResponse, error) { + if mock.GetPrecompiledObjectOutputFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectOutputFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObjectOutput was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectOutputRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObjectOutput.Lock() + mock.calls.GetPrecompiledObjectOutput = append(mock.calls.GetPrecompiledObjectOutput, callInfo) + mock.lockGetPrecompiledObjectOutput.Unlock() + return mock.GetPrecompiledObjectOutputFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectOutputCalls gets all the calls that were made to GetPrecompiledObjectOutput. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectOutputCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectOutputCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectOutputRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectOutputRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObjectOutput.RLock() + calls = mock.calls.GetPrecompiledObjectOutput + mock.lockGetPrecompiledObjectOutput.RUnlock() + return calls +} + +// GetPrecompiledObjects calls GetPrecompiledObjectsFunc. +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjects(ctx context.Context, in *GetPrecompiledObjectsRequest, opts ...grpc.CallOption) (*GetPrecompiledObjectsResponse, error) { + if mock.GetPrecompiledObjectsFunc == nil { + panic("PlaygroundServiceClientMock.GetPrecompiledObjectsFunc: method is nil but PlaygroundServiceClient.GetPrecompiledObjects was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPrecompiledObjectsRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPrecompiledObjects.Lock() + mock.calls.GetPrecompiledObjects = append(mock.calls.GetPrecompiledObjects, callInfo) + mock.lockGetPrecompiledObjects.Unlock() + return mock.GetPrecompiledObjectsFunc(ctx, in, opts...) +} + +// GetPrecompiledObjectsCalls gets all the calls that were made to GetPrecompiledObjects. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetPrecompiledObjectsCalls()) +func (mock *PlaygroundServiceClientMock) GetPrecompiledObjectsCalls() []struct { + Ctx context.Context + In *GetPrecompiledObjectsRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPrecompiledObjectsRequest + Opts []grpc.CallOption + } + mock.lockGetPrecompiledObjects.RLock() + calls = mock.calls.GetPrecompiledObjects + mock.lockGetPrecompiledObjects.RUnlock() + return calls +} + +// GetPreparationOutput calls GetPreparationOutputFunc. +func (mock *PlaygroundServiceClientMock) GetPreparationOutput(ctx context.Context, in *GetPreparationOutputRequest, opts ...grpc.CallOption) (*GetPreparationOutputResponse, error) { + if mock.GetPreparationOutputFunc == nil { + panic("PlaygroundServiceClientMock.GetPreparationOutputFunc: method is nil but PlaygroundServiceClient.GetPreparationOutput was just called") + } + callInfo := struct { + Ctx context.Context + In *GetPreparationOutputRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetPreparationOutput.Lock() + mock.calls.GetPreparationOutput = append(mock.calls.GetPreparationOutput, callInfo) + mock.lockGetPreparationOutput.Unlock() + return mock.GetPreparationOutputFunc(ctx, in, opts...) +} + +// GetPreparationOutputCalls gets all the calls that were made to GetPreparationOutput. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetPreparationOutputCalls()) +func (mock *PlaygroundServiceClientMock) GetPreparationOutputCalls() []struct { + Ctx context.Context + In *GetPreparationOutputRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetPreparationOutputRequest + Opts []grpc.CallOption + } + mock.lockGetPreparationOutput.RLock() + calls = mock.calls.GetPreparationOutput + mock.lockGetPreparationOutput.RUnlock() + return calls +} + +// GetRunError calls GetRunErrorFunc. 
+func (mock *PlaygroundServiceClientMock) GetRunError(ctx context.Context, in *GetRunErrorRequest, opts ...grpc.CallOption) (*GetRunErrorResponse, error) { + if mock.GetRunErrorFunc == nil { + panic("PlaygroundServiceClientMock.GetRunErrorFunc: method is nil but PlaygroundServiceClient.GetRunError was just called") + } + callInfo := struct { + Ctx context.Context + In *GetRunErrorRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetRunError.Lock() + mock.calls.GetRunError = append(mock.calls.GetRunError, callInfo) + mock.lockGetRunError.Unlock() + return mock.GetRunErrorFunc(ctx, in, opts...) +} + +// GetRunErrorCalls gets all the calls that were made to GetRunError. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetRunErrorCalls()) +func (mock *PlaygroundServiceClientMock) GetRunErrorCalls() []struct { + Ctx context.Context + In *GetRunErrorRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetRunErrorRequest + Opts []grpc.CallOption + } + mock.lockGetRunError.RLock() + calls = mock.calls.GetRunError + mock.lockGetRunError.RUnlock() + return calls +} + +// GetRunOutput calls GetRunOutputFunc. +func (mock *PlaygroundServiceClientMock) GetRunOutput(ctx context.Context, in *GetRunOutputRequest, opts ...grpc.CallOption) (*GetRunOutputResponse, error) { + if mock.GetRunOutputFunc == nil { + panic("PlaygroundServiceClientMock.GetRunOutputFunc: method is nil but PlaygroundServiceClient.GetRunOutput was just called") + } + callInfo := struct { + Ctx context.Context + In *GetRunOutputRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetRunOutput.Lock() + mock.calls.GetRunOutput = append(mock.calls.GetRunOutput, callInfo) + mock.lockGetRunOutput.Unlock() + return mock.GetRunOutputFunc(ctx, in, opts...) +} + +// GetRunOutputCalls gets all the calls that were made to GetRunOutput. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.GetRunOutputCalls()) +func (mock *PlaygroundServiceClientMock) GetRunOutputCalls() []struct { + Ctx context.Context + In *GetRunOutputRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetRunOutputRequest + Opts []grpc.CallOption + } + mock.lockGetRunOutput.RLock() + calls = mock.calls.GetRunOutput + mock.lockGetRunOutput.RUnlock() + return calls +} + +// GetSnippet calls GetSnippetFunc. +func (mock *PlaygroundServiceClientMock) GetSnippet(ctx context.Context, in *GetSnippetRequest, opts ...grpc.CallOption) (*GetSnippetResponse, error) { + if mock.GetSnippetFunc == nil { + panic("PlaygroundServiceClientMock.GetSnippetFunc: method is nil but PlaygroundServiceClient.GetSnippet was just called") + } + callInfo := struct { + Ctx context.Context + In *GetSnippetRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetSnippet.Lock() + mock.calls.GetSnippet = append(mock.calls.GetSnippet, callInfo) + mock.lockGetSnippet.Unlock() + return mock.GetSnippetFunc(ctx, in, opts...) +} + +// GetSnippetCalls gets all the calls that were made to GetSnippet. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetSnippetCalls()) +func (mock *PlaygroundServiceClientMock) GetSnippetCalls() []struct { + Ctx context.Context + In *GetSnippetRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetSnippetRequest + Opts []grpc.CallOption + } + mock.lockGetSnippet.RLock() + calls = mock.calls.GetSnippet + mock.lockGetSnippet.RUnlock() + return calls +} + +// GetValidationOutput calls GetValidationOutputFunc. 
+func (mock *PlaygroundServiceClientMock) GetValidationOutput(ctx context.Context, in *GetValidationOutputRequest, opts ...grpc.CallOption) (*GetValidationOutputResponse, error) { + if mock.GetValidationOutputFunc == nil { + panic("PlaygroundServiceClientMock.GetValidationOutputFunc: method is nil but PlaygroundServiceClient.GetValidationOutput was just called") + } + callInfo := struct { + Ctx context.Context + In *GetValidationOutputRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockGetValidationOutput.Lock() + mock.calls.GetValidationOutput = append(mock.calls.GetValidationOutput, callInfo) + mock.lockGetValidationOutput.Unlock() + return mock.GetValidationOutputFunc(ctx, in, opts...) +} + +// GetValidationOutputCalls gets all the calls that were made to GetValidationOutput. +// Check the length with: +// len(mockedPlaygroundServiceClient.GetValidationOutputCalls()) +func (mock *PlaygroundServiceClientMock) GetValidationOutputCalls() []struct { + Ctx context.Context + In *GetValidationOutputRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *GetValidationOutputRequest + Opts []grpc.CallOption + } + mock.lockGetValidationOutput.RLock() + calls = mock.calls.GetValidationOutput + mock.lockGetValidationOutput.RUnlock() + return calls +} + +// RunCode calls RunCodeFunc. +func (mock *PlaygroundServiceClientMock) RunCode(ctx context.Context, in *RunCodeRequest, opts ...grpc.CallOption) (*RunCodeResponse, error) { + if mock.RunCodeFunc == nil { + panic("PlaygroundServiceClientMock.RunCodeFunc: method is nil but PlaygroundServiceClient.RunCode was just called") + } + callInfo := struct { + Ctx context.Context + In *RunCodeRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockRunCode.Lock() + mock.calls.RunCode = append(mock.calls.RunCode, callInfo) + mock.lockRunCode.Unlock() + return mock.RunCodeFunc(ctx, in, opts...) 
+} + +// RunCodeCalls gets all the calls that were made to RunCode. +// Check the length with: +// len(mockedPlaygroundServiceClient.RunCodeCalls()) +func (mock *PlaygroundServiceClientMock) RunCodeCalls() []struct { + Ctx context.Context + In *RunCodeRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *RunCodeRequest + Opts []grpc.CallOption + } + mock.lockRunCode.RLock() + calls = mock.calls.RunCode + mock.lockRunCode.RUnlock() + return calls +} + +// SaveSnippet calls SaveSnippetFunc. +func (mock *PlaygroundServiceClientMock) SaveSnippet(ctx context.Context, in *SaveSnippetRequest, opts ...grpc.CallOption) (*SaveSnippetResponse, error) { + if mock.SaveSnippetFunc == nil { + panic("PlaygroundServiceClientMock.SaveSnippetFunc: method is nil but PlaygroundServiceClient.SaveSnippet was just called") + } + callInfo := struct { + Ctx context.Context + In *SaveSnippetRequest + Opts []grpc.CallOption + }{ + Ctx: ctx, + In: in, + Opts: opts, + } + mock.lockSaveSnippet.Lock() + mock.calls.SaveSnippet = append(mock.calls.SaveSnippet, callInfo) + mock.lockSaveSnippet.Unlock() + return mock.SaveSnippetFunc(ctx, in, opts...) +} + +// SaveSnippetCalls gets all the calls that were made to SaveSnippet. 
+// Check the length with: +// len(mockedPlaygroundServiceClient.SaveSnippetCalls()) +func (mock *PlaygroundServiceClientMock) SaveSnippetCalls() []struct { + Ctx context.Context + In *SaveSnippetRequest + Opts []grpc.CallOption +} { + var calls []struct { + Ctx context.Context + In *SaveSnippetRequest + Opts []grpc.CallOption + } + mock.lockSaveSnippet.RLock() + calls = mock.calls.SaveSnippet + mock.lockSaveSnippet.RUnlock() + return calls +} diff --git a/learning/tour-of-beam/backend/samples/api/get_user_progress.json b/learning/tour-of-beam/backend/samples/api/get_user_progress.json new file mode 100644 index 000000000000..fc8027c9eb9f --- /dev/null +++ b/learning/tour-of-beam/backend/samples/api/get_user_progress.json @@ -0,0 +1,12 @@ +{ + "units": [ + { + "id": "unit_id_1", + "isCompleted": true + }, + { + "id": "unit_id_2", + "userSnippetId": "QY2dskTNu_w" + } + ] +} \ No newline at end of file diff --git a/learning/tour-of-beam/frontend/pubspec.lock b/learning/tour-of-beam/frontend/pubspec.lock index f3d88fa9e0f3..9983b9bb5304 100644 --- a/learning/tour-of-beam/frontend/pubspec.lock +++ b/learning/tour-of-beam/frontend/pubspec.lock @@ -50,6 +50,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "2.9.0" + autotrie: + dependency: transitive + description: + name: autotrie + url: "https://pub.dartlang.org" + source: hosted + version: "2.0.0" boolean_selector: dependency: transitive description: @@ -120,6 +127,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "1.2.1" + charcode: + dependency: transitive + description: + name: charcode + url: "https://pub.dartlang.org" + source: hosted + version: "1.3.1" checked_yaml: dependency: transitive description: @@ -141,15 +155,6 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "4.2.0" - code_text_field: - dependency: "direct main" - description: - path: "." 
- ref: "9e2c9fe52a69481f038f4b6609e8a0a776429437" - resolved-ref: "9e2c9fe52a69481f038f4b6609e8a0a776429437" - url: "https://github.com/BertrandBev/code_field.git" - source: git - version: "1.0.3" collection: dependency: transitive description: @@ -267,6 +272,13 @@ packages: description: flutter source: sdk version: "0.0.0" + flutter_code_editor: + dependency: transitive + description: + name: flutter_code_editor + url: "https://pub.dartlang.org" + source: hosted + version: "0.1.1" flutter_driver: dependency: transitive description: flutter @@ -390,6 +402,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "0.7.0" + hive: + dependency: transitive + description: + name: hive + url: "https://pub.dartlang.org" + source: hosted + version: "2.2.3" http: dependency: "direct main" description: @@ -668,6 +687,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "0.27.5" + scrollable_positioned_list: + dependency: transitive + description: + name: scrollable_positioned_list + url: "https://pub.dartlang.org" + source: hosted + version: "0.3.5" shared_preferences: dependency: "direct main" description: @@ -834,6 +860,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "2.17.4" + tuple: + dependency: transitive + description: + name: tuple + url: "https://pub.dartlang.org" + source: hosted + version: "2.0.1" typed_data: dependency: transitive description: diff --git a/learning/tour-of-beam/frontend/pubspec.yaml b/learning/tour-of-beam/frontend/pubspec.yaml index da6c4c74ffa4..a6e829542e0c 100644 --- a/learning/tour-of-beam/frontend/pubspec.yaml +++ b/learning/tour-of-beam/frontend/pubspec.yaml @@ -28,10 +28,6 @@ environment: dependencies: app_state: ^0.8.1 - code_text_field: - git: - url: https://github.com/BertrandBev/code_field.git - ref: 9e2c9fe52a69481f038f4b6609e8a0a776429437 easy_localization: ^3.0.1 easy_localization_ext: ^0.1.0 easy_localization_loader: ^1.0.0 diff --git 
a/model/fn-execution/src/main/resources/org/apache/beam/model/fnexecution/v1/standard_coders.yaml b/model/fn-execution/src/main/resources/org/apache/beam/model/fnexecution/v1/standard_coders.yaml index ca914b6e6f8d..8ce9d2751545 100644 --- a/model/fn-execution/src/main/resources/org/apache/beam/model/fnexecution/v1/standard_coders.yaml +++ b/model/fn-execution/src/main/resources/org/apache/beam/model/fnexecution/v1/standard_coders.yaml @@ -487,6 +487,17 @@ examples: --- +coder: + urn: "beam:coder:row:v1" + # f_char: logical(fixed_char(5)), f_varchar: logical(var_char(5)), f_bytes: logical(fixed_bytes(5)), f_varbytes: logical(var_bytes(5)) + payload: "\n=\n\x06f_char\x1a3\x08\x01:/\n\x1fbeam:logical_type:fixed_char:v1\x1a\x02\x10\x07\"\x02\x10\x03*\x04\n\x02\x18\x05\nB\n\tf_varchar\x1a1\x08\x01:-\n\x1dbeam:logical_type:var_char:v1\x1a\x02\x10\x07\"\x02\x10\x03*\x04\n\x02\x18\n \x01(\x01\nC\n\x07f_bytes\x1a4\x08\x01:0\n beam:logical_type:fixed_bytes:v1\x1a\x02\x10\t\"\x02\x10\x03*\x04\n\x02\x18\x05 \x02(\x02\nD\n\nf_varbytes\x1a2\x08\x01:.\n\x1ebeam:logical_type:var_bytes:v1\x1a\x02\x10\t\"\x02\x10\x03*\x04\n\x02\x18\n \x03(\x03\x12$f0ffb3a4-f46f-41ca-a942-85e3e939452a" +examples: + "\x04\x00\x05ABCDE\x05ABCDE\x05ABCDE\x05ABCDE": {f_char: "ABCDE", f_varchar: "ABCDE", f_bytes: "ABCDE", f_varbytes: "ABCDE"} + "\x04\x00\x05A\n \x02A\n\x05A\n\x00\x00\x00\x02A\n": {f_char: "A\n ", f_varchar: "A\n", f_bytes: "A\n\x00\x00\x00", f_varbytes: "A\n"} + "\x04\x01\x06\x05null?\x04null": {f_char: "null?", f_varchar: null, f_bytes: null, f_varbytes: "null"} + +--- + coder: urn: "beam:coder:sharded_key:v1" components: [{urn: "beam:coder:string_utf8:v1"}] diff --git a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/schema.proto b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/schema.proto index fa626bc747f0..6e05aada21f9 100644 --- a/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/schema.proto +++ 
b/model/pipeline/src/main/proto/org/apache/beam/model/pipeline/v1/schema.proto @@ -147,6 +147,34 @@ message LogicalTypes { // two's complement encoded big integer. DECIMAL = 3 [(org.apache.beam.model.pipeline.v1.beam_urn) = "beam:logical_type:decimal:v1"]; + + // A URN for FixedLengthBytes type + // - Representation type: BYTES + // - Argument type: INT32. + // A fixed-length bytes with its length as the argument. + FIXED_BYTES = 4 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:logical_type:fixed_bytes:v1"]; + + // A URN for VariableLengthBytes type + // - Representation type: BYTES + // - Argument type: INT32. + // A variable-length bytes with its maximum length as the argument. + VAR_BYTES = 5 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:logical_type:var_bytes:v1"]; + + // A URN for FixedLengthString type + // - Representation type: STRING + // - Argument type: INT32. + // A fixed-length string with its length as the argument. + FIXED_CHAR = 6 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:logical_type:fixed_char:v1"]; + + // A URN for VariableLengthString type + // - Representation type: STRING + // - Argument type: INT32. + // A variable-length string with its maximum length as the argument. + VAR_CHAR = 7 [(org.apache.beam.model.pipeline.v1.beam_urn) = + "beam:logical_type:var_char:v1"]; } } diff --git a/playground/backend/README.md b/playground/backend/README.md index 8d847d2569e2..1b3ad66f2b42 100644 --- a/playground/backend/README.md +++ b/playground/backend/README.md @@ -94,6 +94,7 @@ default value and there is no need to set them up to launch locally: - `SDK_CONFIG` - is the sdk configuration file path, e.g. default example for corresponding sdk. It will be saved to cloud datastore during application startup (default value = `../sdks.yaml`) - `DATASTORE_EMULATOR_HOST` - is the datastore emulator address. If it is given in the environment, the application will connect to the datastore emulator. 
- `PROPERTY_PATH` - is the application properties path (default value = `.`) +- `CACHE_REQUEST_TIMEOUT` - is the timeout to request data from cache (default value = `5 sec`) ### Application properties diff --git a/playground/backend/cmd/server/controller.go b/playground/backend/cmd/server/controller.go index 43cf57fd3e62..d2a5f087132c 100644 --- a/playground/backend/cmd/server/controller.go +++ b/playground/backend/cmd/server/controller.go @@ -284,7 +284,7 @@ func (controller *playgroundController) Cancel(ctx context.Context, info *pb.Can // - If there is no catalog in the cache, gets the catalog from the Datastore and saves it to the cache // - If SDK or category is specified in the request, gets the catalog from the cache and filters it by SDK and category func (controller *playgroundController) GetPrecompiledObjects(ctx context.Context, info *pb.GetPrecompiledObjectsRequest) (*pb.GetPrecompiledObjectsResponse, error) { - catalog, err := controller.cacheComponent.GetCatalogFromCacheOrDatastore(ctx) + catalog, err := controller.cacheComponent.GetCatalogFromCacheOrDatastore(ctx, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { return nil, errors.InternalError(errorTitleGetCatalog, userCloudConnectionErrMsg) } @@ -299,7 +299,7 @@ func (controller *playgroundController) GetPrecompiledObject(ctx context.Context if err != nil { return nil, errors.InvalidArgumentError(errorTitleGetExample, userBadCloudPathErrMsg) } - sdks, err := controller.cacheComponent.GetSdkCatalogFromCacheOrDatastore(ctx) + sdks, err := controller.cacheComponent.GetSdkCatalogFromCacheOrDatastore(ctx, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { return nil, errors.InternalError(errorTitleGetExample, err.Error()) } @@ -398,7 +398,7 @@ func (controller *playgroundController) GetDefaultPrecompiledObject(ctx context. 
logger.Errorf("GetDefaultPrecompiledObject(): unimplemented sdk: %s\n", info.Sdk) return nil, errors.InvalidArgumentError("Error during preparing", "Sdk is not implemented yet: %s", info.Sdk.String()) } - precompiledObject, err := controller.cacheComponent.GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx, info.Sdk) + precompiledObject, err := controller.cacheComponent.GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx, info.Sdk, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { logger.Errorf("GetDefaultPrecompiledObject(): error during getting catalog: %s", err.Error()) return nil, errors.InternalError("Error during getting Precompiled Objects", "Error with cloud connection") diff --git a/playground/backend/containers/python/Dockerfile b/playground/backend/containers/python/Dockerfile index 2a9182a129a5..5c6fba0fd7c3 100644 --- a/playground/backend/containers/python/Dockerfile +++ b/playground/backend/containers/python/Dockerfile @@ -49,7 +49,6 @@ COPY --from=build /go/bin/server_python_backend /opt/playground/backend/ COPY --from=build /go/src/playground/backend/configs /opt/playground/backend/configs/ # Install Python Katas Utils -COPY katas/log_elements.py /go/src/katas/ ENV PYTHONPATH="$PYTHONPATH:/go/src/katas" # Install mitmpoxy diff --git a/playground/backend/internal/components/cache_component.go b/playground/backend/internal/components/cache_component.go index 9626e9a3030c..72d6c6213dce 100644 --- a/playground/backend/internal/components/cache_component.go +++ b/playground/backend/internal/components/cache_component.go @@ -18,6 +18,7 @@ package components import ( "context" "fmt" + "time" pb "beam.apache.org/playground/backend/internal/api/v1" "beam.apache.org/playground/backend/internal/cache" @@ -37,73 +38,102 @@ func NewService(cache cache.Cache, db db.Database) *CacheComponent { // GetSdkCatalogFromCacheOrDatastore returns the sdk catalog from the cache // - If there is no sdk catalog in the cache, gets the sdk catalog from the 
Cloud Datastore and saves it to the cache -func (cp *CacheComponent) GetSdkCatalogFromCacheOrDatastore(ctx context.Context) ([]*entity.SDKEntity, error) { - sdks, err := cp.cache.GetSdkCatalog(ctx) +func (cp *CacheComponent) GetSdkCatalogFromCacheOrDatastore(ctx context.Context, cacheRequestTimeout time.Duration) ([]*entity.SDKEntity, error) { + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + sdks, err := cp.cache.GetSdkCatalog(cctx) if err != nil { logger.Errorf("error during getting the sdk catalog from the cache, err: %s", err.Error()) - sdks, err = cp.db.GetSDKs(ctx) - if err != nil { - logger.Errorf("error during getting the sdk catalog from the cloud datastore, err: %s", err.Error()) - return nil, err - } - if err = cp.cache.SetSdkCatalog(ctx, sdks); err != nil { - logger.Errorf("error during setting the sdk catalog to the cache, err: %s", err.Error()) - return nil, err - } + return cp.getSdks(ctx, cacheRequestTimeout) + } else { + return sdks, nil + } +} + +func (cp *CacheComponent) getSdks(ctx context.Context, cacheRequestTimeout time.Duration) ([]*entity.SDKEntity, error) { + sdks, err := cp.db.GetSDKs(ctx) + if err != nil { + logger.Errorf("error during getting the sdk catalog from the cloud datastore, err: %s", err.Error()) + return nil, err + } + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + if err = cp.cache.SetSdkCatalog(cctx, sdks); err != nil { + logger.Errorf("error during setting the sdk catalog to the cache, err: %s", err.Error()) } return sdks, nil } // GetCatalogFromCacheOrDatastore returns the example catalog from cache // - If there is no catalog in the cache, gets the catalog from the Cloud Datastore and saves it to the cache -func (cp *CacheComponent) GetCatalogFromCacheOrDatastore(ctx context.Context) ([]*pb.Categories, error) { - catalog, err := cp.cache.GetCatalog(ctx) +func (cp *CacheComponent) GetCatalogFromCacheOrDatastore(ctx context.Context, cacheRequestTimeout 
time.Duration) ([]*pb.Categories, error) { + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + catalog, err := cp.cache.GetCatalog(cctx) if err != nil { logger.Errorf("error during getting the catalog from the cache, err: %s", err.Error()) - sdkCatalog, err := cp.GetSdkCatalogFromCacheOrDatastore(ctx) - if err != nil { - logger.Errorf("error during getting the sdk catalog from the cache or datastore, err: %s", err.Error()) - return nil, err - } - catalog, err = cp.db.GetCatalog(ctx, sdkCatalog) - if err != nil { - return nil, err - } - if len(catalog) == 0 { - logger.Warn("example catalog is empty") - return catalog, nil - } - if err = cp.cache.SetCatalog(ctx, catalog); err != nil { - logger.Errorf("SetCatalog(): cache error: %s", err.Error()) - return nil, err - } + return cp.getCatalog(ctx, cacheRequestTimeout) + } else { + return catalog, nil + } +} + +func (cp *CacheComponent) getCatalog(ctx context.Context, cacheRequestTimeout time.Duration) ([]*pb.Categories, error) { + sdkCatalog, err := cp.GetSdkCatalogFromCacheOrDatastore(ctx, cacheRequestTimeout) + if err != nil { + return nil, err + } + catalog, err := cp.db.GetCatalog(ctx, sdkCatalog) + if err != nil { + return nil, err + } + if len(catalog) == 0 { + logger.Warn("example catalog is empty") + return catalog, nil + } + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + if err = cp.cache.SetCatalog(cctx, catalog); err != nil { + logger.Errorf("SetCatalog(): cache error: %s", err.Error()) } return catalog, nil } // GetDefaultPrecompiledObjectFromCacheOrDatastore returns the default example from cache by sdk // - If there is no a default example in the cache, gets the default example from the Cloud Datastore and saves it to the cache -func (cp *CacheComponent) GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx context.Context, sdk pb.Sdk) (*pb.PrecompiledObject, error) { - defaultExample, err := cp.cache.GetDefaultPrecompiledObject(ctx, sdk) +func 
(cp *CacheComponent) GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx context.Context, sdk pb.Sdk, cacheRequestTimeout time.Duration) (*pb.PrecompiledObject, error) { + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + defaultExample, err := cp.cache.GetDefaultPrecompiledObject(cctx, sdk) if err != nil { - logger.Errorf("error during getting a default precompiled object, err: %s", err.Error()) - sdks, err := cp.GetSdkCatalogFromCacheOrDatastore(ctx) - if err != nil { - logger.Errorf("error during getting sdk catalog from the cache or the cloud datastore, err: %s", err.Error()) - return nil, err - } - defaultExamples, err := cp.db.GetDefaultExamples(ctx, sdks) - for sdk, defaultExample := range defaultExamples { - if err := cp.cache.SetDefaultPrecompiledObject(ctx, sdk, defaultExample); err != nil { - logger.Errorf("error during setting a default example to the cache: %s", err.Error()) - return nil, err - } - } - defaultExample, ok := defaultExamples[sdk] - if !ok { - return nil, fmt.Errorf("no default example found for this sdk: %s", sdk) - } + logger.Errorf("error during getting the default precompiled object from the cache, err: %s", err.Error()) + return cp.getDefaultExample(ctx, sdk, cacheRequestTimeout) + } else { return defaultExample, nil } +} + +func (cp *CacheComponent) getDefaultExample(ctx context.Context, sdk pb.Sdk, cacheRequestTimeout time.Duration) (*pb.PrecompiledObject, error) { + sdks, err := cp.GetSdkCatalogFromCacheOrDatastore(ctx, cacheRequestTimeout) + if err != nil { + logger.Errorf("error during getting sdk catalog from the cache or the cloud datastore, err: %s", err.Error()) + return nil, err + } + defaultExamples, err := cp.db.GetDefaultExamples(ctx, sdks) + if err != nil { + logger.Errorf("error during getting default examples from the cloud datastore, err: %s", err.Error()) + return nil, err + } + cctx, cancel := context.WithTimeout(ctx, cacheRequestTimeout) + defer cancel() + for sdk, defaultExample := 
range defaultExamples { + if err := cp.cache.SetDefaultPrecompiledObject(cctx, sdk, defaultExample); err != nil { + logger.Errorf("error during setting a default example to the cache: %s", err.Error()) + } + } + defaultExample, ok := defaultExamples[sdk] + if !ok { + return nil, fmt.Errorf("no default example found for this sdk: %s", sdk) + } return defaultExample, nil } diff --git a/playground/backend/internal/components/cache_component_test.go b/playground/backend/internal/components/cache_component_test.go index 2651b4d2ac8c..9aaad6ea7b1b 100644 --- a/playground/backend/internal/components/cache_component_test.go +++ b/playground/backend/internal/components/cache_component_test.go @@ -20,6 +20,7 @@ import ( "os" "reflect" "testing" + "time" pb "beam.apache.org/playground/backend/internal/api/v1" "beam.apache.org/playground/backend/internal/cache" @@ -36,6 +37,7 @@ var datastoreDb *db.Datastore var ctx context.Context var cacheComponent *CacheComponent var cacheService cache.Cache +var defaultCacheRequestTimeout = 10 * time.Second func TestMain(m *testing.M) { setup() @@ -89,7 +91,7 @@ func TestCacheComponent_GetSdkCatalogFromCacheOrDatastore(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tt.prepare() - result, err := cacheComponent.GetSdkCatalogFromCacheOrDatastore(ctx) + result, err := cacheComponent.GetSdkCatalogFromCacheOrDatastore(ctx, defaultCacheRequestTimeout) if (err != nil) != tt.wantErr { t.Error("GetSdkCatalogFromCacheOrDatastore() unexpected error") return @@ -145,7 +147,7 @@ func TestCacheComponent_GetCatalogFromCacheOrDatastore(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tt.prepare() - result, err := cacheComponent.GetCatalogFromCacheOrDatastore(ctx) + result, err := cacheComponent.GetCatalogFromCacheOrDatastore(ctx, defaultCacheRequestTimeout) if (err != nil) != tt.wantErr { t.Error("GetCatalogFromCacheOrDatastore() unexpected error") return @@ -202,7 +204,7 @@ func 
TestCacheComponent_GetDefaultPrecompiledObjectFromCacheOrDatastore(t *testi for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { tt.prepare() - result, err := cacheComponent.GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx, pb.Sdk_SDK_JAVA) + result, err := cacheComponent.GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx, pb.Sdk_SDK_JAVA, defaultCacheRequestTimeout) if (err != nil) != tt.wantErr { t.Error("GetDefaultPrecompiledObjectFromCacheOrDatastore() unexpected error") return diff --git a/playground/backend/internal/db/mapper/datastore_mapper_test.go b/playground/backend/internal/db/mapper/datastore_mapper_test.go index f032d939d961..8cd8d19b2028 100644 --- a/playground/backend/internal/db/mapper/datastore_mapper_test.go +++ b/playground/backend/internal/db/mapper/datastore_mapper_test.go @@ -31,7 +31,7 @@ var testable *DatastoreMapper var datastoreMapperCtx = context.Background() func TestMain(m *testing.M) { - appEnv := environment.NewApplicationEnvs("/app", "", "", "", "", "../../../.", nil, 0) + appEnv := environment.NewApplicationEnvs("/app", "", "", "", "", "../../../.", nil, 0, 0) appEnv.SetSchemaVersion("MOCK_SCHEMA") props, _ := environment.NewProperties(appEnv.PropertyPath()) testable = NewDatastoreMapper(datastoreMapperCtx, appEnv, props) diff --git a/playground/backend/internal/db/schema/migration/migrations_test.go b/playground/backend/internal/db/schema/migration/migrations_test.go index c22d07e3fdfd..f2ca13f2c54e 100644 --- a/playground/backend/internal/db/schema/migration/migrations_test.go +++ b/playground/backend/internal/db/schema/migration/migrations_test.go @@ -53,7 +53,7 @@ func setup() { if err != nil { panic(err) } - appEnvs = environment.NewApplicationEnvs("/app", "", "", "", "../../../../../sdks-emulator.yaml", "../../../../.", nil, 0) + appEnvs = environment.NewApplicationEnvs("/app", "", "", "", "../../../../../sdks-emulator.yaml", "../../../../.", nil, 0, 0) props, err = 
environment.NewProperties(appEnvs.PropertyPath()) if err != nil { panic(err) diff --git a/playground/backend/internal/environment/application.go b/playground/backend/internal/environment/application.go index b32a41a8e69f..05c28a7293d9 100644 --- a/playground/backend/internal/environment/application.go +++ b/playground/backend/internal/environment/application.go @@ -108,13 +108,16 @@ type ApplicationEnvs struct { // propertyPath is the application properties path propertyPath string + + // cacheRequestTimeout is timeout to request data from cache + cacheRequestTimeout time.Duration } // NewApplicationEnvs constructor for ApplicationEnvs func NewApplicationEnvs( workingDir, launchSite, projectId, pipelinesFolder, sdkConfigPath, propertyPath string, cacheEnvs *CacheEnvs, - pipelineExecuteTimeout time.Duration, + pipelineExecuteTimeout, cacheRequestTimeout time.Duration, ) *ApplicationEnvs { return &ApplicationEnvs{ workingDir: workingDir, @@ -125,6 +128,7 @@ func NewApplicationEnvs( pipelinesFolder: pipelinesFolder, sdkConfigPath: sdkConfigPath, propertyPath: propertyPath, + cacheRequestTimeout: cacheRequestTimeout, } } @@ -177,3 +181,8 @@ func (ae *ApplicationEnvs) PropertyPath() string { func (ae *ApplicationEnvs) SetSchemaVersion(schemaVersion string) { ae.schemaVersion = schemaVersion } + +// CacheRequestTimeout returns timeout to request data from cache +func (ae *ApplicationEnvs) CacheRequestTimeout() time.Duration { + return ae.cacheRequestTimeout +} diff --git a/playground/backend/internal/environment/environment_service.go b/playground/backend/internal/environment/environment_service.go index 0cbf8149d5e5..a7ddf0984c03 100644 --- a/playground/backend/internal/environment/environment_service.go +++ b/playground/backend/internal/environment/environment_service.go @@ -65,6 +65,8 @@ const ( defaultSDKConfigPath = "../sdks.yaml" propertyPathKey = "PROPERTY_PATH" defaultPropertyPath = "." 
+ cacheRequestTimeoutKey = "CACHE_REQUEST_TIMEOUT" + defaultCacheRequestTimeout = time.Second * 5 ) // Environment operates with environment structures: NetworkEnvs, BeamEnvs, ApplicationEnvs @@ -99,8 +101,8 @@ func NewEnvironment(networkEnvs NetworkEnvs, beamEnvs BeamEnvs, appEnvs Applicat // - cache address: localhost:6379 // If os environment variables don't contain a value for app working dir - returns error. func GetApplicationEnvsFromOsEnvs() (*ApplicationEnvs, error) { - pipelineExecuteTimeout := defaultPipelineExecuteTimeout - cacheExpirationTime := defaultCacheKeyExpirationTime + pipelineExecuteTimeout := getEnvAsDuration(pipelineExecuteTimeoutKey, defaultPipelineExecuteTimeout, "couldn't convert provided pipeline execute timeout. Using default %s\n") + cacheExpirationTime := getEnvAsDuration(cacheKeyExpirationTimeKey, defaultCacheKeyExpirationTime, "couldn't convert provided cache expiration time. Using default %s\n") cacheType := getEnv(cacheTypeKey, defaultCacheType) cacheAddress := getEnv(cacheAddressKey, defaultCacheAddress) launchSite := getEnv(launchSiteKey, defaultLaunchSite) @@ -108,24 +110,10 @@ func GetApplicationEnvsFromOsEnvs() (*ApplicationEnvs, error) { pipelinesFolder := getEnv(pipelinesFolderKey, defaultPipelinesFolder) sdkConfigPath := getEnv(SDKConfigPathKey, defaultSDKConfigPath) propertyPath := getEnv(propertyPathKey, defaultPropertyPath) - - if value, present := os.LookupEnv(cacheKeyExpirationTimeKey); present { - if converted, err := time.ParseDuration(value); err == nil { - cacheExpirationTime = converted - } else { - log.Printf("couldn't convert provided cache expiration time. Using default %s\n", defaultCacheKeyExpirationTime) - } - } - if value, present := os.LookupEnv(pipelineExecuteTimeoutKey); present { - if converted, err := time.ParseDuration(value); err == nil { - pipelineExecuteTimeout = converted - } else { - log.Printf("couldn't convert provided pipeline execute timeout. 
Using default %s\n", defaultPipelineExecuteTimeout) - } - } + cacheRequestTimeout := getEnvAsDuration(cacheRequestTimeoutKey, defaultCacheRequestTimeout, "couldn't convert provided cache request timeout. Using default %s\n") if value, present := os.LookupEnv(workingDirKey); present { - return NewApplicationEnvs(value, launchSite, projectId, pipelinesFolder, sdkConfigPath, propertyPath, NewCacheEnvs(cacheType, cacheAddress, cacheExpirationTime), pipelineExecuteTimeout), nil + return NewApplicationEnvs(value, launchSite, projectId, pipelinesFolder, sdkConfigPath, propertyPath, NewCacheEnvs(cacheType, cacheAddress, cacheExpirationTime), pipelineExecuteTimeout, cacheRequestTimeout), nil } return nil, errors.New("APP_WORK_DIR env should be provided with os.env") } @@ -260,3 +248,15 @@ func getEnvAsInt(key string, defaultValue int) int { } return defaultValue } + +// getEnvAsDuration returns an environment variable or default value as duration +func getEnvAsDuration(key string, defaultValue time.Duration, errMsg string) time.Duration { + if value, present := os.LookupEnv(key); present { + if converted, err := time.ParseDuration(value); err == nil { + return converted + } else { + log.Printf(errMsg, defaultValue) + } + } + return defaultValue +} diff --git a/playground/backend/internal/environment/environment_service_test.go b/playground/backend/internal/environment/environment_service_test.go index 04eb13d4b980..0df904f14e9c 100644 --- a/playground/backend/internal/environment/environment_service_test.go +++ b/playground/backend/internal/environment/environment_service_test.go @@ -105,7 +105,7 @@ func TestNewEnvironment(t *testing.T) { {name: "Create env service with default envs", want: &Environment{ NetworkEnvs: *NewNetworkEnvs(defaultIp, defaultPort, defaultProtocol), BeamSdkEnvs: *NewBeamEnvs(defaultSdk, executorConfig, preparedModDir, 0), - ApplicationEnvs: *NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, 
defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout), + ApplicationEnvs: *NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout), }}, } for _, tt := range tests { @@ -113,7 +113,7 @@ func TestNewEnvironment(t *testing.T) { if got := NewEnvironment( *NewNetworkEnvs(defaultIp, defaultPort, defaultProtocol), *NewBeamEnvs(defaultSdk, executorConfig, preparedModDir, 0), - *NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout)); !reflect.DeepEqual(got, tt.want) { + *NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout)); !reflect.DeepEqual(got, tt.want) { t.Errorf("NewEnvironment() = %v, want %v", got, tt.want) } }) @@ -224,7 +224,7 @@ func Test_getApplicationEnvsFromOsEnvs(t *testing.T) { }{ { name: "Working dir is provided", - want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout), + want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout), wantErr: false, envsToSet: 
map[string]string{workingDirKey: "/app", launchSiteKey: defaultLaunchSite, projectIdKey: defaultProjectId}, }, @@ -235,25 +235,25 @@ func Test_getApplicationEnvsFromOsEnvs(t *testing.T) { }, { name: "CacheKeyExpirationTimeKey is set with the correct value", - want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, convertedTime}, defaultPipelineExecuteTimeout), + want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, convertedTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout), wantErr: false, envsToSet: map[string]string{workingDirKey: "/app", cacheKeyExpirationTimeKey: hour}, }, { name: "CacheKeyExpirationTimeKey is set with the incorrect value", - want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout), + want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout), wantErr: false, envsToSet: map[string]string{workingDirKey: "/app", cacheKeyExpirationTimeKey: "1"}, }, { name: "CacheKeyExpirationTimeKey is set with the correct value", - want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, convertedTime), + want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, 
defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, convertedTime, defaultCacheRequestTimeout), wantErr: false, envsToSet: map[string]string{workingDirKey: "/app", pipelineExecuteTimeoutKey: hour}, }, { name: "PipelineExecuteTimeoutKey is set with the incorrect value", - want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout), + want: NewApplicationEnvs("/app", defaultLaunchSite, defaultProjectId, defaultPipelinesFolder, defaultSDKConfigPath, defaultPropertyPath, &CacheEnvs{defaultCacheType, defaultCacheAddress, defaultCacheKeyExpirationTime}, defaultPipelineExecuteTimeout, defaultCacheRequestTimeout), wantErr: false, envsToSet: map[string]string{workingDirKey: "/app", pipelineExecuteTimeoutKey: "1"}, }, diff --git a/playground/frontend/playground_components/lib/src/controllers/playground_controller.dart b/playground/frontend/playground_components/lib/src/controllers/playground_controller.dart index 794449110772..f3f1f52ba098 100644 --- a/playground/frontend/playground_components/lib/src/controllers/playground_controller.dart +++ b/playground/frontend/playground_components/lib/src/controllers/playground_controller.dart @@ -115,7 +115,7 @@ class PlaygroundController with ChangeNotifier { return controller; } - String? get source => snippetEditingController?.codeController.text; + String? get source => snippetEditingController?.codeController.fullText; bool get isCodeRunning => !(result?.isFinished ?? 
true); @@ -177,7 +177,7 @@ class PlaygroundController with ChangeNotifier { void setSource(String source) { final controller = requireSnippetEditingController(); - controller.codeController.text = source; + controller.setSource(source); } void setSelectedOutputFilterType(OutputType type) { @@ -234,7 +234,7 @@ class PlaygroundController with ChangeNotifier { _showPrecompiledResult(controller); } else { final request = RunCodeRequest( - code: controller.codeController.text, + code: controller.codeController.fullText, sdk: controller.sdk, pipelineOptions: parsedPipelineOptions, ); @@ -351,7 +351,9 @@ class PlaygroundController with ChangeNotifier { final controller = requireSnippetEditingController(); return exampleCache.getSnippetId( - files: [SharedFile(code: controller.codeController.text, isMain: true)], + files: [ + SharedFile(code: controller.codeController.fullText, isMain: true), + ], sdk: controller.sdk, pipelineOptions: controller.pipelineOptions, ); @@ -370,27 +372,27 @@ class PlaygroundController with ChangeNotifier { } late BeamShortcut runShortcut = BeamShortcut( - shortcuts: LogicalKeySet( - LogicalKeyboardKey.meta, - LogicalKeyboardKey.enter, - ), - actionIntent: const RunIntent(), - createAction: (BuildContext context) => CallbackAction( - onInvoke: (_) => runCode(), - ), - ); + shortcuts: LogicalKeySet( + LogicalKeyboardKey.meta, + LogicalKeyboardKey.enter, + ), + actionIntent: const RunIntent(), + createAction: (BuildContext context) => CallbackAction( + onInvoke: (_) => runCode(), + ), + ); late BeamShortcut resetShortcut = BeamShortcut( - shortcuts: LogicalKeySet( - LogicalKeyboardKey.meta, - LogicalKeyboardKey.shift, - LogicalKeyboardKey.keyE, - ), - actionIntent: const ResetIntent(), - createAction: (BuildContext context) => CallbackAction( - onInvoke: (_) => reset(), - ), - ); + shortcuts: LogicalKeySet( + LogicalKeyboardKey.meta, + LogicalKeyboardKey.shift, + LogicalKeyboardKey.keyE, + ), + actionIntent: const ResetIntent(), + createAction: 
(BuildContext context) => CallbackAction( + onInvoke: (_) => reset(), + ), + ); List get shortcuts => [ runShortcut, diff --git a/playground/frontend/playground_components/lib/src/controllers/snippet_editing_controller.dart b/playground/frontend/playground_components/lib/src/controllers/snippet_editing_controller.dart index 9e36eed809bf..8bb285eff421 100644 --- a/playground/frontend/playground_components/lib/src/controllers/snippet_editing_controller.dart +++ b/playground/frontend/playground_components/lib/src/controllers/snippet_editing_controller.dart @@ -16,8 +16,8 @@ * limitations under the License. */ -import 'package:code_text_field/code_text_field.dart'; import 'package:flutter/widgets.dart'; +import 'package:flutter_code_editor/flutter_code_editor.dart'; import '../enums/complexity.dart'; import '../models/example.dart'; @@ -44,7 +44,7 @@ class SnippetEditingController extends ChangeNotifier { set selectedExample(Example? value) { _selectedExample = value; - codeController.text = _selectedExample?.source ?? ''; + setSource(_selectedExample?.source ?? ''); _pipelineOptions = _selectedExample?.pipelineOptions ?? ''; notifyListeners(); } @@ -63,7 +63,7 @@ class SnippetEditingController extends ChangeNotifier { } bool _isCodeChanged() { - return _selectedExample?.source != codeController.text; + return _selectedExample?.source != codeController.fullText; } bool _arePipelineOptionsChanged() { @@ -82,10 +82,15 @@ class SnippetEditingController extends ChangeNotifier { // user-shared examples, and an empty editor, // https://github.com/apache/beam/issues/23252 return ContentExampleLoadingDescriptor( - content: codeController.text, + content: codeController.fullText, name: _selectedExample?.name, complexity: _selectedExample?.complexity ?? 
Complexity.unspecified, sdk: sdk, ); } + + void setSource(String source) { + codeController.text = source; + codeController.historyController.deleteHistory(); + } } diff --git a/playground/frontend/playground_components/lib/src/theme/theme.dart b/playground/frontend/playground_components/lib/src/theme/theme.dart index fed70dee36b2..14c811abe931 100644 --- a/playground/frontend/playground_components/lib/src/theme/theme.dart +++ b/playground/frontend/playground_components/lib/src/theme/theme.dart @@ -16,8 +16,8 @@ * limitations under the License. */ -import 'package:code_text_field/code_text_field.dart'; import 'package:flutter/material.dart'; +import 'package:flutter_code_editor/flutter_code_editor.dart'; import 'package:flutter_markdown/flutter_markdown.dart'; import 'package:google_fonts/google_fonts.dart'; @@ -138,7 +138,6 @@ final kLightTheme = ThemeData( secondaryBackgroundColor: BeamLightThemeColors.secondaryBackground, codeBackgroundColor: BeamLightThemeColors.codeBackground, codeRootStyle: GoogleFonts.sourceCodePro( - backgroundColor: BeamLightThemeColors.primaryBackground, color: BeamLightThemeColors.text, fontSize: codeFontSize, ), @@ -212,7 +211,6 @@ final kDarkTheme = ThemeData( secondaryBackgroundColor: BeamDarkThemeColors.secondaryBackground, codeBackgroundColor: BeamDarkThemeColors.codeBackground, codeRootStyle: GoogleFonts.sourceCodePro( - backgroundColor: BeamDarkThemeColors.primaryBackground, color: BeamDarkThemeColors.text, fontSize: codeFontSize, ), diff --git a/playground/frontend/playground_components/lib/src/widgets/editor_textarea.dart b/playground/frontend/playground_components/lib/src/widgets/editor_textarea.dart index 05231d0ee8b7..9714177ec945 100644 --- a/playground/frontend/playground_components/lib/src/widgets/editor_textarea.dart +++ b/playground/frontend/playground_components/lib/src/widgets/editor_textarea.dart @@ -18,8 +18,8 @@ // TODO(alexeyinkin): Refactor this, merge into snippet_editor.dart -import 
'package:code_text_field/code_text_field.dart'; import 'package:flutter/material.dart'; +import 'package:flutter_code_editor/flutter_code_editor.dart'; import '../models/example.dart'; import '../models/sdk.dart'; @@ -58,7 +58,7 @@ class EditorTextArea extends StatefulWidget { class _EditorTextAreaState extends State { var focusNode = FocusNode(); - final GlobalKey codeFieldKey = LabeledGlobalKey('CodeFieldKey'); + final GlobalKey _sizeKey = LabeledGlobalKey('CodeFieldKey'); @override void dispose() { @@ -82,16 +82,21 @@ class _EditorTextAreaState extends State { readOnly: widget.enabled, label: 'widgets.codeEditor.label', child: FocusScope( + key: _sizeKey, node: FocusScopeNode(canRequestFocus: widget.isEditable), child: CodeTheme( data: ext.codeTheme, - child: CodeField( - key: codeFieldKey, - focusNode: focusNode, - enabled: widget.enabled, - controller: widget.codeController, - textStyle: ext.codeRootStyle, - expands: true, + child: Container( + color: ext.codeTheme.styles['root']?.backgroundColor, + child: SingleChildScrollView( + child: CodeField( + key: ValueKey(widget.codeController), + focusNode: focusNode, + enabled: widget.enabled, + controller: widget.codeController, + textStyle: ext.codeRootStyle, + ), + ), ), ), ), @@ -137,9 +142,8 @@ class _EditorTextAreaState extends State { } int _getQntOfStringsOnScreen() { - RenderBox rBox = - codeFieldKey.currentContext?.findRenderObject() as RenderBox; - double height = rBox.size.height * .75; + final renderBox = _sizeKey.currentContext!.findRenderObject()! 
as RenderBox; + final height = renderBox.size.height * .75; return height ~/ codeFontSize; } diff --git a/playground/frontend/playground_components/pubspec.yaml b/playground/frontend/playground_components/pubspec.yaml index 1a4c91197a0c..7922c2bbec35 100644 --- a/playground/frontend/playground_components/pubspec.yaml +++ b/playground/frontend/playground_components/pubspec.yaml @@ -26,16 +26,13 @@ environment: dependencies: aligned_dialog: ^0.0.6 - code_text_field: - git: - url: https://github.com/BertrandBev/code_field.git - ref: 9e2c9fe52a69481f038f4b6609e8a0a776429437 collection: ^1.16.0 easy_localization: ^3.0.1 easy_localization_ext: ^0.1.1 easy_localization_loader: ^1.0.0 equatable: ^2.0.5 flutter: { sdk: flutter } + flutter_code_editor: ^0.1.3 flutter_markdown: ^0.6.12 flutter_svg: ^1.0.3 google_fonts: ^3.0.1 diff --git a/playground/frontend/pubspec.lock b/playground/frontend/pubspec.lock index 7b3186dbd6bd..6e7da5f916a1 100644 --- a/playground/frontend/pubspec.lock +++ b/playground/frontend/pubspec.lock @@ -50,6 +50,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "2.9.0" + autotrie: + dependency: transitive + description: + name: autotrie + url: "https://pub.dartlang.org" + source: hosted + version: "2.0.0" boolean_selector: dependency: transitive description: @@ -148,15 +155,6 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "4.1.0" - code_text_field: - dependency: transitive - description: - path: "." 
- ref: "9e2c9fe52a69481f038f4b6609e8a0a776429437" - resolved-ref: "9e2c9fe52a69481f038f4b6609e8a0a776429437" - url: "https://github.com/BertrandBev/code_field.git" - source: git - version: "1.0.3" collection: dependency: "direct main" description: @@ -274,6 +272,13 @@ packages: description: flutter source: sdk version: "0.0.0" + flutter_code_editor: + dependency: transitive + description: + name: flutter_code_editor + url: "https://pub.dartlang.org" + source: hosted + version: "0.1.3" flutter_highlight: dependency: transitive description: @@ -293,6 +298,13 @@ packages: description: flutter source: sdk version: "0.0.0" + flutter_markdown: + dependency: transitive + description: + name: flutter_markdown + url: "https://pub.dartlang.org" + source: hosted + version: "0.6.12" flutter_svg: dependency: "direct main" description: @@ -359,6 +371,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "0.7.0" + hive: + dependency: transitive + description: + name: hive + url: "https://pub.dartlang.org" + source: hosted + version: "2.2.3" html: dependency: transitive description: @@ -421,7 +440,7 @@ packages: name: json_annotation url: "https://pub.dartlang.org" source: hosted - version: "4.5.0" + version: "4.7.0" linked_scroll_controller: dependency: transitive description: @@ -443,6 +462,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "1.0.2" + markdown: + dependency: transitive + description: + name: markdown + url: "https://pub.dartlang.org" + source: hosted + version: "6.0.1" matcher: dependency: transitive description: @@ -639,6 +665,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: "1.2.0" + scrollable_positioned_list: + dependency: transitive + description: + name: scrollable_positioned_list + url: "https://pub.dartlang.org" + source: hosted + version: "0.3.5" shared_preferences: dependency: "direct main" description: @@ -777,6 +810,13 @@ packages: url: "https://pub.dartlang.org" source: hosted version: 
"1.0.0" + tuple: + dependency: transitive + description: + name: tuple + url: "https://pub.dartlang.org" + source: hosted + version: "2.0.1" typed_data: dependency: transitive description: diff --git a/release/build.gradle.kts b/release/build.gradle.kts index 7de4ab3af61a..ce895af80f8b 100644 --- a/release/build.gradle.kts +++ b/release/build.gradle.kts @@ -38,7 +38,7 @@ task("runJavaExamplesValidationTask") { description = "Run the Beam quickstart across all Java runners" dependsOn(":runners:direct-java:runQuickstartJavaDirect") dependsOn(":runners:google-cloud-dataflow-java:runQuickstartJavaDataflow") - dependsOn(":runners:spark:2:runQuickstartJavaSpark") + dependsOn(":runners:spark:3:runQuickstartJavaSpark") dependsOn(":runners:flink:1.13:runQuickstartJavaFlinkLocal") dependsOn(":runners:direct-java:runMobileGamingJavaDirect") dependsOn(":runners:google-cloud-dataflow-java:runMobileGamingJavaDataflow") diff --git a/release/src/main/scripts/mass_comment.py b/release/src/main/scripts/mass_comment.py index dde2fc7e8e04..cb60bf6d49d0 100644 --- a/release/src/main/scripts/mass_comment.py +++ b/release/src/main/scripts/mass_comment.py @@ -61,7 +61,6 @@ "Run Java Examples_Flink", "Run Java Examples_Spark", "Run Java Flink PortableValidatesRunner Streaming", - "Run Java Portability examples on Dataflow with Java 11", "Run Java PostCommit", "Run Java PreCommit", "Run Java Samza PortableValidatesRunner", diff --git a/runners/google-cloud-dataflow-java/build.gradle b/runners/google-cloud-dataflow-java/build.gradle index 8429bb40816a..b8f292df9f9d 100644 --- a/runners/google-cloud-dataflow-java/build.gradle +++ b/runners/google-cloud-dataflow-java/build.gradle @@ -55,7 +55,7 @@ processResources { 'dataflow.legacy_environment_major_version' : '8', 'dataflow.fnapi_environment_major_version' : '8', 'dataflow.legacy_container_version' : 'beam-master-20220816', - 'dataflow.fnapi_container_version' : 'beam-master-20220923', + 'dataflow.fnapi_container_version' : 
'beam-master-20221022', 'dataflow.container_base_repository' : 'gcr.io/cloud-dataflow/v1beta3', ] } diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunner.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunner.java index 7120696aa4f1..76dfe5b720d8 100644 --- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunner.java +++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunner.java @@ -18,7 +18,10 @@ package org.apache.beam.runners.samza.runtime; import java.util.Collection; +import java.util.Collections; +import java.util.Map; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.stream.Collectors; import org.apache.beam.runners.core.DoFnRunner; @@ -27,6 +30,8 @@ import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.util.WindowedValue; +import org.apache.beam.sdk.values.KV; +import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,30 +44,46 @@ public class AsyncDoFnRunner implements DoFnRunner { private static final Logger LOG = LoggerFactory.getLogger(AsyncDoFnRunner.class); + // A dummy key to represent null keys + private static final Object NULL_KEY = new Object(); + private final DoFnRunner underlying; private final ExecutorService executor; private final OpEmitter emitter; private final FutureCollector futureCollector; + private final boolean isStateful; + + /** + * This map keeps track of the last outputFutures for a certain key. When the next element of the + * key comes in, its outputFutures will be chained from the last outputFutures in the map. When + * all futures of a key have been complete, the key entry will be removed. 
The map is bounded by + * (bundle size * 2). + */ + private final Map>>> keyedOutputFutures; public static AsyncDoFnRunner create( DoFnRunner runner, OpEmitter emitter, FutureCollector futureCollector, + boolean isStateful, SamzaPipelineOptions options) { LOG.info("Run DoFn with " + AsyncDoFnRunner.class.getName()); - return new AsyncDoFnRunner<>(runner, emitter, futureCollector, options); + return new AsyncDoFnRunner<>(runner, emitter, futureCollector, isStateful, options); } private AsyncDoFnRunner( DoFnRunner runner, OpEmitter emitter, FutureCollector futureCollector, + boolean isStateful, SamzaPipelineOptions options) { this.underlying = runner; this.executor = options.getExecutorServiceForProcessElement(); this.emitter = emitter; this.futureCollector = futureCollector; + this.isStateful = isStateful; + this.keyedOutputFutures = new ConcurrentHashMap<>(); } @Override @@ -72,23 +93,59 @@ public void startBundle() { @Override public void processElement(WindowedValue elem) { - final CompletableFuture future = - CompletableFuture.runAsync( - () -> { - underlying.processElement(elem); - }, - executor); - final CompletableFuture>> outputFutures = - future.thenApply( - x -> - emitter.collectOutput().stream() - .map(OpMessage::getElement) - .collect(Collectors.toList())); + isStateful ? processStateful(elem) : processElement(elem, null); futureCollector.addAll(outputFutures); } + private CompletableFuture>> processElement( + WindowedValue elem, + @Nullable CompletableFuture>> prevOutputFuture) { + + final CompletableFuture>> prevFuture = + prevOutputFuture == null + ? 
CompletableFuture.completedFuture(Collections.emptyList()) + : prevOutputFuture; + + // For ordering by key, we chain the processing of the elem to the completion of + // the previous output of the same key + return prevFuture.thenApplyAsync( + x -> { + underlying.processElement(elem); + + return emitter.collectOutput().stream() + .map(OpMessage::getElement) + .collect(Collectors.toList()); + }, + executor); + } + + private CompletableFuture>> processStateful( + WindowedValue elem) { + final Object key = getKey(elem); + + final CompletableFuture>> outputFutures = + processElement(elem, keyedOutputFutures.get(key)); + + // Update the latest outputFuture for key + keyedOutputFutures.put(key, outputFutures); + + // Remove the outputFuture from the map once it's complete. + // This ensures the map will be cleaned up immediately. + return outputFutures.thenApply( + output -> { + // Under the condition that the outputFutures has not been updated + keyedOutputFutures.remove(key, outputFutures); + return output; + }); + } + + /** Package private for testing. */ + boolean hasOutputFuturesForKey(Object key) { + return keyedOutputFutures.containsKey(key); + } + @Override public void onTimer( String timerId, @@ -115,4 +172,14 @@ public void onWindowExpiration(BoundedWindow window, Instant timestamp, K public DoFn getFn() { return underlying.getFn(); } + + private Object getKey(WindowedValue elem) { + KV kv = (KV) elem.getValue(); + if (kv == null) { + return NULL_KEY; + } else { + Object key = kv.getKey(); + return key == null ? 
NULL_KEY : key; + } + } } diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java index eabcd87f5f36..c5353b0e2352 100644 --- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java +++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/OpAdapter.java @@ -159,12 +159,12 @@ public void close() { op.close(); } - private static class OpEmitterImpl implements OpEmitter { + static class OpEmitterImpl implements OpEmitter { private final Queue> outputQueue; private CompletionStage>> outputFuture; private Instant outputWatermark; - private OpEmitterImpl() { + OpEmitterImpl() { outputQueue = new ConcurrentLinkedQueue<>(); } diff --git a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java index 12872b82d8f7..ec1a9f365090 100644 --- a/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java +++ b/runners/samza/src/main/java/org/apache/beam/runners/samza/runtime/SamzaDoFnRunners.java @@ -153,7 +153,8 @@ public static DoFnRunner create( } return pipelineOptions.getNumThreadsForProcessElement() > 1 - ? AsyncDoFnRunner.create(doFnRunnerWithStates, emitter, futureCollector, pipelineOptions) + ? 
AsyncDoFnRunner.create( + doFnRunnerWithStates, emitter, futureCollector, keyedInternals != null, pipelineOptions) : doFnRunnerWithStates; } diff --git a/runners/samza/src/test/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunnerTest.java b/runners/samza/src/test/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunnerTest.java index d62a28b374f1..6d4ffc70d5a4 100644 --- a/runners/samza/src/test/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunnerTest.java +++ b/runners/samza/src/test/java/org/apache/beam/runners/samza/runtime/AsyncDoFnRunnerTest.java @@ -17,12 +17,22 @@ */ package org.apache.beam.runners.samza.runtime; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.mockito.Mockito.any; +import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.mock; + import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; +import org.apache.beam.runners.core.DoFnRunner; +import org.apache.beam.runners.samza.SamzaPipelineOptions; import org.apache.beam.sdk.coders.VarIntCoder; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.state.CombiningState; @@ -36,6 +46,7 @@ import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.util.WindowedValue; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.TypeDescriptors; @@ -116,11 +127,7 @@ public void processElement( return; } - // Need explicit synchronization here - synchronized (this) { - countState.add(1); - } - + countState.add(1); String key = c.element().getKey(); int n = countState.read(); if (n >= expectedCount.get(key)) { @@ 
-152,7 +159,7 @@ public void testPipelineWithAggregation() { KV.of("banana", 5L))); // TODO: remove after SAMZA-2761 fix - for (int i = 0; i < 20; i++) { + for (int i = 0; i < 50; i++) { input.add(KV.of("*", 0L)); } @@ -168,4 +175,61 @@ public void testPipelineWithAggregation() { pipeline.run(); } + + @Test + public void testKeyedOutputFutures() { + // We test the scenario that two elements of the same key needs to be processed in order. + final DoFnRunner, Void> doFnRunner = mock(DoFnRunner.class); + final AtomicInteger prev = new AtomicInteger(0); + final CountDownLatch latch = new CountDownLatch(1); + doAnswer( + invocation -> { + latch.await(); + WindowedValue> wv = invocation.getArgument(0); + Integer val = wv.getValue().getValue(); + + // Verify the previous element has been fully processed by checking the prev value + assertEquals(val - 1, prev.get()); + + prev.set(val); + return null; + }) + .when(doFnRunner) + .processElement(any()); + + SamzaPipelineOptions options = PipelineOptionsFactory.as(SamzaPipelineOptions.class); + options.setNumThreadsForProcessElement(4); + + final OpEmitter opEmitter = new OpAdapter.OpEmitterImpl<>(); + final FutureCollector futureCollector = new DoFnOp.FutureCollectorImpl<>(); + futureCollector.prepare(); + + final AsyncDoFnRunner, Void> asyncDoFnRunner = + AsyncDoFnRunner.create(doFnRunner, opEmitter, futureCollector, true, options); + + final String appleKey = "apple"; + + final WindowedValue> input1 = + WindowedValue.valueInGlobalWindow(KV.of(appleKey, 1)); + + final WindowedValue> input2 = + WindowedValue.valueInGlobalWindow(KV.of(appleKey, 2)); + + asyncDoFnRunner.processElement(input1); + asyncDoFnRunner.processElement(input2); + // Resume input1 process afterwards + latch.countDown(); + + // Waiting for the futures to be resolved + try { + futureCollector.finish().toCompletableFuture().get(); + } catch (Exception e) { + // ignore interruption here. 
+ } + + // The final val should be the last element value + assertEquals(2, prev.get()); + // The appleKey in keyedOutputFutures map should be removed + assertFalse(asyncDoFnRunner.hasOutputFuturesForKey(appleKey)); + } } diff --git a/runners/spark/3/build.gradle b/runners/spark/3/build.gradle index 3d59bd525c4b..494d367131b4 100644 --- a/runners/spark/3/build.gradle +++ b/runners/spark/3/build.gradle @@ -29,6 +29,9 @@ project.ext { // Load the main build script which contains all build logic. apply from: "$basePath/spark_runner.gradle" +// Generates runQuickstartJavaSpark task (can only support 1 version of Spark) +createJavaExamplesArchetypeValidationTask(type: 'Quickstart', runner: 'Spark') + // Additional supported Spark versions (used in compatibility tests) def sparkVersions = [ "330": "3.3.0", diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/TransformTranslator.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/TransformTranslator.java index d991a0d9148d..468fefe3fca3 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/TransformTranslator.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/TransformTranslator.java @@ -69,6 +69,17 @@ public final void translate( translate(transform, new Context(appliedTransform, cxt)); } + /** + * Checks if a composite / primitive transform can be translated. Composites that cannot be + * translated as is, will be exploded further for translation of their parts. + * + *

This should be overridden where necessary. If a transform is know to be unsupported, this + * should throw a runtime exception to give early feedback before any part of the pipeline is run. + */ + public boolean canTranslate(TransformT transform) { + return true; + } + protected class Context { private final AppliedPTransform> transform; private final TranslationContext cxt; diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTranslatorBatch.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTranslatorBatch.java new file mode 100644 index 000000000000..58fcf9b737d2 --- /dev/null +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTranslatorBatch.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.spark.structuredstreaming.translation.batch; + +import java.io.IOException; +import org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator; +import org.apache.beam.runners.spark.structuredstreaming.translation.utils.ScalaInterop.Fun1; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.Combine.CombineFn; +import org.apache.beam.sdk.transforms.CombineWithContext; +import org.apache.beam.sdk.util.WindowedValue; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.expressions.Aggregator; + +/** + * Translator for {@link Combine.GroupedValues} if the {@link CombineFn} doesn't require context / + * side-inputs. + * + *

This doesn't require a Spark {@link Aggregator}. Instead it can directly use the respective + * {@link CombineFn} to reduce each iterable of values into an aggregated output value. + */ +public class CombineGroupedValuesTranslatorBatch + extends TransformTranslator< + PCollection>>, + PCollection>, + Combine.GroupedValues> { + + @Override + protected void translate(Combine.GroupedValues transform, Context cxt) + throws IOException { + CombineFn combineFn = (CombineFn) transform.getFn(); + + Encoder>> enc = cxt.windowedEncoder(cxt.getOutput().getCoder()); + Dataset>>> inputDs = (Dataset) cxt.getDataset(cxt.getInput()); + + cxt.putDataset(cxt.getOutput(), inputDs.map(reduce(combineFn), enc)); + } + + @Override + public boolean canTranslate(Combine.GroupedValues transform) { + return !(transform.getFn() instanceof CombineWithContext); + } + + private static + Fun1>>, WindowedValue>> reduce( + CombineFn fn) { + return wv -> { + KV> kv = wv.getValue(); + AccT acc = null; + for (InT in : kv.getValue()) { + acc = fn.addInput(acc != null ? acc : fn.createAccumulator(), in); + } + OutT res = acc != null ? 
fn.extractOutput(acc) : fn.defaultValue(); + return wv.withValue(KV.of(kv.getKey(), res)); + }; + } +} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/PipelineTranslatorBatch.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/PipelineTranslatorBatch.java index 455cf4cce01a..1a635f24a798 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/PipelineTranslatorBatch.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/PipelineTranslatorBatch.java @@ -65,6 +65,8 @@ public class PipelineTranslatorBatch extends PipelineTranslator { TRANSFORM_TRANSLATORS.put(Impulse.class, new ImpulseTranslatorBatch()); TRANSFORM_TRANSLATORS.put(Combine.PerKey.class, new CombinePerKeyTranslatorBatch<>()); TRANSFORM_TRANSLATORS.put(Combine.Globally.class, new CombineGloballyTranslatorBatch<>()); + TRANSFORM_TRANSLATORS.put( + Combine.GroupedValues.class, new CombineGroupedValuesTranslatorBatch<>()); TRANSFORM_TRANSLATORS.put(GroupByKey.class, new GroupByKeyTranslatorBatch<>()); TRANSFORM_TRANSLATORS.put(Reshuffle.class, new ReshuffleTranslatorBatch<>()); @@ -98,6 +100,8 @@ TransformTranslator getTransformTranslator( if (transform == null) { return null; } - return TRANSFORM_TRANSLATORS.get(transform.getClass()); + TransformTranslator translator = + TRANSFORM_TRANSLATORS.get(transform.getClass()); + return translator != null && translator.canTranslate(transform) ? 
translator : null; } } diff --git a/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTest.java b/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTest.java new file mode 100644 index 000000000000..cce3199d2c37 --- /dev/null +++ b/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/translation/batch/CombineGroupedValuesTest.java @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.runners.spark.structuredstreaming.translation.batch; + +import java.io.Serializable; +import org.apache.beam.runners.spark.structuredstreaming.SparkStructuredStreamingPipelineOptions; +import org.apache.beam.runners.spark.structuredstreaming.SparkStructuredStreamingRunner; +import org.apache.beam.sdk.coders.IterableCoder; +import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.coders.VarIntCoder; +import org.apache.beam.sdk.options.PipelineOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Combine; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Sum; +import org.apache.beam.sdk.values.KV; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +/** Test class for beam to spark {@link Combine#groupedValues} translation. 
*/ +@RunWith(JUnit4.class) +public class CombineGroupedValuesTest implements Serializable { + @Rule public transient TestPipeline pipeline = TestPipeline.fromOptions(testOptions()); + + private static PipelineOptions testOptions() { + SparkStructuredStreamingPipelineOptions options = + PipelineOptionsFactory.create().as(SparkStructuredStreamingPipelineOptions.class); + options.setRunner(SparkStructuredStreamingRunner.class); + options.setTestMode(true); + return options; + } + + @Test + public void testCombineGroupedValues() { + PCollection> input = + pipeline + .apply( + Create.>>of( + KV.of("a", ImmutableList.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)), + KV.of("b", ImmutableList.of())) + .withCoder( + KvCoder.of(StringUtf8Coder.of(), IterableCoder.of(VarIntCoder.of())))) + .apply(Combine.groupedValues(Sum.ofIntegers())); + + PAssert.that(input).containsInAnyOrder(KV.of("a", 55), KV.of("b", 0)); + pipeline.run(); + } +} diff --git a/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/SerializationDebugger.java b/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/SerializationDebugger.java deleted file mode 100644 index b384b9b9d35d..000000000000 --- a/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/SerializationDebugger.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.utils; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.io.OutputStream; -import java.lang.reflect.Field; -import java.util.ArrayList; -import java.util.List; - -/** A {@code SerializationDebugger} for Spark Runner. */ -public class SerializationDebugger { - - public static void testSerialization(Object object, File to) throws IOException { - DebuggingObjectOutputStream out = new DebuggingObjectOutputStream(new FileOutputStream(to)); - try { - out.writeObject(object); - } catch (Exception e) { - throw new RuntimeException("Serialization error. Path to bad object: " + out.getStack(), e); - } - } - - private static class DebuggingObjectOutputStream extends ObjectOutputStream { - - private static final Field DEPTH_FIELD; - - static { - try { - DEPTH_FIELD = ObjectOutputStream.class.getDeclaredField("depth"); - DEPTH_FIELD.setAccessible(true); - } catch (NoSuchFieldException e) { - throw new AssertionError(e); - } - } - - final List stack = new ArrayList<>(); - - /** - * Indicates whether or not OOS has tried to write an IOException (presumably as the result of a - * serialization error) to the stream. - */ - boolean broken = false; - - DebuggingObjectOutputStream(OutputStream out) throws IOException { - super(out); - enableReplaceObject(true); - } - - /** Abuse {@code replaceObject()} as a hook to maintain our stack. 
*/ - @Override - protected Object replaceObject(Object o) { - // ObjectOutputStream writes serialization - // exceptions to the stream. Ignore - // everything after that so we don't lose - // the path to a non-serializable object. So - // long as the user doesn't write an - // IOException as the root object, we're OK. - int currentDepth = currentDepth(); - if (o instanceof IOException && currentDepth == 0) { - broken = true; - } - if (!broken) { - truncate(currentDepth); - stack.add(o); - } - return o; - } - - private void truncate(int depth) { - while (stack.size() > depth) { - pop(); - } - } - - private Object pop() { - return stack.remove(stack.size() - 1); - } - - /** Returns a 0-based depth within the object graph of the current object being serialized. */ - private int currentDepth() { - try { - Integer oneBased = ((Integer) DEPTH_FIELD.get(this)); - return oneBased - 1; - } catch (IllegalAccessException e) { - throw new AssertionError(e); - } - } - - /** - * Returns the path to the last object serialized. If an exception occurred, this should be the - * path to the non-serializable object. - */ - List getStack() { - return stack; - } - } -} diff --git a/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/package-info.java b/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/package-info.java deleted file mode 100644 index 3d7da111a9c4..000000000000 --- a/runners/spark/3/src/test/java/org/apache/beam/runners/spark/structuredstreaming/utils/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Testing utils for spark structured streaming runner. */ -package org.apache.beam.runners.spark.structuredstreaming.utils; diff --git a/runners/spark/spark_runner.gradle b/runners/spark/spark_runner.gradle index 14a433162fb6..f8c0a061b0d7 100644 --- a/runners/spark/spark_runner.gradle +++ b/runners/spark/spark_runner.gradle @@ -152,15 +152,16 @@ dependencies { implementation project(":sdks:java:fn-execution") implementation library.java.vendored_grpc_1_48_1 implementation library.java.vendored_guava_26_0_jre - implementation "io.dropwizard.metrics:metrics-core:3.1.5" // version used by Spark 2.4 spark.components.each { component -> provided "$component:$spark_version" } permitUnusedDeclared "org.apache.spark:spark-network-common_$spark_scala_version:$spark_version" if (project.property("spark_scala_version").equals("2.11")) { + implementation "io.dropwizard.metrics:metrics-core:3.1.5" // version used by Spark 2.4 compileOnly "org.scala-lang:scala-library:2.11.12" runtimeOnly library.java.jackson_module_scala_2_11 } else { + implementation "io.dropwizard.metrics:metrics-core:4.1.1" // version used by Spark 3.1 compileOnly "org.scala-lang:scala-library:2.12.15" runtimeOnly library.java.jackson_module_scala_2_12 } @@ -385,9 +386,6 @@ tasks.register("validatesRunner") { //dependsOn validatesStructuredStreamingRunnerBatch } -// Generates :runners:spark:*:runQuickstartJavaSpark task -createJavaExamplesArchetypeValidationTask(type: 'Quickstart', runner: 'Spark') - tasks.register("hadoopVersionsTest") { group = "Verification" dependsOn 
hadoopVersions.collect{k,v -> "hadoopVersion${k}Test"} diff --git a/scripts/ci/ci_check_git_branch.sh b/scripts/ci/ci_check_git_branch.sh old mode 100644 new mode 100755 index e7bff3e77998..8040664c4885 --- a/scripts/ci/ci_check_git_branch.sh +++ b/scripts/ci/ci_check_git_branch.sh @@ -16,18 +16,22 @@ # specific language governing permissions and limitations # under the License. -function is_branch() { - #if nothing matches show-ref will return an error code of 1 - if git show-ref --quiet --verify -- "refs/heads/$1" ; then - return 1 +function is_in_remote() { + local branch=${1} + local existed_in_remote=$(git ls-remote --heads origin ${branch}) + + if [[ -z ${existed_in_remote} ]]; then + return 1 else - return 0 + return 0 fi } -if is_branch "$1"; then +if ! is_in_remote "$1"; then echo "Branch [$1] doesn't exist." + exit 0 else echo "Branch [$1] already exists!" - echo >&2 "Please make sure your branch doesn't exist." + echo "Please make sure your branch doesn't exist." + exit 1 fi diff --git a/sdks/go.mod b/sdks/go.mod index 438ffe1b5b6f..27670370ecf2 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,10 +23,10 @@ module github.com/apache/beam/sdks/v2 go 1.18 require ( - cloud.google.com/go/bigquery v1.42.0 + cloud.google.com/go/bigquery v1.43.0 cloud.google.com/go/datastore v1.8.0 cloud.google.com/go/profiler v0.3.0 - cloud.google.com/go/pubsub v1.25.1 + cloud.google.com/go/pubsub v1.26.0 cloud.google.com/go/storage v1.27.0 github.com/docker/go-connections v0.4.0 github.com/go-sql-driver/mysql v1.6.0 @@ -37,17 +37,17 @@ require ( github.com/linkedin/goavro v2.1.0+incompatible github.com/nightlyone/lockfile v1.0.0 github.com/proullon/ramsql v0.0.0-20211120092837-c8d0a408b939 - github.com/spf13/cobra v1.6.0 + github.com/spf13/cobra v1.6.1 github.com/testcontainers/testcontainers-go v0.14.0 github.com/xitongsys/parquet-go v1.6.2 github.com/xitongsys/parquet-go-source v0.0.0-20220315005136-aec0fe3e777c - golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458 - 
golang.org/x/oauth2 v0.0.0-20221006150949-b44042a4b9c1 - golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0 + golang.org/x/net v0.0.0-20221014081412-f15817d10f9b + golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 + golang.org/x/sync v0.1.0 golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 golang.org/x/text v0.4.0 - google.golang.org/api v0.99.0 - google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e + google.golang.org/api v0.101.0 + google.golang.org/genproto v0.0.0-20221018160656-63c7b68cfc55 google.golang.org/grpc v1.50.1 google.golang.org/protobuf v1.28.1 gopkg.in/retry.v1 v1.0.3 @@ -57,7 +57,7 @@ require ( require ( cloud.google.com/go v0.104.0 // indirect cloud.google.com/go/compute v1.10.0 // indirect - cloud.google.com/go/iam v0.3.0 // indirect + cloud.google.com/go/iam v0.5.0 // indirect github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 // indirect github.com/Microsoft/go-winio v0.5.2 // indirect github.com/Microsoft/hcsshim v0.9.4 // indirect @@ -74,7 +74,7 @@ require ( github.com/golang/snappy v0.0.4 // indirect github.com/google/pprof v0.0.0-20220412212628-83db2b799d1f // indirect github.com/googleapis/enterprise-certificate-proxy v0.2.0 // indirect - github.com/googleapis/gax-go/v2 v2.5.1 // indirect + github.com/googleapis/gax-go/v2 v2.6.0 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/klauspost/compress v1.13.1 // indirect github.com/magiconair/properties v1.8.6 // indirect diff --git a/sdks/go.sum b/sdks/go.sum index a129517f4dfe..55d19289d8e6 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -38,8 +38,8 @@ cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvf cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= -cloud.google.com/go/bigquery 
v1.42.0 h1:JuTk8po4bCKRwObdT0zLb1K0BGkGHJdtgs2GK3j2Gws= -cloud.google.com/go/bigquery v1.42.0/go.mod h1:8dRTJxhtG+vwBKzE5OseQn/hiydoQN3EedCaOdYmxRA= +cloud.google.com/go/bigquery v1.43.0 h1:u0fvz5ysJBe1jwUPI4LuPwAX+o+6fCUwf3ECeg6eDUQ= +cloud.google.com/go/bigquery v1.43.0/go.mod h1:ZMQcXHsl+xmU1z36G2jNGZmKp9zNY5BUua5wDgmNCfw= cloud.google.com/go/compute v0.1.0/go.mod h1:GAesmwr110a34z04OlxYkATPBEfVhkymfTBXtfbBFow= cloud.google.com/go/compute v1.3.0/go.mod h1:cCZiE1NHEtai4wiufUhW8I8S1JKkAnhnQJWM7YD99wM= cloud.google.com/go/compute v1.5.0/go.mod h1:9SMHyhJlzhlkJqrPAc839t2BZFTSk6Jdj6mkzQJeu0M= @@ -53,8 +53,9 @@ cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1 cloud.google.com/go/datastore v1.8.0 h1:2qo2G7hABSeqswa+5Ga3+QB8/ZwKOJmDsCISM9scmsU= cloud.google.com/go/datastore v1.8.0/go.mod h1:q1CpHVByTlXppdqTcu4LIhCsTn3fhtZ5R7+TajciO+M= cloud.google.com/go/firestore v1.1.0/go.mod h1:ulACoGHTpvq5r8rxGJ4ddJZBZqakUQqClKRT5SZwBmk= -cloud.google.com/go/iam v0.3.0 h1:exkAomrVUuzx9kWFI1wm3KI0uoDeUFPB4kKGzx6x+Gc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= +cloud.google.com/go/iam v0.5.0 h1:fz9X5zyTWBmamZsqvqZqD7khbifcZF/q+Z1J8pfhIUg= +cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= cloud.google.com/go/kms v1.4.0 h1:iElbfoE61VeLhnZcGOltqL8HIly8Nhbe5t6JlH9GXjo= cloud.google.com/go/profiler v0.3.0 h1:R6y/xAeifaUXxd2x6w+jIwKxoKl8Cv5HJvcvASTPWJo= cloud.google.com/go/profiler v0.3.0/go.mod h1:9wYk9eY4iZHsev8TQb61kh3wiOiSyz/xOYixWPzweCU= @@ -62,8 +63,8 @@ cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2k cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= -cloud.google.com/go/pubsub v1.25.1 
h1:l0wCNZKuEp2Q54wAy8283EV9O57+7biWOXnnU2/Tq/A= -cloud.google.com/go/pubsub v1.25.1/go.mod h1:bY6l7rF8kCcwz6V3RaQ6kK4p5g7qc7PqjRoE9wDOqOU= +cloud.google.com/go/pubsub v1.26.0 h1:Y/HcMxVXgkUV2pYeLMUkclMg0ue6U0jVyI5xEARQ4zA= +cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcdcPRnFIRI= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= @@ -587,8 +588,8 @@ github.com/googleapis/gax-go/v2 v2.1.1/go.mod h1:hddJymUZASv3XPyGkUpKj8pPO47Rmb0 github.com/googleapis/gax-go/v2 v2.2.0/go.mod h1:as02EH8zWkzwUoLbBaFeQ+arQaj/OthfcblKl4IGNaM= github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99EXz9pXxye9YM= github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= -github.com/googleapis/gax-go/v2 v2.5.1 h1:kBRZU0PSuI7PspsSb/ChWoVResUcwNVIdpB049pKTiw= -github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo= +github.com/googleapis/gax-go/v2 v2.6.0 h1:SXk3ABtQYDT/OH8jAyvEOQ58mgawq5C4o/4/89qN2ZU= +github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= github.com/googleapis/gnostic v0.4.1/go.mod h1:LRhVm6pbyptWbWbuZ38d1eyptfvIytN3ir6b65WBswg= github.com/googleapis/gnostic v0.5.1/go.mod h1:6U4PtQXGIEt/Z3h5MAT7FNofLnw9vXk2cUuW7uA/OeU= github.com/googleapis/gnostic v0.5.5/go.mod h1:7+EbHbldMins07ALC74bsA81Ovc97DwqyJO1AENw9kA= @@ -928,8 +929,8 @@ github.com/spf13/cobra v0.0.2-0.20171109065643-2da4a54c5cee/go.mod h1:1l0Ry5zgKv github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= github.com/spf13/cobra v1.1.3/go.mod h1:pGADOWyqRD/YMrPZigI/zbliZ2wVD/23d+is3pSWzOo= -github.com/spf13/cobra v1.6.0 
h1:42a0n6jwCot1pUmomAp4T7DeMD+20LFv4Q54pxLf2LI= -github.com/spf13/cobra v1.6.0/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= +github.com/spf13/cobra v1.6.1 h1:o94oiPyS4KD1mPy2fmcYYHHfCxLqYjJOhGsCHFZtEzA= +github.com/spf13/cobra v1.6.1/go.mod h1:IOw/AERYS7UzyrGinqmz6HLUo219MORXGxhbaJUqzrY= github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= github.com/spf13/pflag v0.0.0-20170130214245-9ff6c6923cff/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= github.com/spf13/pflag v1.0.1-0.20171106142849-4c012f6dcd95/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= @@ -1176,8 +1177,8 @@ golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220617184016-355a448f1bc9/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458 h1:MgJ6t2zo8v0tbmLCueaCbF1RM+TtB0rs3Lv8DGtOIpY= -golang.org/x/net v0.0.0-20221012135044-0b7e1fb9d458/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= +golang.org/x/net v0.0.0-20221014081412-f15817d10f9b h1:tvrvnPFcdzp294diPnrdZZZ8XUt2Tyj7svb7X52iDuU= +golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1198,8 +1199,8 @@ golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a/go.mod 
h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220411215720-9780585627b5/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220608161450-d0670ef3b1eb/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= -golang.org/x/oauth2 v0.0.0-20221006150949-b44042a4b9c1 h1:3VPzK7eqH25j7GYw5w6g/GzNRc0/fYtrxz27z1gD4W0= -golang.org/x/oauth2 v0.0.0-20221006150949-b44042a4b9c1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783 h1:nt+Q6cXKz4MosCSpnbMtqiQ8Oz0pxTef2B4Vca2lvfk= +golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -1212,8 +1213,8 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0 h1:cu5kTvlzcw1Q5S9f5ip1/cpiB4nXvw1XYzFPGgzLUOY= -golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -1361,7 +1362,7 @@ golang.org/x/time v0.0.0-20200416051211-89c76fbcd5d1/go.mod h1:tRJNPiyCQ0inRvYxb golang.org/x/time v0.0.0-20200630173020-3af7569d3a1e/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20210220033141-f8bda1e9f3ba/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20210723032227-1f47c861a9ac/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220609170525-579cf78fd858 h1:Dpdu/EMxGMFgq0CeYMh4fazTD2vtlZRYE7wyynxJb9U= +golang.org/x/time v0.0.0-20220922220347-f3bd1da661af h1:Yx9k8YCG3dvF87UAn2tu2HQLf2dt/eR1bXxpLMWeH+Y= golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20181011042414-1f849cf54d09/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -1476,8 +1477,8 @@ google.golang.org/api v0.74.0/go.mod h1:ZpfMZOVRMywNyvJFeqL9HRWBgAuRfSjJFpe9QtRR google.golang.org/api v0.75.0/go.mod h1:pU9QmyHLnzlpar1Mjt4IbapUCy8J+6HD6GeELN69ljA= google.golang.org/api v0.78.0/go.mod h1:1Sg78yoMLOhlQTeF+ARBoytAcH1NNyyl390YMy6rKmw= google.golang.org/api v0.84.0/go.mod h1:NTsGnUFJMYROtiquksZHBWtHfeMC7iYthki7Eq3pa8o= -google.golang.org/api v0.99.0 h1:tsBtOIklCE2OFxhmcYSVqGwSAN/Y897srxmcvAQnwK8= -google.golang.org/api v0.99.0/go.mod h1:1YOf74vkVndF7pG6hIHuINsM7eWwpVTAfNMNiL91A08= +google.golang.org/api v0.101.0 h1:lJPPeEBIRxGpGLwnBTam1NPEM8Z2BmmXEd3z812pjwM= +google.golang.org/api v0.101.0/go.mod h1:CjxAAWWt3A3VrUE2IGDY2bgK5qhoG/OkyWVlYcP05MY= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= 
google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -1570,8 +1571,8 @@ google.golang.org/genproto v0.0.0-20220505152158-f39f71e6c8f3/go.mod h1:RAyBrSAP google.golang.org/genproto v0.0.0-20220518221133-4f43b3371335/go.mod h1:RAyBrSAP7Fh3Nc84ghnVLDPuV51xc9agzmm4Ph6i0Q4= google.golang.org/genproto v0.0.0-20220608133413-ed9918b62aac/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= -google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e h1:halCgTFuLWDRD61piiNSxPsARANGD3Xl16hPrLgLiIg= -google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqwhZAwq4wsRUaVG555sVgsNmIjRtO7t/JH29U= +google.golang.org/genproto v0.0.0-20221018160656-63c7b68cfc55 h1:U1u4KB2kx6KR/aJDjQ97hZ15wQs8ZPvDcGcRynBhkvg= +google.golang.org/genproto v0.0.0-20221018160656-63c7b68cfc55/go.mod h1:45EK0dUbEZ2NHjCeAd2LXmyjAgGUGrpGROgjhC3ADck= google.golang.org/grpc v0.0.0-20160317175043-d3ddb4469d5a/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= diff --git a/sdks/go/pkg/beam/core/metrics/dumper_test.go b/sdks/go/pkg/beam/core/metrics/dumper_test.go new file mode 100644 index 000000000000..e618354eda8c --- /dev/null +++ b/sdks/go/pkg/beam/core/metrics/dumper_test.go @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "fmt" + "testing" + "time" + + "github.com/google/go-cmp/cmp" +) + +func TestDumperExtractor(t *testing.T) { + var got []string + printer := func(format string, args ...interface{}) { + got = append(got, fmt.Sprintf(format, args...)) + } + + store := newStore() + now := time.Date(2019, 1, 1, 0, 0, 0, 0, time.UTC) + store.storeMetric("pid", newName("ns", "counter"), &counter{value: 1}) + store.storeMetric("pid", newName("ns", "distribution"), &distribution{count: 1, sum: 2, min: 3, max: 4}) + store.storeMetric("pid", newName("ns", "gauge"), &gauge{v: 1, t: now}) + + expected := []string{ + "PTransformID: \"pid\"", + " ns.counter - value: 1", + " ns.distribution - count: 1 sum: 2 min: 3 max: 4", + " ns.gauge - Gauge time: 2019-01-01 00:00:00 +0000 UTC value: 1", + } + + dumperExtractor(store, printer) + if diff := cmp.Diff(expected, got); diff != "" { + t.Errorf("dumperExtractor() got diff (-want +got): %v", diff) + } +} diff --git a/sdks/go/pkg/beam/core/metrics/metrics_test.go b/sdks/go/pkg/beam/core/metrics/metrics_test.go index ff3141d748aa..75b483184ab2 100644 --- a/sdks/go/pkg/beam/core/metrics/metrics_test.go +++ b/sdks/go/pkg/beam/core/metrics/metrics_test.go @@ -400,6 +400,53 @@ func TestMergeDistributions(t *testing.T) { } } +func TestMergePCols(t *testing.T) { + realKey := StepKey{Name: "real"} + pColA := PColValue{ElementCount: 1, SampledByteSize: DistributionValue{Count: 2, Sum: 3, Min: 4, Max: 5}} + pColB := PColValue{ElementCount: 5, SampledByteSize: DistributionValue{Count: 4, Sum: 3, Min: 2, Max: 1}} + 
tests := []struct { + name string + attempted, committed map[StepKey]PColValue + want []PColResult + }{ + { + name: "merge", + attempted: map[StepKey]PColValue{ + realKey: pColA, + }, + committed: map[StepKey]PColValue{ + realKey: pColB, + }, + want: []PColResult{{Attempted: pColA, Committed: pColB, Key: realKey}}, + }, { + name: "attempted only", + attempted: map[StepKey]PColValue{ + realKey: pColA, + }, + committed: map[StepKey]PColValue{}, + want: []PColResult{{Attempted: pColA, Key: realKey}}, + }, { + name: "committed only", + attempted: map[StepKey]PColValue{}, + committed: map[StepKey]PColValue{ + realKey: pColB, + }, + want: []PColResult{{Committed: pColB, Key: realKey}}, + }, + } + less := func(a, b DistributionResult) bool { + return a.Key.Name < b.Key.Name + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := MergePCols(test.attempted, test.committed) + if d := cmp.Diff(test.want, got, cmpopts.SortSlices(less)); d != "" { + t.Errorf("MergePCols(%+v, %+v) = %+v, want %+v\ndiff:\n%v", test.attempted, test.committed, got, test.want, d) + } + }) + } +} + func TestMergeGauges(t *testing.T) { realKey := StepKey{Name: "real"} now := time.Now() @@ -449,6 +496,53 @@ func TestMergeGauges(t *testing.T) { } } +func TestMergeMsecs(t *testing.T) { + realKey := StepKey{Name: "real"} + msecA := MsecValue{Start: time.Second, Process: 2 * time.Second, Finish: time.Second, Total: 4 * time.Second} + msecB := MsecValue{Start: 2 * time.Second, Process: time.Second, Finish: 2 * time.Second, Total: 5 * time.Second} + tests := []struct { + name string + attempted, committed map[StepKey]MsecValue + want []MsecResult + }{ + { + name: "merge", + attempted: map[StepKey]MsecValue{ + realKey: msecA, + }, + committed: map[StepKey]MsecValue{ + realKey: msecB, + }, + want: []MsecResult{{Attempted: msecA, Committed: msecB, Key: realKey}}, + }, { + name: "attempted only", + attempted: map[StepKey]MsecValue{ + realKey: msecA, + }, + committed: 
map[StepKey]MsecValue{}, + want: []MsecResult{{Attempted: msecA, Key: realKey}}, + }, { + name: "committed only", + attempted: map[StepKey]MsecValue{}, + committed: map[StepKey]MsecValue{ + realKey: msecB, + }, + want: []MsecResult{{Committed: msecB, Key: realKey}}, + }, + } + less := func(a, b DistributionResult) bool { + return a.Key.Name < b.Key.Name + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got := MergeMsecs(test.attempted, test.committed) + if d := cmp.Diff(test.want, got, cmpopts.SortSlices(less)); d != "" { + t.Errorf("MergeMsecs(%+v, %+v) = %+v, want %+v\ndiff:\n%v", test.attempted, test.committed, got, test.want, d) + } + }) + } +} + func TestMsecQueryResult(t *testing.T) { realKey := StepKey{Step: "sumFn"} msecA := MsecValue{Start: 0, Process: 0, Finish: 0, Total: 0} diff --git a/sdks/go/pkg/beam/core/metrics/store_test.go b/sdks/go/pkg/beam/core/metrics/store_test.go new file mode 100644 index 000000000000..dcdadad74292 --- /dev/null +++ b/sdks/go/pkg/beam/core/metrics/store_test.go @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package metrics + +import ( + "reflect" + "testing" + "time" +) + +func TestStore(t *testing.T) { + store := newStore() + + m := make(map[Labels]interface{}) + e := &Extractor{ + SumInt64: func(l Labels, v int64) { + m[l] = &counter{value: v} + }, + DistributionInt64: func(l Labels, count, sum, min, max int64) { + m[l] = &distribution{count: count, sum: sum, min: min, max: max} + }, + GaugeInt64: func(l Labels, v int64, t time.Time) { + m[l] = &gauge{v: v, t: t} + }, + MsecsInt64: func(labels string, e *[4]ExecutionState) {}, + } + + now := time.Now() + + store.storeMetric("pid", newName("ns", "counter"), &counter{value: 1}) + store.storeMetric("pid", newName("ns", "distribution"), &distribution{count: 1, sum: 2, min: 3, max: 4}) + store.storeMetric("pid", newName("ns", "gauge"), &gauge{v: 1, t: now}) + + // storing the same metric twice doesn't change anything + store.storeMetric("pid", newName("ns", "counter"), &counter{value: 2}) + + err := e.ExtractFrom(store) + if err != nil { + t.Fatalf("e.ExtractFrom(store) = %q, want nil", err) + } + + expected := map[Labels]interface{}{ + {transform: "pid", namespace: "ns", name: "counter"}: &counter{value: 1}, + {transform: "pid", namespace: "ns", name: "distribution"}: &distribution{count: 1, sum: 2, min: 3, max: 4}, + {transform: "pid", namespace: "ns", name: "gauge"}: &gauge{v: 1, t: now}, + } + if !reflect.DeepEqual(m, expected) { + t.Errorf("e.ExtractFrom(store) = %v, want %v", m, expected) + } +} diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go index 7ca516b56944..c5bf708fa76f 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema.go @@ -92,6 +92,7 @@ func getUUID(ut reflect.Type) string { // Registered returns whether the given type has been registered with // the schema package. 
func (r *Registry) Registered(ut reflect.Type) bool { + r.reconcileRegistrations() _, ok := r.syntheticToUser[ut] return ok } @@ -118,7 +119,10 @@ func (r *Registry) reconcileRegistrations() (deferedErr error) { check := func(ut reflect.Type) bool { return coder.LookupCustomCoder(ut) != nil } - if check(ut) || check(reflect.PtrTo(ut)) { + // We could have either a pointer or non pointer here, + // so we strip pointerness and then check both. + vT := reflectx.SkipPtr(ut) + if check(vT) && check(reflect.PtrTo(vT)) { continue } if err := r.registerType(ut, map[reflect.Type]struct{}{}); err != nil { diff --git a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go index 5ef2f707280a..37b3e79f8f50 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/schema/schema_test.go @@ -26,6 +26,7 @@ import ( pipepb "github.com/apache/beam/sdks/v2/go/pkg/beam/model/pipeline_v1" "github.com/golang/protobuf/proto" "github.com/google/go-cmp/cmp" + "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/testing/protocmp" ) @@ -792,6 +793,7 @@ func TestSchemaConversion(t *testing.T) { // real embedded type. 
if !hasEmbeddedField(test.rt) && !test.rt.AssignableTo(got) { t.Errorf("%v not assignable to %v", test.rt, got) + t.Errorf("%v for schema %v", test.rt, prototext.Format(test.st)) if d := cmp.Diff(reflect.New(test.rt).Elem().Interface(), reflect.New(got).Elem().Interface()); d != "" { t.Errorf("diff (-want, +got): %v", d) } diff --git a/sdks/go/pkg/beam/core/runtime/xlangx/expansionx/process.go b/sdks/go/pkg/beam/core/runtime/xlangx/expansionx/process.go index 28dc3294f44f..590c9392a991 100644 --- a/sdks/go/pkg/beam/core/runtime/xlangx/expansionx/process.go +++ b/sdks/go/pkg/beam/core/runtime/xlangx/expansionx/process.go @@ -58,7 +58,7 @@ func NewExpansionServiceRunner(jarPath, servicePort string) (*ExpansionServiceRu return &ExpansionServiceRunner{execPath: jarPath, servicePort: servicePort, serviceCommand: serviceCommand}, nil } -// NewExpansionServiceRunner builds an ExpansionServiceRunner struct for a given python module and +// NewPyExpansionServiceRunner builds an ExpansionServiceRunner struct for a given python module and // Beam version and returns a pointer to it. Passing an empty string as servicePort will request an // open port to be assigned to the service. 
func NewPyExpansionServiceRunner(pythonExec, module, servicePort string) (*ExpansionServiceRunner, error) { diff --git a/sdks/go/pkg/beam/register/emitter.go b/sdks/go/pkg/beam/register/emitter.go index 6c88d28d9fce..3b9cb9910d25 100644 --- a/sdks/go/pkg/beam/register/emitter.go +++ b/sdks/go/pkg/beam/register/emitter.go @@ -129,7 +129,9 @@ func Emitter1[T1 any]() { registerFunc := func(n exec.ElementProcessor) exec.ReusableEmitter { return &emit1[T1]{n: n} } - exec.RegisterEmitter(reflect.TypeOf(e).Elem(), registerFunc) + eT := reflect.TypeOf(e).Elem() + registerType(eT.In(0)) + exec.RegisterEmitter(eT, registerFunc) } // Emitter2 registers parameters from your DoFn with a @@ -147,18 +149,25 @@ func Emitter2[T1, T2 any]() { return &emit1WithTimestamp[T2]{n: n} } } - exec.RegisterEmitter(reflect.TypeOf(e).Elem(), registerFunc) + eT := reflect.TypeOf(e).Elem() + registerType(eT.In(0)) + registerType(eT.In(1)) + exec.RegisterEmitter(eT, registerFunc) } // Emitter3 registers parameters from your DoFn with a -// signature func(T1, T2, T3) and optimizes their execution. +// signature func(beam.EventTime, T2, T3) and optimizes their execution. // This must be done by passing in type parameters of all inputs as constraints, // aka: register.Emitter3[beam.EventTime, T1, T2](), where T1 is the type of // your key and T2 is the type of your value. -func Emitter3[T1 typex.EventTime, T2, T3 any]() { - e := (*func(T1, T2, T3))(nil) +func Emitter3[ET typex.EventTime, T1, T2 any]() { + e := (*func(ET, T1, T2))(nil) registerFunc := func(n exec.ElementProcessor) exec.ReusableEmitter { - return &emit2WithTimestamp[T2, T3]{n: n} + return &emit2WithTimestamp[T1, T2]{n: n} } - exec.RegisterEmitter(reflect.TypeOf(e).Elem(), registerFunc) + eT := reflect.TypeOf(e).Elem() + // No need to register event time. 
+ registerType(eT.In(1)) + registerType(eT.In(2)) + exec.RegisterEmitter(eT, registerFunc) } diff --git a/sdks/go/pkg/beam/register/emitter_test.go b/sdks/go/pkg/beam/register/emitter_test.go new file mode 100644 index 000000000000..32a45f5da9e4 --- /dev/null +++ b/sdks/go/pkg/beam/register/emitter_test.go @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package register + +import ( + "context" + "reflect" + "testing" + + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex" +) + +type myTestTypeEmitter1 struct { + Int int +} + +func TestEmitter1(t *testing.T) { + Emitter1[int]() + e1T := reflect.TypeOf((*func(int))(nil)).Elem() + if !exec.IsEmitterRegistered(e1T) { + t.Fatalf("exec.IsEmitterRegistered(%v) = false, want true", e1T) + } + + Emitter1[myTestTypeEmitter1]() + rt := reflect.TypeOf((*myTestTypeEmitter1)(nil)).Elem() + checkRegisterations(t, rt) +} + +type myTestTypeEmitter2A struct { + Int int +} + +type myTestTypeEmitter2B struct { + String string +} + +func TestEmitter2(t *testing.T) { + Emitter2[int, string]() + e2isT := reflect.TypeOf((*func(int, string))(nil)).Elem() + if !exec.IsEmitterRegistered(e2isT) { + t.Fatalf("exec.IsEmitterRegistered(%v) = false, want true", e2isT) + } + + Emitter2[*myTestTypeEmitter2A, myTestTypeEmitter2B]() + e2ABT := reflect.TypeOf((*func(*myTestTypeEmitter2A, myTestTypeEmitter2B))(nil)).Elem() + if !exec.IsEmitterRegistered(e2ABT) { + t.Fatalf("exec.IsEmitterRegistered(%v) = false, want true", e2ABT) + } + + tA := reflect.TypeOf((*myTestTypeEmitter2A)(nil)).Elem() + checkRegisterations(t, tA) + tB := reflect.TypeOf((*myTestTypeEmitter2B)(nil)).Elem() + checkRegisterations(t, tB) +} + +func TestEmitter2_WithTimestamp(t *testing.T) { + Emitter2[typex.EventTime, string]() + e2tssT := reflect.TypeOf((*func(typex.EventTime, string))(nil)).Elem() + if !exec.IsEmitterRegistered(e2tssT) { + t.Fatalf("exec.IsEmitterRegistered(%v) = false, want true", e2tssT) + } +} + +type myTestTypeEmitter3A struct { + Int int +} + +type myTestTypeEmitter3B struct { + String string +} + +func TestEmitter3(t *testing.T) { + Emitter3[typex.EventTime, int, string]() + if !exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, int, string))(nil)).Elem()) { + 
t.Fatalf("exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, int, string))(nil)).Elem()) = false, want true") + } + + Emitter3[typex.EventTime, myTestTypeEmitter3A, *myTestTypeEmitter3B]() + e3tsABT := reflect.TypeOf((*func(typex.EventTime, myTestTypeEmitter3A, *myTestTypeEmitter3B))(nil)).Elem() + if !exec.IsEmitterRegistered(e3tsABT) { + t.Fatalf("exec.IsEmitterRegistered(%v) = false, want true", e3tsABT) + } + tA := reflect.TypeOf((*myTestTypeEmitter3A)(nil)).Elem() + checkRegisterations(t, tA) + tB := reflect.TypeOf((*myTestTypeEmitter3B)(nil)).Elem() + checkRegisterations(t, tB) +} + +func TestEmit1(t *testing.T) { + e := &emit1[int]{n: &elementProcessor{}} + e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) + fn := e.Value().(func(int)) + fn(3) + if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) + } + if got := e.n.(*elementProcessor).inFV.Elm2; got != nil { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want nil", got) + } + if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.ZeroTimestamp; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) + } +} + +func TestEmit2(t *testing.T) { + e := &emit2[int, string]{n: &elementProcessor{}} + e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) + fn := e.Value().(func(int, string)) + fn(3, "hello") + if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) + } + if got, want := e.n.(*elementProcessor).inFV.Elm2, "hello"; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want %v", got, want) + } + if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.ZeroTimestamp; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) + } +} + +func TestEmit1WithTimestamp(t *testing.T) { + e := 
&emit1WithTimestamp[int]{n: &elementProcessor{}} + e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) + fn := e.Value().(func(typex.EventTime, int)) + fn(mtime.MaxTimestamp, 3) + if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) + } + if got := e.n.(*elementProcessor).inFV.Elm2; got != nil { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want nil", got) + } + if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.MaxTimestamp; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) + } +} + +func TestEmit2WithTimestamp(t *testing.T) { + e := &emit2WithTimestamp[int, string]{n: &elementProcessor{}} + e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) + fn := e.Value().(func(typex.EventTime, int, string)) + fn(mtime.MaxTimestamp, 3, "hello") + if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) + } + if got, want := e.n.(*elementProcessor).inFV.Elm2, "hello"; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want %v", got, want) + } + if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.MaxTimestamp; got != want { + t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) + } +} diff --git a/sdks/go/pkg/beam/register/iter.go b/sdks/go/pkg/beam/register/iter.go index 71d3f3df723f..2244a68b88af 100644 --- a/sdks/go/pkg/beam/register/iter.go +++ b/sdks/go/pkg/beam/register/iter.go @@ -20,7 +20,10 @@ import ( "io" "reflect" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/schema" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" ) type iter1[T any] struct { @@ -104,6 +107,16 @@ func (v *iter2[T1, T2]) invoke(key *T1, value *T2) 
bool { return true } +func registerType(t reflect.Type) { + // strip the pointer if present. + t = reflectx.SkipPtr(t) + if _, ok := runtime.TypeKey(t); !ok { + return + } + runtime.RegisterType(t) + schema.RegisterType(t) +} + // Iter1 registers parameters from your DoFn with a // signature func(*T) bool and optimizes their execution. // This must be done by passing in type parameters of all inputs as constraints, @@ -113,7 +126,9 @@ func Iter1[T any]() { registerFunc := func(s exec.ReStream) exec.ReusableInput { return &iter1[T]{s: s} } - exec.RegisterInput(reflect.TypeOf(i).Elem(), registerFunc) + itT := reflect.TypeOf(i).Elem() + registerType(itT.In(0).Elem()) + exec.RegisterInput(itT, registerFunc) } // Iter1 registers parameters from your DoFn with a @@ -125,5 +140,8 @@ func Iter2[T1, T2 any]() { registerFunc := func(s exec.ReStream) exec.ReusableInput { return &iter2[T1, T2]{s: s} } - exec.RegisterInput(reflect.TypeOf(i).Elem(), registerFunc) + itT := reflect.TypeOf(i).Elem() + registerType(itT.In(0).Elem()) + registerType(itT.In(1).Elem()) + exec.RegisterInput(itT, registerFunc) } diff --git a/sdks/go/pkg/beam/register/iter_test.go b/sdks/go/pkg/beam/register/iter_test.go new file mode 100644 index 000000000000..8994fac13799 --- /dev/null +++ b/sdks/go/pkg/beam/register/iter_test.go @@ -0,0 +1,200 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package register + +import ( + "reflect" + "testing" + + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/schema" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" +) + +type myTestTypeIter1 struct { + Int int +} + +func checkRegisterations(t *testing.T, ort reflect.Type) { + t.Helper() + // Strip pointers for the original type since type key doesn't support them. + // Pointer handling is done elsewhere. 
+ rt := reflectx.SkipPtr(ort) + key, ok := runtime.TypeKey(rt) + if !ok { + t.Fatalf("runtime.TypeKey(%v): no typekey for type", rt) + } + if _, ok := runtime.LookupType(key); !ok { + t.Errorf("want type %v to be available with key %q", rt, key) + } + if !schema.Registered(ort) { + t.Errorf("want type %v to be registered with schemas", ort) + } +} + +func TestIter1(t *testing.T) { + Iter1[int]() + itiT := reflect.TypeOf((*func(*int) bool)(nil)).Elem() + if !exec.IsInputRegistered(itiT) { + t.Fatalf("exec.IsInputRegistered(%v) = false, want true", itiT) + } + + Iter1[myTestTypeIter1]() + it1T := reflect.TypeOf((*func(*int) bool)(nil)).Elem() + if !exec.IsInputRegistered(it1T) { + t.Fatalf("exec.IsInputRegistered(%v) = false, want true", it1T) + } + + ttrt := reflect.TypeOf((*myTestTypeIter1)(nil)).Elem() + checkRegisterations(t, ttrt) +} + +type myTestTypeIter2A struct { + Int int +} + +type myTestTypeIter2B struct { + Int int +} + +func TestIter2(t *testing.T) { + Iter2[int, string]() + it2isT := reflect.TypeOf((*func(*int, *string) bool)(nil)).Elem() + if !exec.IsInputRegistered(it2isT) { + t.Fatalf("exec.IsInputRegistered(%v) = false, want true", it2isT) + } + + Iter2[myTestTypeIter2A, *myTestTypeIter2B]() + it2ABT := reflect.TypeOf((*func(*myTestTypeIter2A, **myTestTypeIter2B) bool)(nil)).Elem() + if !exec.IsInputRegistered(it2ABT) { + t.Fatalf("exec.IsInputRegistered(%v) = false, want true", it2ABT) + } + + ttArt := reflect.TypeOf((*myTestTypeIter2A)(nil)).Elem() + checkRegisterations(t, ttArt) + ttBrt := reflect.TypeOf((*myTestTypeIter2B)(nil)) + checkRegisterations(t, ttBrt) +} + +func TestIter1_Struct(t *testing.T) { + values := []exec.FullValue{{ + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: "one", + }, { + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: "two", + }, { + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: "three", + }} + + i := iter1[string]{s: 
&exec.FixedReStream{Buf: values}} + + i.Init() + fn := i.Value().(func(value *string) bool) + + var s string + if ok := fn(&s); !ok { + t.Fatalf("First i.Value()(&s)=false, want true") + } + if got, want := s, "one"; got != want { + t.Fatalf("First iter value = %v, want %v", got, want) + } + if ok := fn(&s); !ok { + t.Fatalf("Second i.Value()(&s)=false, want true") + } + if got, want := s, "two"; got != want { + t.Fatalf("Second iter value = %v, want %v", got, want) + } + if ok := fn(&s); !ok { + t.Fatalf("Third i.Value()(&s)=false, want true") + } + if got, want := s, "three"; got != want { + t.Fatalf("Third iter value = %v, want %v", got, want) + } + if ok := fn(&s); ok { + t.Fatalf("Fourth i.Value()(&s)=true, want false") + } + if err := i.Reset(); err != nil { + t.Fatalf("i.Reset()=%v, want nil", err) + } +} + +func TestIter2_Struct(t *testing.T) { + values := []exec.FullValue{{ + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: 1, + Elm2: "one", + }, { + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: 2, + Elm2: "two", + }, { + Windows: window.SingleGlobalWindow, + Timestamp: mtime.ZeroTimestamp, + Elm: 3, + Elm2: "three", + }} + + i := iter2[int, string]{s: &exec.FixedReStream{Buf: values}} + + i.Init() + fn := i.Value().(func(key *int, value *string) bool) + + var s string + var key int + if ok := fn(&key, &s); !ok { + t.Fatalf("First i.Value()(&s)=false, want true") + } + if got, want := key, 1; got != want { + t.Fatalf("First iter key = %v, want %v", got, want) + } + if got, want := s, "one"; got != want { + t.Fatalf("First iter value = %v, want %v", got, want) + } + if ok := fn(&key, &s); !ok { + t.Fatalf("Second i.Value()(&s)=false, want true") + } + if got, want := key, 2; got != want { + t.Fatalf("Second iter key = %v, want %v", got, want) + } + if got, want := s, "two"; got != want { + t.Fatalf("Second iter value = %v, want %v", got, want) + } + if ok := fn(&key, &s); !ok { + t.Fatalf("Third 
i.Value()(&s)=false, want true") + } + if got, want := key, 3; got != want { + t.Fatalf("Third iter key = %v, want %v", got, want) + } + if got, want := s, "three"; got != want { + t.Fatalf("Third iter value = %v, want %v", got, want) + } + if ok := fn(&key, &s); ok { + t.Fatalf("Fourth i.Value()(&s)=true, want false") + } + if err := i.Reset(); err != nil { + t.Fatalf("i.Reset()=%v, want nil", err) + } +} diff --git a/sdks/go/pkg/beam/register/register_test.go b/sdks/go/pkg/beam/register/register_test.go index 39962ab3c421..8cab02122e98 100644 --- a/sdks/go/pkg/beam/register/register_test.go +++ b/sdks/go/pkg/beam/register/register_test.go @@ -21,13 +21,10 @@ import ( "testing" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/exec" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/schema" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/util/reflectx" ) @@ -324,219 +321,6 @@ func TestCombiner_PartialCombiner2(t *testing.T) { } } -func TestEmitter1(t *testing.T) { - Emitter1[int]() - if !exec.IsEmitterRegistered(reflect.TypeOf((*func(int))(nil)).Elem()) { - t.Fatalf("exec.IsEmitterRegistered(reflect.TypeOf((*func(int))(nil)).Elem()) = false, want true") - } -} - -func TestEmitter2(t *testing.T) { - Emitter2[int, string]() - if !exec.IsEmitterRegistered(reflect.TypeOf((*func(int, string))(nil)).Elem()) { - t.Fatalf("exec.IsEmitterRegistered(reflect.TypeOf((*func(int, string))(nil)).Elem()) = false, want true") - } -} - -func TestEmitter2_WithTimestamp(t *testing.T) { - Emitter2[typex.EventTime, string]() - if !exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, 
string))(nil)).Elem()) { - t.Fatalf("exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, string))(nil)).Elem()) = false, want true") - } -} - -func TestEmitter3(t *testing.T) { - Emitter3[typex.EventTime, int, string]() - if !exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, int, string))(nil)).Elem()) { - t.Fatalf("exec.IsEmitterRegistered(reflect.TypeOf((*func(typex.EventTime, int, string))(nil)).Elem()) = false, want true") - } -} - -func TestEmit1(t *testing.T) { - e := &emit1[int]{n: &elementProcessor{}} - e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) - fn := e.Value().(func(int)) - fn(3) - if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) - } - if got := e.n.(*elementProcessor).inFV.Elm2; got != nil { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want nil", got) - } - if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.ZeroTimestamp; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) - } -} - -func TestEmit2(t *testing.T) { - e := &emit2[int, string]{n: &elementProcessor{}} - e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) - fn := e.Value().(func(int, string)) - fn(3, "hello") - if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) - } - if got, want := e.n.(*elementProcessor).inFV.Elm2, "hello"; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want %v", got, want) - } - if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.ZeroTimestamp; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) - } -} - -func TestEmit1WithTimestamp(t *testing.T) { - e := &emit1WithTimestamp[int]{n: &elementProcessor{}} - e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) - fn := e.Value().(func(typex.EventTime, 
int)) - fn(mtime.MaxTimestamp, 3) - if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) - } - if got := e.n.(*elementProcessor).inFV.Elm2; got != nil { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want nil", got) - } - if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.MaxTimestamp; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) - } -} - -func TestEmit2WithTimestamp(t *testing.T) { - e := &emit2WithTimestamp[int, string]{n: &elementProcessor{}} - e.Init(context.Background(), []typex.Window{}, mtime.ZeroTimestamp) - fn := e.Value().(func(typex.EventTime, int, string)) - fn(mtime.MaxTimestamp, 3, "hello") - if got, want := e.n.(*elementProcessor).inFV.Elm, 3; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm=%v, want %v", got, want) - } - if got, want := e.n.(*elementProcessor).inFV.Elm2, "hello"; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Elm2=%v, want %v", got, want) - } - if got, want := e.n.(*elementProcessor).inFV.Timestamp, mtime.MaxTimestamp; got != want { - t.Errorf("e.Value.(func(int))(3).n.inFV.Timestamp=%v, want %v", got, want) - } -} - -func TestIter1(t *testing.T) { - Iter1[int]() - if !exec.IsInputRegistered(reflect.TypeOf((*func(*int) bool)(nil)).Elem()) { - t.Fatalf("exec.IsInputRegistered(reflect.TypeOf(((*func(*int) bool)(nil)).Elem()) = false, want true") - } -} - -func TestIter2(t *testing.T) { - Iter2[int, string]() - if !exec.IsInputRegistered(reflect.TypeOf((*func(*int, *string) bool)(nil)).Elem()) { - t.Fatalf("exec.IsInputRegistered(reflect.TypeOf((*func(*int, *string) bool)(nil)).Elem()) = false, want true") - } -} - -func TestIter1_Struct(t *testing.T) { - values := []exec.FullValue{exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, - Elm: "one", - }, exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, 
- Elm: "two", - }, exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, - Elm: "three", - }} - - i := iter1[string]{s: &exec.FixedReStream{Buf: values}} - - i.Init() - fn := i.Value().(func(value *string) bool) - - var s string - if ok := fn(&s); !ok { - t.Fatalf("First i.Value()(&s)=false, want true") - } - if got, want := s, "one"; got != want { - t.Fatalf("First iter value = %v, want %v", got, want) - } - if ok := fn(&s); !ok { - t.Fatalf("Second i.Value()(&s)=false, want true") - } - if got, want := s, "two"; got != want { - t.Fatalf("Second iter value = %v, want %v", got, want) - } - if ok := fn(&s); !ok { - t.Fatalf("Third i.Value()(&s)=false, want true") - } - if got, want := s, "three"; got != want { - t.Fatalf("Third iter value = %v, want %v", got, want) - } - if ok := fn(&s); ok { - t.Fatalf("Fourth i.Value()(&s)=true, want false") - } - if err := i.Reset(); err != nil { - t.Fatalf("i.Reset()=%v, want nil", err) - } -} - -func TestIter2_Struct(t *testing.T) { - values := []exec.FullValue{exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, - Elm: 1, - Elm2: "one", - }, exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, - Elm: 2, - Elm2: "two", - }, exec.FullValue{ - Windows: window.SingleGlobalWindow, - Timestamp: mtime.ZeroTimestamp, - Elm: 3, - Elm2: "three", - }} - - i := iter2[int, string]{s: &exec.FixedReStream{Buf: values}} - - i.Init() - fn := i.Value().(func(key *int, value *string) bool) - - var s string - var key int - if ok := fn(&key, &s); !ok { - t.Fatalf("First i.Value()(&s)=false, want true") - } - if got, want := key, 1; got != want { - t.Fatalf("First iter key = %v, want %v", got, want) - } - if got, want := s, "one"; got != want { - t.Fatalf("First iter value = %v, want %v", got, want) - } - if ok := fn(&key, &s); !ok { - t.Fatalf("Second i.Value()(&s)=false, want true") - } - if got, want := key, 2; got != want { - t.Fatalf("Second 
iter key = %v, want %v", got, want) - } - if got, want := s, "two"; got != want { - t.Fatalf("Second iter value = %v, want %v", got, want) - } - if ok := fn(&key, &s); !ok { - t.Fatalf("Third i.Value()(&s)=false, want true") - } - if got, want := key, 3; got != want { - t.Fatalf("Third iter key = %v, want %v", got, want) - } - if got, want := s, "three"; got != want { - t.Fatalf("Third iter value = %v, want %v", got, want) - } - if ok := fn(&key, &s); ok { - t.Fatalf("Fourth i.Value()(&s)=true, want false") - } - if err := i.Reset(); err != nil { - t.Fatalf("i.Reset()=%v, want nil", err) - } -} - type CustomFunctionParameter struct { key string val int diff --git a/sdks/go/pkg/beam/transforms/xlang/python/external.go b/sdks/go/pkg/beam/transforms/xlang/python/external.go index 629ede0f9527..3fd6edd37e32 100644 --- a/sdks/go/pkg/beam/transforms/xlang/python/external.go +++ b/sdks/go/pkg/beam/transforms/xlang/python/external.go @@ -27,7 +27,8 @@ import ( ) const ( - pythonCallableUrn = "beam:logical_type:python_callable:v1" + pythonCallableUrn = "beam:logical_type:python_callable:v1" + // ExpansionServiceModule is the module containing the python expansion service for python external transforms. ExpansionServiceModule = "apache_beam.runners.portability.expansion_service_main" ) diff --git a/sdks/go/test/build.gradle b/sdks/go/test/build.gradle index 76acadb5db17..5d34f9c72c8a 100644 --- a/sdks/go/test/build.gradle +++ b/sdks/go/test/build.gradle @@ -104,7 +104,7 @@ task sparkValidatesRunner { dependsOn ":sdks:go:test:goBuild" dependsOn ":sdks:java:container:java8:docker" - dependsOn ":runners:spark:2:job-server:shadowJar" + dependsOn ":runners:spark:3:job-server:shadowJar" dependsOn ":sdks:java:testing:expansion-service:buildTestExpansionServiceJar" doLast { def pipelineOptions = [ // Pipeline options piped directly to Go SDK flags. 
@@ -112,7 +112,7 @@ task sparkValidatesRunner { ] def options = [ "--runner spark", - "--spark_job_server_jar ${project(":runners:spark:2:job-server").shadowJar.archivePath}", + "--spark_job_server_jar ${project(":runners:spark:3:job-server").shadowJar.archivePath}", "--pipeline_opts \"${pipelineOptions.join(' ')}\"", ] exec { diff --git a/sdks/go/test/regression/coders/fromyaml/fromyaml.go b/sdks/go/test/regression/coders/fromyaml/fromyaml.go index 86d7969c2a08..5fddc6226dd5 100644 --- a/sdks/go/test/regression/coders/fromyaml/fromyaml.go +++ b/sdks/go/test/regression/coders/fromyaml/fromyaml.go @@ -53,6 +53,7 @@ var filteredCases = []struct{ filter, reason string }{ {"30ea5a25-dcd8-4cdb-abeb-5332d15ab4b9", "https://github.com/apache/beam/issues/21206: Support encoding position."}, {"80be749a-5700-4ede-89d8-dd9a4433a3f8", "https://github.com/apache/beam/issues/19817: Support millis_instant."}, {"800c44ae-a1b7-4def-bbf6-6217cca89ec4", "https://github.com/apache/beam/issues/19817: Support decimal."}, + {"f0ffb3a4-f46f-41ca-a942-85e3e939452a", "https://github.com/apache/beam/issues/23526: Support char/varchar, binary/varbinary."}, } // Coder is a representation a serialized beam coder. 
diff --git a/sdks/java/container/agent/build.gradle b/sdks/java/container/agent/build.gradle index 9d86fd430a6a..df3780e45446 100644 --- a/sdks/java/container/agent/build.gradle +++ b/sdks/java/container/agent/build.gradle @@ -19,6 +19,13 @@ plugins { id 'org.apache.beam.module' } + +if (project.hasProperty('java11Home')) { + javaVersion = "1.11" +} else if (project.hasProperty('java17Home')) { + javaVersion = "1.17" +} + applyJavaNature( exportJavadoc: false, publish: false @@ -35,9 +42,7 @@ jar { } } - if (project.hasProperty('java11Home')) { - javaVersion = "1.11" def java11Home = project.findProperty('java11Home') project.tasks.withType(JavaCompile) { options.fork = true @@ -45,7 +50,6 @@ if (project.hasProperty('java11Home')) { options.compilerArgs += ['-Xlint:-path'] } } else if (project.hasProperty('java17Home')) { - javaVersion = "1.17" project.tasks.withType(JavaCompile) { setJava17Options(options) diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/GenerateSequence.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/GenerateSequence.java index 78ac1777e95f..742a75960749 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/GenerateSequence.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/GenerateSequence.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.io; +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; import com.google.auto.service.AutoService; @@ -33,6 +34,7 @@ import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; import org.checkerframework.checker.nullness.qual.Nullable; +import org.checkerframework.dataflow.qual.Pure; import org.joda.time.Duration; import org.joda.time.Instant; @@ -70,22 +72,26 @@ * will be present in the resulting {@link PCollection}. 
*/ @AutoValue -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) public abstract class GenerateSequence extends PTransform> { + @Pure abstract long getFrom(); + @Pure abstract long getTo(); + @Pure abstract @Nullable SerializableFunction getTimestampFn(); + @Pure abstract long getElementsPerPeriod(); + @Pure abstract @Nullable Duration getPeriod(); + @Pure abstract @Nullable Duration getMaxReadTime(); + @Pure abstract Builder toBuilder(); @AutoValue.Builder @@ -97,14 +103,15 @@ abstract static class Builder abstract Builder setTo(long to); - abstract Builder setTimestampFn(SerializableFunction timestampFn); + abstract Builder setTimestampFn(@Nullable SerializableFunction timestampFn); abstract Builder setElementsPerPeriod(long elementsPerPeriod); - abstract Builder setPeriod(Duration period); + abstract Builder setPeriod(@Nullable Duration period); - abstract Builder setMaxReadTime(Duration maxReadTime); + abstract Builder setMaxReadTime(@Nullable Duration maxReadTime); + @Pure abstract GenerateSequence build(); @Override @@ -144,7 +151,7 @@ public static class External implements ExternalTransformRegistrar { /** Parameters class to expose the transform to an external SDK. 
*/ @Experimental public static class ExternalConfiguration { - private Long start; + private Long start = 0L; private @Nullable Long stop; private @Nullable Long period; private @Nullable Long maxReadTime; @@ -223,8 +230,13 @@ public PCollection expand(PBegin input) { if (getTimestampFn() != null) { source = source.withTimestampFn(getTimestampFn()); } - if (getElementsPerPeriod() > 0) { - source = source.withRate(getElementsPerPeriod(), getPeriod()); + if (getPeriod() != null || getElementsPerPeriod() > 0) { + Duration period = + checkArgumentNotNull( + getPeriod(), "elements per period specified, but no period specified"); + checkArgument( + getElementsPerPeriod() > 0, "elements per period not specified, but period specified"); + source = source.withRate(getElementsPerPeriod(), period); } Read.Unbounded readUnbounded = Read.from(source); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java index f38e50aea6f4..f79db31bf7ec 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/SchemaTranslation.java @@ -44,11 +44,15 @@ import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.Schema.LogicalType; import org.apache.beam.sdk.schemas.Schema.TypeName; +import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; import org.apache.beam.sdk.schemas.logicaltypes.FixedPrecisionNumeric; +import org.apache.beam.sdk.schemas.logicaltypes.FixedString; import org.apache.beam.sdk.schemas.logicaltypes.MicrosInstant; import org.apache.beam.sdk.schemas.logicaltypes.PythonCallable; import org.apache.beam.sdk.schemas.logicaltypes.SchemaLogicalType; import org.apache.beam.sdk.schemas.logicaltypes.UnknownLogicalType; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import 
org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.util.SerializableUtils; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.grpc.v1p48p1.com.google.protobuf.ByteString; @@ -57,6 +61,7 @@ import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.io.ByteStreams; +import org.apache.commons.lang3.ClassUtils; import org.checkerframework.checker.nullness.qual.Nullable; /** Utility methods for translating schemas. */ @@ -85,6 +90,10 @@ public class SchemaTranslation { .put(MicrosInstant.IDENTIFIER, MicrosInstant.class) .put(SchemaLogicalType.IDENTIFIER, SchemaLogicalType.class) .put(PythonCallable.IDENTIFIER, PythonCallable.class) + .put(FixedBytes.IDENTIFIER, FixedBytes.class) + .put(VariableBytes.IDENTIFIER, VariableBytes.class) + .put(FixedString.IDENTIFIER, FixedString.class) + .put(VariableString.IDENTIFIER, VariableString.class) .build(); public static SchemaApi.Schema schemaToProto(Schema schema, boolean serializeLogicalType) { @@ -350,7 +359,10 @@ private static FieldType fieldTypeFromProtoWithoutNullable(SchemaApi.FieldType p Object fieldValue = Objects.requireNonNull(fieldValueFromProto(fieldType, logicalType.getArgument())); Class clazz = fieldValue.getClass(); - if (fieldValue instanceof List) { + if (ClassUtils.isPrimitiveWrapper(clazz)) { + // argument is a primitive wrapper type (e.g. 
Integer) + clazz = ClassUtils.wrapperToPrimitive(clazz); + } else if (fieldValue instanceof List) { // argument is ArrayValue or iterableValue clazz = List.class; } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedBytes.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedBytes.java index 4022c634acdf..886f0851c494 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedBytes.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedBytes.java @@ -20,67 +20,68 @@ import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; import java.util.Arrays; -import org.apache.beam.sdk.annotations.Experimental; -import org.apache.beam.sdk.annotations.Experimental.Kind; +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.model.pipeline.v1.SchemaApi; import org.apache.beam.sdk.schemas.Schema.FieldType; -import org.apache.beam.sdk.schemas.Schema.LogicalType; +import org.checkerframework.checker.nullness.qual.Nullable; -/** A LogicalType representing a fixed-size byte array. */ -@Experimental(Kind.SCHEMAS) -public class FixedBytes implements LogicalType { - public static final String IDENTIFIER = "FixedBytes"; - private final int byteArraySize; +/** A LogicalType representing a fixed-length byte array. */ +public class FixedBytes extends PassThroughLogicalType { + public static final String IDENTIFIER = + SchemaApi.LogicalTypes.Enum.FIXED_BYTES + .getValueDescriptor() + .getOptions() + .getExtension(RunnerApi.beamUrn); - private FixedBytes(int byteArraySize) { - this.byteArraySize = byteArraySize; - } + private final @Nullable String name; + private final int byteArrayLength; - public static FixedBytes of(int byteArraySize) { - return new FixedBytes(byteArraySize); + /** + * Return an instance of FixedBytes with specified byte array length. + * + *

The name, if set, refers to the TYPE name in the underlying database, for example, BINARY. + */ + public static FixedBytes of(@Nullable String name, int byteArrayLength) { + return new FixedBytes(name, byteArrayLength); } - public int getLength() { - return byteArraySize; + /** Return an instance of FixedBytes with specified byte array length. */ + public static FixedBytes of(int byteArrayLength) { + return of(null, byteArrayLength); } - @Override - public String getIdentifier() { - return IDENTIFIER; + private FixedBytes(@Nullable String name, int byteArrayLength) { + super(IDENTIFIER, FieldType.INT32, byteArrayLength, FieldType.BYTES); + this.name = name; + this.byteArrayLength = byteArrayLength; } - @Override - public FieldType getArgumentType() { - return FieldType.INT32; - } - - @Override - public Integer getArgument() { - return byteArraySize; + public int getLength() { + return byteArrayLength; } - @Override - public FieldType getBaseType() { - return FieldType.BYTES; + public @Nullable String getName() { + return name; } @Override public byte[] toBaseType(byte[] input) { - checkArgument(input.length == byteArraySize); + checkArgument(input.length == byteArrayLength); return input; } @Override public byte[] toInputType(byte[] base) { - checkArgument(base.length <= byteArraySize); - if (base.length == byteArraySize) { + checkArgument(base.length <= byteArrayLength); + if (base.length == byteArrayLength) { return base; } else { - return Arrays.copyOf(base, byteArraySize); + return Arrays.copyOf(base, byteArrayLength); } } @Override public String toString() { - return "FixedBytes: " + byteArraySize; + return "FixedBytes: " + byteArrayLength; } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedString.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedString.java new file mode 100644 index 000000000000..72dd97fae837 --- /dev/null +++ 
b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/FixedString.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.logicaltypes; + +import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; + +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.model.pipeline.v1.SchemaApi; +import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.commons.lang3.StringUtils; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** A LogicalType representing a fixed-length string. */ +public class FixedString extends PassThroughLogicalType { + public static final String IDENTIFIER = + SchemaApi.LogicalTypes.Enum.FIXED_CHAR + .getValueDescriptor() + .getOptions() + .getExtension(RunnerApi.beamUrn); + private final @Nullable String name; + private final int stringLength; + + /** + * Return an instance of FixedString with specified string length. + * + *

The name, if set, refers to the TYPE name in the underlying database, for example, CHAR. + */ + public static FixedString of(@Nullable String name, int stringLength) { + return new FixedString(name, stringLength); + } + + /** Return an instance of FixedString with specified string length. */ + public static FixedString of(int stringLength) { + return new FixedString(null, stringLength); + } + + private FixedString(@Nullable String name, int stringLength) { + super(IDENTIFIER, FieldType.INT32, stringLength, FieldType.STRING); + this.name = name; + this.stringLength = stringLength; + } + + public int getLength() { + return stringLength; + } + + public @Nullable String getName() { + return name; + } + + @Override + public String toInputType(String base) { + checkArgument(base.length() <= stringLength); + + return StringUtils.rightPad(base, stringLength); + } + + @Override + public String toString() { + return "FixedString: " + stringLength; + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableBytes.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableBytes.java new file mode 100644 index 000000000000..4c1cb87f8f9c --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableBytes.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.logicaltypes; + +import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; + +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.model.pipeline.v1.SchemaApi; +import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** A LogicalType representing a variable-length byte array with specified maximum length. */ +public class VariableBytes extends PassThroughLogicalType { + public static final String IDENTIFIER = + SchemaApi.LogicalTypes.Enum.VAR_BYTES + .getValueDescriptor() + .getOptions() + .getExtension(RunnerApi.beamUrn); + private final @Nullable String name; + private final int maxByteArrayLength; + + /** + * Return an instance of VariableBytes with specified max byte array length. + * + *

The name, if set, refers to the TYPE name in the underlying database, for example, VARBINARY + * and LONGVARBINARY. + */ + public static VariableBytes of(@Nullable String name, int maxByteArrayLength) { + return new VariableBytes(name, maxByteArrayLength); + } + + /** Return an instance of VariableBytes with specified max byte array length. */ + public static VariableBytes of(int maxByteArrayLength) { + return of(null, maxByteArrayLength); + } + + private VariableBytes(@Nullable String name, int maxByteArrayLength) { + super(IDENTIFIER, FieldType.INT32, maxByteArrayLength, FieldType.BYTES); + this.name = name; + this.maxByteArrayLength = maxByteArrayLength; + } + + public int getMaxLength() { + return maxByteArrayLength; + } + + public @Nullable String getName() { + return name; + } + + @Override + public byte[] toInputType(byte[] base) { + checkArgument(base.length <= maxByteArrayLength); + return base; + } + + @Override + public String toString() { + return "VariableBytes: " + maxByteArrayLength; + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableString.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableString.java new file mode 100644 index 000000000000..c635e70a625a --- /dev/null +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/logicaltypes/VariableString.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.schemas.logicaltypes; + +import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; + +import org.apache.beam.model.pipeline.v1.RunnerApi; +import org.apache.beam.model.pipeline.v1.SchemaApi; +import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.checkerframework.checker.nullness.qual.Nullable; + +/** A LogicalType representing a variable-length string with specified maximum length. */ +public class VariableString extends PassThroughLogicalType { + public static final String IDENTIFIER = + SchemaApi.LogicalTypes.Enum.VAR_CHAR + .getValueDescriptor() + .getOptions() + .getExtension(RunnerApi.beamUrn); + private final @Nullable String name; + private final int maxStringLength; + + /** + * Return an instance of VariableString with specified max string length. + * + *

The name, if set, refers to the TYPE name in the underlying database, for example, VARCHAR + * and LONGVARCHAR. + */ + public static VariableString of(@Nullable String name, int maxStringLength) { + return new VariableString(name, maxStringLength); + } + + /** Return an instance of VariableString with specified max string length. */ + public static VariableString of(int maxStringLength) { + return of(null, maxStringLength); + } + + private VariableString(@Nullable String name, int maxStringLength) { + super(IDENTIFIER, FieldType.INT32, maxStringLength, FieldType.STRING); + this.name = name; + this.maxStringLength = maxStringLength; + } + + public int getMaxLength() { + return maxStringLength; + } + + public @Nullable String getName() { + return name; + } + + @Override + public String toInputType(String base) { + checkArgument(base.length() <= maxStringLength); + return base; + } + + @Override + public String toString() { + return "VariableString: " + maxStringLength; + } +} diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java index 8636e31300ce..8d01ed0406a0 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/schemas/utils/AvroUtils.java @@ -74,7 +74,10 @@ import org.apache.beam.sdk.schemas.SchemaUserTypeCreator; import org.apache.beam.sdk.schemas.logicaltypes.EnumerationType; import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; +import org.apache.beam.sdk.schemas.logicaltypes.FixedString; import org.apache.beam.sdk.schemas.logicaltypes.OneOfType; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertType; import org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertValueForGetter; import 
org.apache.beam.sdk.schemas.utils.ByteBuddyUtils.ConvertValueForSetter; @@ -899,48 +902,44 @@ private static org.apache.avro.Schema getFieldSchema( break; case LOGICAL_TYPE: - switch (fieldType.getLogicalType().getIdentifier()) { - case FixedBytes.IDENTIFIER: - FixedBytesField fixedBytesField = - checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); - baseType = fixedBytesField.toAvroType("fixed", namespace + "." + fieldName); - break; - case EnumerationType.IDENTIFIER: - EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); - baseType = - org.apache.avro.Schema.createEnum(fieldName, "", "", enumerationType.getValues()); - break; - case OneOfType.IDENTIFIER: - OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); - baseType = - org.apache.avro.Schema.createUnion( - oneOfType.getOneOfSchema().getFields().stream() - .map(x -> getFieldSchema(x.getType(), x.getName(), namespace)) - .collect(Collectors.toList())); - break; - case "CHAR": - case "NCHAR": - baseType = - buildHiveLogicalTypeSchema("char", (int) fieldType.getLogicalType().getArgument()); - break; - case "NVARCHAR": - case "VARCHAR": - case "LONGNVARCHAR": - case "LONGVARCHAR": - baseType = - buildHiveLogicalTypeSchema( - "varchar", (int) fieldType.getLogicalType().getArgument()); - break; - case "DATE": - baseType = LogicalTypes.date().addToSchema(org.apache.avro.Schema.create(Type.INT)); - break; - case "TIME": - baseType = - LogicalTypes.timeMillis().addToSchema(org.apache.avro.Schema.create(Type.INT)); - break; - default: - throw new RuntimeException( - "Unhandled logical type " + fieldType.getLogicalType().getIdentifier()); + String identifier = fieldType.getLogicalType().getIdentifier(); + if (FixedBytes.IDENTIFIER.equals(identifier)) { + FixedBytesField fixedBytesField = + checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); + baseType = fixedBytesField.toAvroType("fixed", namespace + "." 
+ fieldName); + } else if (VariableBytes.IDENTIFIER.equals(identifier)) { + // treat VARBINARY as bytes as that is what avro supports + baseType = org.apache.avro.Schema.create(Type.BYTES); + } else if (FixedString.IDENTIFIER.equals(identifier) + || "CHAR".equals(identifier) + || "NCHAR".equals(identifier)) { + baseType = + buildHiveLogicalTypeSchema("char", (int) fieldType.getLogicalType().getArgument()); + } else if (VariableString.IDENTIFIER.equals(identifier) + || "NVARCHAR".equals(identifier) + || "VARCHAR".equals(identifier) + || "LONGNVARCHAR".equals(identifier) + || "LONGVARCHAR".equals(identifier)) { + baseType = + buildHiveLogicalTypeSchema("varchar", (int) fieldType.getLogicalType().getArgument()); + } else if (EnumerationType.IDENTIFIER.equals(identifier)) { + EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); + baseType = + org.apache.avro.Schema.createEnum(fieldName, "", "", enumerationType.getValues()); + } else if (OneOfType.IDENTIFIER.equals(identifier)) { + OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); + baseType = + org.apache.avro.Schema.createUnion( + oneOfType.getOneOfSchema().getFields().stream() + .map(x -> getFieldSchema(x.getType(), x.getName(), namespace)) + .collect(Collectors.toList())); + } else if ("DATE".equals(identifier)) { + baseType = LogicalTypes.date().addToSchema(org.apache.avro.Schema.create(Type.INT)); + } else if ("TIME".equals(identifier)) { + baseType = LogicalTypes.timeMillis().addToSchema(org.apache.avro.Schema.create(Type.INT)); + } else { + throw new RuntimeException( + "Unhandled logical type " + fieldType.getLogicalType().getIdentifier()); } break; @@ -1022,45 +1021,51 @@ private static org.apache.avro.Schema getFieldSchema( return ByteBuffer.wrap((byte[]) value); case LOGICAL_TYPE: - switch (fieldType.getLogicalType().getIdentifier()) { - case FixedBytes.IDENTIFIER: - FixedBytesField fixedBytesField = - checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); - 
byte[] byteArray = (byte[]) value; - if (byteArray.length != fixedBytesField.getSize()) { - throw new IllegalArgumentException("Incorrectly sized byte array."); - } - return GenericData.get().createFixed(null, (byte[]) value, typeWithNullability.type); - case EnumerationType.IDENTIFIER: - EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); - return GenericData.get() - .createEnum( - enumerationType.toString((EnumerationType.Value) value), - typeWithNullability.type); - case OneOfType.IDENTIFIER: - OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); - OneOfType.Value oneOfValue = (OneOfType.Value) value; - FieldType innerFieldType = oneOfType.getFieldType(oneOfValue); - if (typeWithNullability.nullable && oneOfValue.getValue() == null) { - return null; - } else { - return genericFromBeamField( - innerFieldType.withNullable(false), - typeWithNullability.type.getTypes().get(oneOfValue.getCaseType().getValue()), - oneOfValue.getValue()); - } - case "NVARCHAR": - case "VARCHAR": - case "LONGNVARCHAR": - case "LONGVARCHAR": - return new Utf8((String) value); - case "DATE": - return Days.daysBetween(Instant.EPOCH, (Instant) value).getDays(); - case "TIME": - return (int) ((Instant) value).getMillis(); - default: - throw new RuntimeException( - "Unhandled logical type " + fieldType.getLogicalType().getIdentifier()); + String identifier = fieldType.getLogicalType().getIdentifier(); + if (FixedBytes.IDENTIFIER.equals(identifier)) { + FixedBytesField fixedBytesField = + checkNotNull(FixedBytesField.fromBeamFieldType(fieldType)); + byte[] byteArray = (byte[]) value; + if (byteArray.length != fixedBytesField.getSize()) { + throw new IllegalArgumentException("Incorrectly sized byte array."); + } + return GenericData.get().createFixed(null, (byte[]) value, typeWithNullability.type); + } else if (VariableBytes.IDENTIFIER.equals(identifier)) { + return GenericData.get().createFixed(null, (byte[]) value, typeWithNullability.type); + } else 
if (FixedString.IDENTIFIER.equals(identifier) + || "CHAR".equals(identifier) + || "NCHAR".equals(identifier)) { + return new Utf8((String) value); + } else if (VariableString.IDENTIFIER.equals(identifier) + || "NVARCHAR".equals(identifier) + || "VARCHAR".equals(identifier) + || "LONGNVARCHAR".equals(identifier) + || "LONGVARCHAR".equals(identifier)) { + return new Utf8((String) value); + } else if (EnumerationType.IDENTIFIER.equals(identifier)) { + EnumerationType enumerationType = fieldType.getLogicalType(EnumerationType.class); + return GenericData.get() + .createEnum( + enumerationType.toString((EnumerationType.Value) value), + typeWithNullability.type); + } else if (OneOfType.IDENTIFIER.equals(identifier)) { + OneOfType oneOfType = fieldType.getLogicalType(OneOfType.class); + OneOfType.Value oneOfValue = (OneOfType.Value) value; + FieldType innerFieldType = oneOfType.getFieldType(oneOfValue); + if (typeWithNullability.nullable && oneOfValue.getValue() == null) { + return null; + } else { + return genericFromBeamField( + innerFieldType.withNullable(false), + typeWithNullability.type.getTypes().get(oneOfValue.getCaseType().getValue()), + oneOfValue.getValue()); + } + } else if ("DATE".equals(identifier)) { + return Days.daysBetween(Instant.EPOCH, (Instant) value).getDays(); + } else if ("TIME".equals(identifier)) { + return (int) ((Instant) value).getMillis(); + } else { + throw new RuntimeException("Unhandled logical type " + identifier); } case ARRAY: diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/TestPipelineOptions.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/TestPipelineOptions.java index 3327ae8fc747..6ff5ded5318d 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/TestPipelineOptions.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/testing/TestPipelineOptions.java @@ -48,7 +48,7 @@ public interface TestPipelineOptions extends PipelineOptions { void 
setOnSuccessMatcher(SerializableMatcher value); - @Default.Long(10 * 60) + @Default.Long(15 * 60) @Nullable Long getTestTimeoutSeconds(); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/MoreFutures.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/MoreFutures.java index 6f053752d3f6..57074a7d6db4 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/MoreFutures.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/MoreFutures.java @@ -19,6 +19,7 @@ import com.google.auto.value.AutoValue; import edu.umd.cs.findbugs.annotations.SuppressWarnings; +import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.concurrent.CompletableFuture; @@ -161,17 +162,13 @@ public static CompletionStage runAsync(ThrowingRunnable runnable) { /** Like {@link CompletableFuture#allOf} but returning the result of constituent futures. */ public static CompletionStage> allAsList( Collection> futures) { - // CompletableFuture.allOf completes exceptionally if any of the futures do. // We have to gather the results separately. - CompletionStage blockAndDiscard = - CompletableFuture.allOf(futuresToCompletableFutures(futures)); + CompletableFuture[] f = futuresToCompletableFutures(futures); + CompletionStage blockAndDiscard = CompletableFuture.allOf(f); return blockAndDiscard.thenApply( - nothing -> - futures.stream() - .map(future -> future.toCompletableFuture().join()) - .collect(Collectors.toList())); + nothing -> Arrays.stream(f).map(CompletableFuture::join).collect(Collectors.toList())); } /** @@ -207,25 +204,25 @@ public static ExceptionOrResult result(T result) { } } - /** Like {@link #allAsList} but return a list . */ + /** + * Like {@link #allAsList} but return a list of {@link ExceptionOrResult} of constituent futures. + */ public static CompletionStage>> allAsListWithExceptions( Collection> futures) { - // CompletableFuture.allOf completes exceptionally if any of the futures do. 
// We have to gather the results separately. - CompletionStage blockAndDiscard = - CompletableFuture.allOf(futuresToCompletableFutures(futures)) - .whenComplete((ignoredValues, arbitraryException) -> {}); + CompletableFuture[] f = futuresToCompletableFutures(futures); + CompletionStage blockAndDiscard = CompletableFuture.allOf(f); return blockAndDiscard.thenApply( nothing -> - futures.stream() + Arrays.stream(f) .map( future -> { // The limited scope of the exceptions wrapped allows CancellationException // to still be thrown. try { - return ExceptionOrResult.result(future.toCompletableFuture().join()); + return ExceptionOrResult.result(future.join()); } catch (CompletionException exc) { return ExceptionOrResult.exception(exc); } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java index 2de9096d2b46..23ebcf1616b0 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/schemas/logicaltypes/LogicalTypesTest.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.schemas.logicaltypes; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThrows; @@ -178,4 +179,56 @@ public void testFixedPrecisionNumeric() { row = Row.withSchema(schema).addValues(decimal).build(); assertEquals(decimal, row.getLogicalTypeValue(0, BigDecimal.class)); } + + @Test + public void testFixedBytes() { + FixedBytes fixedBytes = FixedBytes.of(5); + + // check argument valid case, with padding + byte[] resultBytes = fixedBytes.toInputType(new byte[] {0x1, 0x2, 0x3}); + assertArrayEquals(new byte[] {0x1, 0x2, 0x3, 0x0, 0x0}, resultBytes); + + // check argument invalid case + assertThrows( + IllegalArgumentException.class, + () -> fixedBytes.toInputType(new byte[] {0x1, 0x2, 
0x3, 0x4, 0x5, 0x6})); + } + + @Test + public void testVariableBytes() { + VariableBytes variableBytes = VariableBytes.of(5); + + // check argument valid case, no padding + byte[] resultBytes = variableBytes.toInputType(new byte[] {0x1, 0x2, 0x3}); + assertArrayEquals(new byte[] {0x1, 0x2, 0x3}, resultBytes); + + // check argument invalid case + assertThrows( + IllegalArgumentException.class, + () -> variableBytes.toInputType(new byte[] {0x1, 0x2, 0x3, 0x4, 0x5, 0x6})); + } + + @Test + public void testFixedString() { + FixedString fixedString = FixedString.of(5); + + // check argument valid case, with padding + String resultString = fixedString.toInputType("123"); + assertEquals("123 ", resultString); + + // check argument invalid case + assertThrows(IllegalArgumentException.class, () -> fixedString.toInputType("123456")); + } + + @Test + public void testVariableString() { + VariableString varibaleString = VariableString.of(5); + + // check argument valid case, no padding + String resultString = varibaleString.toInputType("123"); + assertEquals("123", resultString); + + // check argument invalid case + assertThrows(IllegalArgumentException.class, () -> varibaleString.toInputType("123456")); + } } diff --git a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/MoreFuturesTest.java b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/MoreFuturesTest.java index 4b6790d22c30..b8a107935016 100644 --- a/sdks/java/core/src/test/java/org/apache/beam/sdk/util/MoreFuturesTest.java +++ b/sdks/java/core/src/test/java/org/apache/beam/sdk/util/MoreFuturesTest.java @@ -20,10 +20,16 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.isA; +import static org.junit.Assert.assertEquals; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.concurrent.CompletionStage; +import java.util.concurrent.CountDownLatch; import 
java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicInteger; +import org.apache.beam.sdk.util.MoreFutures.ExceptionOrResult; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; @@ -84,4 +90,72 @@ public void runAsyncFailure() throws Exception { thrown.expectMessage(testMessage); MoreFutures.get(sideEffectFuture); } + + @Test + public void testAllAsListRespectsOriginalList() throws Exception { + CountDownLatch waitTillThreadRunning = new CountDownLatch(1); + CountDownLatch waitTillClearHasHappened = new CountDownLatch(1); + List> stages = new ArrayList<>(); + stages.add(MoreFutures.runAsync(waitTillThreadRunning::countDown)); + stages.add(MoreFutures.runAsync(waitTillClearHasHappened::await)); + + CompletionStage> results = MoreFutures.allAsList(stages); + waitTillThreadRunning.await(); + stages.clear(); + waitTillClearHasHappened.countDown(); + assertEquals(MoreFutures.get(results), Arrays.asList(null, null)); + } + + @Test + public void testAllAsListNoExceptionDueToMutation() throws Exception { + // This loop runs many times trying to exercise a race condition that existed where mutation + // of the passed in completion stages lead to various exceptions (such as a + // ConcurrentModificationException). 
See https://github.com/apache/beam/issues/23809 + for (int i = 0; i < 10000; ++i) { + CountDownLatch waitTillThreadRunning = new CountDownLatch(1); + List> stages = new ArrayList<>(); + stages.add(MoreFutures.runAsync(waitTillThreadRunning::countDown)); + + CompletionStage> results = MoreFutures.allAsList(stages); + waitTillThreadRunning.await(); + stages.clear(); + MoreFutures.get(results); + } + } + + @Test + public void testAllAsListWithExceptionsRespectsOriginalList() throws Exception { + CountDownLatch waitTillThreadRunning = new CountDownLatch(1); + CountDownLatch waitTillClearHasHappened = new CountDownLatch(1); + List> stages = new ArrayList<>(); + stages.add(MoreFutures.runAsync(waitTillThreadRunning::countDown)); + stages.add(MoreFutures.runAsync(waitTillClearHasHappened::await)); + + CompletionStage>> results = + MoreFutures.allAsListWithExceptions(stages); + waitTillThreadRunning.await(); + stages.clear(); + waitTillClearHasHappened.countDown(); + assertEquals( + MoreFutures.get(results), + Arrays.asList(ExceptionOrResult.result(null), ExceptionOrResult.result(null))); + } + + @Test + public void testAllAsListWithExceptionsNoExceptionDueToMutation() throws Exception { + // This loop runs many times trying to exercise a race condition that existed where mutation + // of the passed in completion stages lead to various exceptions (such as a + // ConcurrentModificationException). 
See https://github.com/apache/beam/issues/23809 + for (int i = 0; i < 10000; ++i) { + CountDownLatch waitTillThreadRunning = new CountDownLatch(1); + List> stages = new ArrayList<>(); + stages.add(MoreFutures.runAsync(waitTillThreadRunning::countDown)); + + CompletionStage>> results = + MoreFutures.allAsListWithExceptions(stages); + waitTillThreadRunning.await(); + stages.clear(); + MoreFutures.get(results); + } + } } diff --git a/sdks/java/extensions/sorter/src/main/java/org/apache/beam/sdk/extensions/sorter/SortValues.java b/sdks/java/extensions/sorter/src/main/java/org/apache/beam/sdk/extensions/sorter/SortValues.java index 5c489af6e6b4..bc9fb2f89554 100644 --- a/sdks/java/extensions/sorter/src/main/java/org/apache/beam/sdk/extensions/sorter/SortValues.java +++ b/sdks/java/extensions/sorter/src/main/java/org/apache/beam/sdk/extensions/sorter/SortValues.java @@ -76,13 +76,20 @@ SortValues create( @Override public PCollection>>> expand( PCollection>>> input) { + + Coder secondaryKeyCoder = getSecondaryKeyCoder(input.getCoder()); + try { + secondaryKeyCoder.verifyDeterministic(); + } catch (Coder.NonDeterministicException e) { + throw new IllegalStateException( + "the secondary key coder of SortValues must be deterministic", e); + } + return input .apply( ParDo.of( new SortValuesDoFn<>( - sorterOptions, - getSecondaryKeyCoder(input.getCoder()), - getValueCoder(input.getCoder())))) + sorterOptions, secondaryKeyCoder, getValueCoder(input.getCoder())))) .setCoder(input.getCoder()); } diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java index 804715f4362a..368bf8dc4645 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/BeamCalcRel.java @@ -53,7 +53,11 @@ import 
org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils.TimeWithLocalTzType; import org.apache.beam.sdk.schemas.FieldAccessDescriptor; import org.apache.beam.sdk.schemas.Schema; +import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; +import org.apache.beam.sdk.schemas.logicaltypes.FixedString; import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.schemas.utils.SelectHelpers; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; @@ -404,8 +408,13 @@ static Object toBeamObject(Object value, FieldType fieldType, boolean verifyValu return toBeamRow((List) value, fieldType.getRowSchema(), verifyValues); case LOGICAL_TYPE: String identifier = fieldType.getLogicalType().getIdentifier(); - if (CharType.IDENTIFIER.equals(identifier)) { + if (CharType.IDENTIFIER.equals(identifier) + || FixedString.IDENTIFIER.equals(identifier) + || VariableString.IDENTIFIER.equals(identifier)) { return (String) value; + } else if (FixedBytes.IDENTIFIER.equals(identifier) + || VariableBytes.IDENTIFIER.equals(identifier)) { + return (byte[]) value; } else if (TimeWithLocalTzType.IDENTIFIER.equals(identifier)) { return Instant.ofEpochMilli(((Number) value).longValue()); } else if (SqlTypes.DATE.getIdentifier().equals(identifier)) { @@ -552,8 +561,13 @@ private static Expression getBeamField( break; case LOGICAL_TYPE: String identifier = fieldType.getLogicalType().getIdentifier(); - if (CharType.IDENTIFIER.equals(identifier)) { + if (CharType.IDENTIFIER.equals(identifier) + || FixedString.IDENTIFIER.equals(identifier) + || VariableString.IDENTIFIER.equals(identifier)) { value = Expressions.call(expression, "getString", fieldName); + } else if (FixedBytes.IDENTIFIER.equals(identifier) + || VariableBytes.IDENTIFIER.equals(identifier)) { + value = Expressions.call(expression, "getBytes", 
fieldName); } else if (TimeWithLocalTzType.IDENTIFIER.equals(identifier)) { value = Expressions.call(expression, "getDateTime", fieldName); } else if (SqlTypes.DATE.getIdentifier().equals(identifier)) { @@ -629,8 +643,13 @@ private static Expression toCalciteValue(Expression value, FieldType fieldType) return nullOr(value, toCalciteRow(value, fieldType.getRowSchema())); case LOGICAL_TYPE: String identifier = fieldType.getLogicalType().getIdentifier(); - if (CharType.IDENTIFIER.equals(identifier)) { + if (CharType.IDENTIFIER.equals(identifier) + || FixedString.IDENTIFIER.equals(identifier) + || VariableString.IDENTIFIER.equals(identifier)) { return Expressions.convert_(value, String.class); + } else if (FixedBytes.IDENTIFIER.equals(identifier) + || VariableBytes.IDENTIFIER.equals(identifier)) { + return Expressions.convert_(value, byte[].class); } else if (TimeWithLocalTzType.IDENTIFIER.equals(identifier)) { return nullOr( value, Expressions.call(Expressions.convert_(value, DateTime.class), "getMillis")); diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java index 50163b03172e..4f8d57a4fbc5 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java @@ -194,6 +194,13 @@ public static SqlTypeName toSqlTypeName(FieldType type) { typeName = BEAM_TO_CALCITE_DEFAULT_MAPPING.get(type); } if (typeName == null) { + if (type.getLogicalType() != null) { + Schema.LogicalType logicalType = type.getLogicalType(); + if (logicalType instanceof PassThroughLogicalType) { + // for pass through logical type, just return its base type + return toSqlTypeName(logicalType.getBaseType()); + } + } throw new IllegalArgumentException( String.format("Cannot find a 
matching Calcite SqlTypeName for Beam type: %s", type)); } else { diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java index 9d8bb8c68280..6d2905db0b8f 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/BeamComplexTypeTest.java @@ -17,6 +17,7 @@ */ package org.apache.beam.sdk.extensions.sql; +import java.nio.charset.StandardCharsets; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; @@ -25,11 +26,16 @@ import java.util.Map; import org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv; import org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils; +import org.apache.beam.sdk.extensions.sql.meta.BeamSqlTable; import org.apache.beam.sdk.extensions.sql.meta.provider.ReadOnlyTableProvider; import org.apache.beam.sdk.extensions.sql.meta.provider.test.TestBoundedTable; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; +import org.apache.beam.sdk.schemas.logicaltypes.FixedString; import org.apache.beam.sdk.schemas.logicaltypes.SqlTypes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; @@ -82,36 +88,66 @@ public class BeamComplexTypeTest { .addArrayField("field3", FieldType.INT64) .build(); + private static final Schema rowWithLogicalTypeSchema = + Schema.builder() + .addLogicalTypeField("field1", FixedString.of(10)) + .addLogicalTypeField("field2", VariableString.of(10)) + .addLogicalTypeField("field3", 
FixedBytes.of(10)) + .addLogicalTypeField("field4", VariableBytes.of(10)) + .build(); + private static final ReadOnlyTableProvider readOnlyTableProvider = new ReadOnlyTableProvider( "test_provider", - ImmutableMap.of( - "arrayWithRowTestTable", - TestBoundedTable.of(FieldType.array(FieldType.row(innerRowSchema)), "col") - .addRows( - Arrays.asList(Row.withSchema(innerRowSchema).addValues("str", 1L).build())), - "nestedArrayTestTable", - TestBoundedTable.of(FieldType.array(FieldType.array(FieldType.INT64)), "col") - .addRows(Arrays.asList(Arrays.asList(1L, 2L, 3L), Arrays.asList(4L, 5L))), - "nestedRowTestTable", - TestBoundedTable.of(Schema.FieldType.row(nestedRowSchema), "col") - .addRows( - Row.withSchema(nestedRowSchema) - .addValues( - "str", - Row.withSchema(innerRowSchema).addValues("inner_str_one", 1L).build(), - 2L, - Row.withSchema(innerRowSchema).addValues("inner_str_two", 3L).build()) - .build()), - "basicRowTestTable", - TestBoundedTable.of(Schema.FieldType.row(innerRowSchema), "col") - .addRows(Row.withSchema(innerRowSchema).addValues("innerStr", 1L).build()), - "rowWithArrayTestTable", - TestBoundedTable.of(Schema.FieldType.row(rowWithArraySchema), "col") - .addRows( - Row.withSchema(rowWithArraySchema) - .addValues("str", 4L, Arrays.asList(5L, 6L)) - .build()))); + ImmutableMap.builder() + .put( + "arrayWithRowTestTable", + TestBoundedTable.of(FieldType.array(FieldType.row(innerRowSchema)), "col") + .addRows( + Arrays.asList( + Row.withSchema(innerRowSchema).addValues("str", 1L).build()))) + .put( + "nestedArrayTestTable", + TestBoundedTable.of(FieldType.array(FieldType.array(FieldType.INT64)), "col") + .addRows(Arrays.asList(Arrays.asList(1L, 2L, 3L), Arrays.asList(4L, 5L)))) + .put( + "nestedRowTestTable", + TestBoundedTable.of(FieldType.row(nestedRowSchema), "col") + .addRows( + Row.withSchema(nestedRowSchema) + .addValues( + "str", + Row.withSchema(innerRowSchema) + .addValues("inner_str_one", 1L) + .build(), + 2L, + 
Row.withSchema(innerRowSchema) + .addValues("inner_str_two", 3L) + .build()) + .build())) + .put( + "basicRowTestTable", + TestBoundedTable.of(FieldType.row(innerRowSchema), "col") + .addRows(Row.withSchema(innerRowSchema).addValues("innerStr", 1L).build())) + .put( + "rowWithArrayTestTable", + TestBoundedTable.of(FieldType.row(rowWithArraySchema), "col") + .addRows( + Row.withSchema(rowWithArraySchema) + .addValues("str", 4L, Arrays.asList(5L, 6L)) + .build())) + .put( + "rowWithLogicalTypeSchema", + TestBoundedTable.of(FieldType.row(rowWithLogicalTypeSchema), "col") + .addRows( + Row.withSchema(rowWithLogicalTypeSchema) + .addValues( + "1234567890", + "1", + "1234567890".getBytes(StandardCharsets.UTF_8), + "1".getBytes(StandardCharsets.UTF_8)) + .build())) + .build()); @Rule public transient TestPipeline pipeline = TestPipeline.create(); @@ -211,6 +247,23 @@ public void testRowWithArray() { pipeline.run().waitUntilFinish(Duration.standardMinutes(2)); } + @Test + public void testRowWithLogicalTypeSchema() { + BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(readOnlyTableProvider); + PCollection stream = + BeamSqlRelUtils.toPCollection( + pipeline, + sqlEnv.parseQuery( + "SELECT rowWithLogicalTypeSchema.col.field1, rowWithLogicalTypeSchema.col.field4 FROM rowWithLogicalTypeSchema")); + PAssert.that(stream) + .containsInAnyOrder( + Row.withSchema( + Schema.builder().addStringField("field1").addByteArrayField("field2").build()) + .addValues("1234567890", "1".getBytes(StandardCharsets.UTF_8)) + .build()); + pipeline.run().waitUntilFinish(Duration.standardMinutes(2)); + } + @Test public void testFieldAccessToNestedRow() { BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(readOnlyTableProvider); diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubTableProviderIT.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubTableProviderIT.java index f8d8ff3098a7..7bd872e7c510 100644 
--- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubTableProviderIT.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsub/PubsubTableProviderIT.java @@ -78,7 +78,6 @@ import org.hamcrest.Matcher; import org.joda.time.Duration; import org.joda.time.Instant; -import org.junit.Ignore; import org.junit.Rule; import org.junit.Test; import org.junit.runner.RunWith; @@ -196,7 +195,6 @@ public void testSQLSelectsPayloadContent() throws Exception { resultSignal.waitForSuccess(timeout); } - @Ignore("https://github.com/apache/beam/issues/20937") @Test public void testSQLSelectsArrayAttributes() throws Exception { diff --git a/sdks/java/io/amazon-web-services2/build.gradle b/sdks/java/io/amazon-web-services2/build.gradle index 1c5d3dc82683..5b25cde8f0e0 100644 --- a/sdks/java/io/amazon-web-services2/build.gradle +++ b/sdks/java/io/amazon-web-services2/build.gradle @@ -48,6 +48,7 @@ dependencies { implementation library.java.aws_java_sdk2_auth, excludeNetty implementation library.java.aws_java_sdk2_regions, excludeNetty implementation library.java.aws_java_sdk2_utils, excludeNetty + implementation library.java.aws_java_sdk2_profiles, excludeNetty implementation library.java.aws_java_sdk2_http_client_spi, excludeNetty implementation library.java.aws_java_sdk2_apache_client, excludeNetty implementation library.java.aws_java_sdk2_netty_client, excludeNetty diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsModule.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsModule.java index 0f8b138d0b95..d814b395950a 100644 --- a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsModule.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsModule.java @@ -49,6 +49,7 @@ import 
org.apache.beam.sdk.annotations.Experimental.Kind; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet; import org.checkerframework.checker.nullness.qual.NonNull; +import org.slf4j.LoggerFactory; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; import software.amazon.awssdk.auth.credentials.AwsCredentials; import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; @@ -60,6 +61,7 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; import software.amazon.awssdk.auth.credentials.SystemPropertyCredentialsProvider; import software.amazon.awssdk.http.apache.ProxyConfiguration; +import software.amazon.awssdk.profiles.ProfileFileSystemSetting; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.sts.StsClient; import software.amazon.awssdk.services.sts.auth.StsAssumeRoleCredentialsProvider; @@ -75,6 +77,7 @@ public class AwsModule extends SimpleModule { private static final String ACCESS_KEY_ID = "accessKeyId"; private static final String SECRET_ACCESS_KEY = "secretAccessKey"; private static final String SESSION_TOKEN = "sessionToken"; + private static final String PROFILE_NAME = "profileName"; public AwsModule() { super("AwsModule"); @@ -160,7 +163,9 @@ public AwsCredentialsProvider deserializeWithType( } else if (hasName(SystemPropertyCredentialsProvider.class, typeName)) { return SystemPropertyCredentialsProvider.create(); } else if (hasName(ProfileCredentialsProvider.class, typeName)) { - return ProfileCredentialsProvider.create(); + return json.has(PROFILE_NAME) + ? 
ProfileCredentialsProvider.create(getNotNull(json, PROFILE_NAME, typeName)) + : ProfileCredentialsProvider.create(); } else if (hasName(ContainerCredentialsProvider.class, typeName)) { return ContainerCredentialsProvider.builder().build(); } else if (typeName.equals(StsAssumeRoleCredentialsProvider.class.getSimpleName())) { @@ -195,7 +200,6 @@ private static class AWSCredentialsProviderSerializer DefaultCredentialsProvider.class, EnvironmentVariableCredentialsProvider.class, SystemPropertyCredentialsProvider.class, - ProfileCredentialsProvider.class, ContainerCredentialsProvider.class); @Override @@ -228,6 +232,23 @@ public void serializeWithType( jsonGenerator.writeStringField(ACCESS_KEY_ID, credentials.accessKeyId()); jsonGenerator.writeStringField(SECRET_ACCESS_KEY, credentials.secretAccessKey()); } + } else if (providerClass.equals(ProfileCredentialsProvider.class)) { + String profileName = (String) readField(credentialsProvider, PROFILE_NAME); + String envProfileName = ProfileFileSystemSetting.AWS_PROFILE.getStringValueOrThrow(); + if (profileName != null && !profileName.equals(envProfileName)) { + jsonGenerator.writeStringField(PROFILE_NAME, profileName); + } + try { + Exception exception = (Exception) readField(credentialsProvider, "loadException"); + if (exception != null) { + LoggerFactory.getLogger(AwsModule.class) + .warn("Serialized ProfileCredentialsProvider in faulty state.", exception); + } + } catch (RuntimeException e) { + LoggerFactory.getLogger(AwsModule.class) + .warn("Failed to check ProfileCredentialsProvider for loadException.", e); + } + } else if (providerClass.equals(StsAssumeRoleCredentialsProvider.class)) { Supplier reqSupplier = (Supplier) diff --git a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java index d2e02217d15a..ae86c27b78e2 100644 --- 
a/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java +++ b/sdks/java/io/amazon-web-services2/src/main/java/org/apache/beam/sdk/io/aws2/options/AwsOptions.java @@ -78,21 +78,20 @@ public Region create(PipelineOptions options) { *

The class name of the provider must be set in the {@code @type} field. Note: Not all * available providers are supported and some configuration options might be ignored. * - *

Most providers rely on system's environment to follow AWS conventions, there's no further - * configuration: + *

Most providers must use the system environment following AWS conventions. Programmatic + * configuration for these providers is NOT supported: *

  • {@link DefaultCredentialsProvider} *
  • {@link EnvironmentVariableCredentialsProvider} *
  • {@link SystemPropertyCredentialsProvider} - *
  • {@link ProfileCredentialsProvider} *
  • {@link ContainerCredentialsProvider} * *

    Example: * - *

    {@code --awsCredentialsProvider={"@type": "ProfileCredentialsProvider"}}
    + *
    {@code --awsCredentialsProvider={"@type": "EnvironmentVariableCredentialsProvider"}}
    +   *     
    * - *

    Some other providers require additional configuration: + *

    Some other providers support additional configuration: *

  • {@link StaticCredentialsProvider} - *
  • {@link StsAssumeRoleCredentialsProvider} * *

    Examples: * @@ -107,9 +106,27 @@ public Region create(PipelineOptions options) { * "awsAccessKeyId": "key_id_value", * "awsSecretKey": "secret_value", * "sessionToken": "token_value" + * }} + * + *

  • {@link ProfileCredentialsProvider} + * + *

{@code profileName} is optional; if not set, the environment default is used. Be careful + * when using this provider programmatically, as it can behave unexpectedly. + * + *

    Examples: + * + *

    {@code --awsCredentialsProvider={
    +   *   "@type": "ProfileCredentialsProvider"
        * }
        *
        * --awsCredentialsProvider={
    +   *   "@type": "ProfileCredentialsProvider",
    +   *   "profileName": "my_profile"
    +   * }}
    + * + *
  • {@link StsAssumeRoleCredentialsProvider} + * + *
    {@code --awsCredentialsProvider={
        *   "@type": "StsAssumeRoleCredentialsProvider",
        *   "roleArn": "role_arn_Value",
        *   "roleSessionName": "session_name_value",
    diff --git a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/AwsModuleTest.java b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/AwsModuleTest.java
    index e5962812e64b..17e6f528f969 100644
    --- a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/AwsModuleTest.java
    +++ b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/AwsModuleTest.java
    @@ -18,6 +18,7 @@
     package org.apache.beam.sdk.io.aws2.options;
     
     import static org.apache.beam.repackaged.core.org.apache.commons.lang3.reflect.FieldUtils.readField;
    +import static org.apache.beam.sdk.io.aws2.options.SerializationTestUtil.serialize;
     import static org.assertj.core.api.Assertions.assertThat;
     import static org.hamcrest.Matchers.hasItem;
     import static org.hamcrest.Matchers.instanceOf;
    @@ -25,21 +26,32 @@
     import static software.amazon.awssdk.core.SdkSystemSetting.AWS_ACCESS_KEY_ID;
     import static software.amazon.awssdk.core.SdkSystemSetting.AWS_REGION;
     import static software.amazon.awssdk.core.SdkSystemSetting.AWS_SECRET_ACCESS_KEY;
    +import static software.amazon.awssdk.profiles.ProfileFileSystemSetting.AWS_CONFIG_FILE;
    +import static software.amazon.awssdk.profiles.ProfileFileSystemSetting.AWS_PROFILE;
     
     import com.amazonaws.regions.Regions;
     import com.fasterxml.jackson.databind.Module;
     import com.fasterxml.jackson.databind.ObjectMapper;
    +import java.io.IOException;
     import java.net.URI;
    +import java.nio.file.Files;
    +import java.nio.file.Path;
    +import java.util.Arrays;
     import java.util.List;
     import java.util.Properties;
     import java.util.function.Supplier;
    +import org.apache.beam.sdk.testing.ExpectedLogs;
     import org.apache.beam.sdk.util.ThrowingSupplier;
     import org.apache.beam.sdk.util.common.ReflectHelpers;
     import org.hamcrest.MatcherAssert;
    +import org.junit.ClassRule;
    +import org.junit.Rule;
     import org.junit.Test;
    +import org.junit.rules.ExternalResource;
     import org.junit.runner.RunWith;
     import org.junit.runners.JUnit4;
     import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
    +import software.amazon.awssdk.auth.credentials.AwsCredentials;
     import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider;
     import software.amazon.awssdk.auth.credentials.AwsSessionCredentials;
     import software.amazon.awssdk.auth.credentials.ContainerCredentialsProvider;
    @@ -57,6 +69,24 @@
     @RunWith(JUnit4.class)
     public class AwsModuleTest {
     
    +  @ClassRule
    +  public static final ProfileFile PROFILE =
    +      new ProfileFile(
    +          "[default]",
    +          "aws_access_key_id=defaultkey",
    +          "aws_secret_access_key=123",
    +          "[profile other]",
    +          "aws_access_key_id=otherkey",
    +          "aws_secret_access_key=abc");
    +
    +  private static final AwsCredentials DEFAULT_CREDENTIALS =
    +      AwsBasicCredentials.create("defaultkey", "123");
    +
    +  private static final AwsCredentials OTHER_CREDENTIALS =
    +      AwsBasicCredentials.create("otherkey", "abc");
    +
    +  @Rule public final ExpectedLogs logs = ExpectedLogs.none(AwsModule.class);
    +
       @Test
       public void testObjectMapperIsAbleToFindModule() {
         List modules = ObjectMapper.findModules(ReflectHelpers.findClassLoader());
    @@ -68,7 +98,7 @@ private  T serializeAndDeserialize(T obj) {
       }
     
       @Test
    -  public void testStaticCredentialsProviderSerializationDeserialization() {
    +  public void testStaticCredentialsProviderSerDe() {
         AwsCredentialsProvider provider =
             StaticCredentialsProvider.create(AwsBasicCredentials.create("key", "secret"));
     
    @@ -84,7 +114,7 @@ public void testStaticCredentialsProviderSerializationDeserialization() {
       }
     
       @Test
    -  public void testAwsCredentialsProviderSerializationDeserialization() {
    +  public void testAwsCredentialsProviderSerDe() {
         AwsCredentialsProvider provider = DefaultCredentialsProvider.create();
         AwsCredentialsProvider deserializedProvider = serializeAndDeserialize(provider);
         assertEquals(provider.getClass(), deserializedProvider.getClass());
    @@ -97,17 +127,90 @@ public void testAwsCredentialsProviderSerializationDeserialization() {
         deserializedProvider = serializeAndDeserialize(provider);
         assertEquals(provider.getClass(), deserializedProvider.getClass());
     
    -    provider = ProfileCredentialsProvider.create();
    -    deserializedProvider = serializeAndDeserialize(provider);
    -    assertEquals(provider.getClass(), deserializedProvider.getClass());
    -
         provider = ContainerCredentialsProvider.builder().build();
         deserializedProvider = serializeAndDeserialize(provider);
         assertEquals(provider.getClass(), deserializedProvider.getClass());
       }
     
       @Test
    -  public void testStsAssumeRoleCredentialsProviderSerializationDeserialization() throws Exception {
    +  public void testProfileCredentialsProviderSerDeWithDefaultProfile() throws Exception {
    +    // System properties produced by the PROFILE rule with "default" as the selected profile.
    +    Properties profileProps = PROFILE.properties("default");
    +    withSystemProperties(
    +        profileProps,
    +        () -> {
    +          AwsCredentialsProvider expected = ProfileCredentialsProvider.create();
    +
    +          // A provider created without a profile name serializes to the bare type tag.
    +          String json = serialize(expected);
    +          assertThat(json).isEqualTo("{\"@type\":\"ProfileCredentialsProvider\"}");
    +
    +          // The restored provider must resolve DEFAULT_CREDENTIALS and match the original.
    +          AwsCredentialsProvider restored = deserialize(json);
    +          assertThat(restored.resolveCredentials())
    +              .isEqualToComparingFieldByField(DEFAULT_CREDENTIALS);
    +          return assertThat(restored)
    +              .isExactlyInstanceOf(ProfileCredentialsProvider.class)
    +              .isEqualToComparingFieldByFieldRecursively(expected);
    +        });
    +  }
    +
    +  @Test
    +  public void testProfileCredentialsProviderSerDeWithCustomProfile() throws Exception {
    +    // System properties produced by the PROFILE rule with "default" as the selected profile.
    +    Properties profileProps = PROFILE.properties("default");
    +    withSystemProperties(
    +        profileProps,
    +        () -> {
    +          AwsCredentialsProvider expected = ProfileCredentialsProvider.create("other");
    +
    +          // The expected JSON carries the explicit profile name "other".
    +          String json = serialize(expected);
    +          assertThat(json)
    +              .isEqualTo("{\"@type\":\"ProfileCredentialsProvider\",\"profileName\":\"other\"}");
    +
    +          // The restored provider must resolve OTHER_CREDENTIALS and match the original.
    +          AwsCredentialsProvider restored = deserialize(json);
    +          assertThat(restored.resolveCredentials())
    +              .isEqualToComparingFieldByField(OTHER_CREDENTIALS);
    +          return assertThat(restored)
    +              .isExactlyInstanceOf(ProfileCredentialsProvider.class)
    +              .isEqualToComparingFieldByFieldRecursively(expected);
    +        });
    +  }
    +
    +  @Test
    +  public void testProfileCredentialsProviderSerDeWithCustomDefaultProfile() throws Exception {
    +    // System properties produced by the PROFILE rule with "other" as the selected profile.
    +    Properties profileProps = PROFILE.properties("other");
    +    withSystemProperties(
    +        profileProps,
    +        () -> {
    +          AwsCredentialsProvider expected = ProfileCredentialsProvider.create("other");
    +
    +          // "other" is also the profile passed to PROFILE.properties; the JSON omits the name.
    +          String json = serialize(expected);
    +          assertThat(json).isEqualTo("{\"@type\":\"ProfileCredentialsProvider\"}");
    +
    +          // The restored provider must resolve OTHER_CREDENTIALS and match the original.
    +          AwsCredentialsProvider restored = deserialize(json);
    +          assertThat(restored.resolveCredentials())
    +              .isEqualToComparingFieldByFieldRecursively(OTHER_CREDENTIALS);
    +          return assertThat(restored)
    +              .isExactlyInstanceOf(ProfileCredentialsProvider.class)
    +              .isEqualToComparingFieldByFieldRecursively(expected);
    +        });
    +  }
    +
    +  @Test
    +  public void testProfileCredentialsProviderSerDeWithUnknownProfile() throws Exception {
    +    // System properties produced by the PROFILE rule with "default" as the selected profile.
    +    Properties profileProps = PROFILE.properties("default");
    +    withSystemProperties(
    +        profileProps,
    +        () -> {
    +          AwsCredentialsProvider faulty = ProfileCredentialsProvider.create("unknown");
    +
    +          // ProfileCredentialsProvider SILENTLY drops unknown profiles
    +          String json = serialize(faulty);
    +          assertThat(json).isEqualTo("{\"@type\":\"ProfileCredentialsProvider\"}");
    +
    +          // NOTE: This documents the unexpected behavior in case a faulty provider is serialized
    +          AwsCredentialsProvider restored = deserialize(json);
    +          return assertThat(restored.resolveCredentials())
    +              .isEqualToComparingFieldByField(DEFAULT_CREDENTIALS);
    +        });
    +
    +    // Verified after the system properties have been restored by withSystemProperties.
    +    logs.verifyWarn("Serialized ProfileCredentialsProvider in faulty state.");
    +  }
    +
    +  @Test
    +  public void testStsAssumeRoleCredentialsProviderSerDe() throws Exception {
         AssumeRoleRequest req = AssumeRoleRequest.builder().roleArn("roleArn").policy("policy").build();
         Supplier provider =
             () ->
    @@ -123,7 +226,7 @@ public void testStsAssumeRoleCredentialsProviderSerializationDeserialization() t
     
         // Region and credentials for STS client are resolved using default providers
         AwsCredentialsProvider deserializedProvider =
    -        withSystemPropertyOverrides(overrides, () -> serializeAndDeserialize(provider.get()));
    +        withSystemProperties(overrides, () -> serializeAndDeserialize(provider.get()));
     
         Supplier requestSupplier =
             (Supplier)
    @@ -132,7 +235,7 @@ public void testStsAssumeRoleCredentialsProviderSerializationDeserialization() t
       }
     
       @Test
    -  public void testProxyConfigurationSerializationDeserialization() {
    +  public void testProxyConfigurationSerDe() {
         ProxyConfiguration proxyConfiguration =
             ProxyConfiguration.builder()
                 .endpoint(URI.create("http://localhost:8080"))
    @@ -147,7 +250,7 @@ public void testProxyConfigurationSerializationDeserialization() {
         assertEquals("password", deserializedProxyConfiguration.password());
       }
     
    -  private  T withSystemPropertyOverrides(Properties overrides, ThrowingSupplier fun)
    +  private  T withSystemProperties(Properties overrides, ThrowingSupplier fun)
           throws Exception {
         Properties systemProps = System.getProperties();
     
    @@ -164,4 +267,39 @@ private  T withSystemPropertyOverrides(Properties overrides, ThrowingSupplier
           previousProps.forEach(systemProps::put);
         }
       }
    +
    +  /** Reads the JSON string {@code provider} back as an {@link AwsCredentialsProvider}. */
    +  private static AwsCredentialsProvider deserialize(String provider) {
    +    Class<AwsCredentialsProvider> providerType = AwsCredentialsProvider.class;
    +    return SerializationTestUtil.deserialize(provider, providerType);
    +  }
    +
    +  /**
    +   * JUnit {@link ExternalResource} that writes the given lines to a temporary file before a test
    +   * and deletes that file afterwards.
    +   */
    +  static class ProfileFile extends ExternalResource {
    +    private final String[] lines;
    +    private Path path;
    +
    +    public ProfileFile(String... lines) {
    +      this.lines = lines;
    +    }
    +
    +    /**
    +     * Builds properties that map {@code AWS_CONFIG_FILE.property()} to the temporary file and
    +     * {@code AWS_PROFILE.property()} to {@code defaultProfile}.
    +     *
    +     * <p>Must only be called once {@link #before()} has created the file.
    +     */
    +    public Properties properties(String defaultProfile) {
    +      Properties props = new Properties();
    +      props.setProperty(AWS_CONFIG_FILE.property(), path.toString());
    +      props.setProperty(AWS_PROFILE.property(), defaultProfile);
    +      return props;
    +    }
    +
    +    @Override
    +    protected void before() throws Throwable {
    +      Path created = Files.createTempFile("profile", ".conf");
    +      try {
    +        Files.write(created, Arrays.asList(lines));
    +      } catch (IOException e) {
    +        // after() is not run when before() throws, so clean up the file here.
    +        deleteQuietly(created);
    +        throw e;
    +      }
    +      path = created;
    +    }
    +
    +    @Override
    +    protected void after() {
    +      deleteQuietly(path);
    +    }
    +
    +    /** Best-effort delete: a failed cleanup must not fail the test. */
    +    private static void deleteQuietly(Path file) {
    +      try {
    +        Files.deleteIfExists(file);
    +      } catch (IOException e) {
    +        // ignore
    +      }
    +    }
    +  }
     }
    diff --git a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/SerializationTestUtil.java b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/SerializationTestUtil.java
    index 0f5daf0bc92c..6cf79c958090 100644
    --- a/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/SerializationTestUtil.java
    +++ b/sdks/java/io/amazon-web-services2/src/test/java/org/apache/beam/sdk/io/aws2/options/SerializationTestUtil.java
    @@ -28,11 +28,22 @@ public class SerializationTestUtil {
               .registerModules(ObjectMapper.findModules(ReflectHelpers.findClassLoader()));
     
       public static <T> T serializeDeserialize(Class<T> clazz, T obj) {
    +    // Round trip through JSON: serialize obj, then read the result back as clazz.
    +    return deserialize(serialize(obj), clazz);
    +  }
    +
    +  /**
    +   * Writes {@code obj} as a JSON string using {@code MAPPER}.
    +   *
    +   * @throws RuntimeException wrapping the {@link JsonProcessingException} if writing fails
    +   */
    +  public static <T> String serialize(T obj) {
    +    try {
    +      return MAPPER.writeValueAsString(obj);
    +    } catch (JsonProcessingException e) {
    +      throw new RuntimeException("Failed to serialize " + obj.getClass().getSimpleName(), e);
    +    }
    +  }
    +
    +  /**
    +   * Reads {@code jsonString} as an instance of {@code clazz} using {@code MAPPER}.
    +   *
    +   * @throws RuntimeException wrapping the {@link JsonProcessingException} if reading fails
    +   */
    +  public static <T> T deserialize(String jsonString, Class<T> clazz) {
         try {
    -      String jsonString = MAPPER.writeValueAsString(obj);
           return MAPPER.readValue(jsonString, clazz);
         } catch (JsonProcessingException e) {
    -      throw new RuntimeException("Failed to serialize/deserialize " + clazz.getSimpleName(), e);
    +      throw new RuntimeException("Failed to deserialize " + clazz.getSimpleName(), e);
         }
       }
     }
    diff --git a/sdks/java/io/bigquery-io-perf-tests/src/test/java/org/apache/beam/sdk/bigqueryioperftests/BigQueryIOIT.java b/sdks/java/io/bigquery-io-perf-tests/src/test/java/org/apache/beam/sdk/bigqueryioperftests/BigQueryIOIT.java
    index c67dc936705e..0dfc7addc6f7 100644
    --- a/sdks/java/io/bigquery-io-perf-tests/src/test/java/org/apache/beam/sdk/bigqueryioperftests/BigQueryIOIT.java
    +++ b/sdks/java/io/bigquery-io-perf-tests/src/test/java/org/apache/beam/sdk/bigqueryioperftests/BigQueryIOIT.java
    @@ -17,6 +17,7 @@
      */
     package org.apache.beam.sdk.bigqueryioperftests;
     
    +import static org.junit.Assert.assertEquals;
     import static org.junit.Assert.assertNotEquals;
     
     import com.google.api.services.bigquery.model.TableFieldSchema;
    @@ -28,6 +29,7 @@
     import com.google.cloud.bigquery.TableId;
     import java.io.IOException;
     import java.nio.ByteBuffer;
    +import java.util.Base64;
     import java.util.Collections;
     import java.util.List;
     import java.util.UUID;
    @@ -43,7 +45,10 @@
     import org.apache.beam.sdk.io.synthetic.SyntheticBoundedSource;
     import org.apache.beam.sdk.io.synthetic.SyntheticOptions;
     import org.apache.beam.sdk.io.synthetic.SyntheticSourceOptions;
    +import org.apache.beam.sdk.metrics.Counter;
    +import org.apache.beam.sdk.metrics.Metrics;
     import org.apache.beam.sdk.options.Description;
    +import org.apache.beam.sdk.options.StreamingOptions;
     import org.apache.beam.sdk.options.Validation;
     import org.apache.beam.sdk.options.ValueProvider;
     import org.apache.beam.sdk.testutils.NamedTestResult;
    @@ -54,6 +59,7 @@
     import org.apache.beam.sdk.transforms.DoFn;
     import org.apache.beam.sdk.transforms.ParDo;
     import org.apache.beam.sdk.values.KV;
    +import org.joda.time.Duration;
     import org.junit.AfterClass;
     import org.junit.BeforeClass;
     import org.junit.Test;
    @@ -87,6 +93,7 @@ public class BigQueryIOIT {
       private static final String READ_TIME_METRIC_NAME = "read_time";
       private static final String WRITE_TIME_METRIC_NAME = "write_time";
       private static final String AVRO_WRITE_TIME_METRIC_NAME = "avro_write_time";
    +  private static final String READ_ELEMENT_METRIC_NAME = "read_count";
       private static String testBigQueryDataset;
       private static String testBigQueryTable;
       private static SyntheticSourceOptions sourceOptions;
    @@ -141,10 +148,11 @@ public void testWriteThenRead() {
       private void testJsonWrite() {
         BigQueryIO.Write writeIO =
             BigQueryIO.write()
    +            .withSuccessfulInsertsPropagation(false)
                 .withFormatFunction(
                     input -> {
                       TableRow tableRow = new TableRow();
    -                  tableRow.set("data", input);
    +                  tableRow.set("data", Base64.getEncoder().encodeToString(input));
                       return tableRow;
                     });
         testWrite(writeIO, WRITE_TIME_METRIC_NAME);
    @@ -165,9 +173,13 @@ private void testAvroWrite() {
       }
     
       private void testWrite(BigQueryIO.Write writeIO, String metricName) {
    -    Pipeline pipeline = Pipeline.create(options);
    -
         BigQueryIO.Write.Method method = BigQueryIO.Write.Method.valueOf(options.getWriteMethod());
    +    if (method == BigQueryIO.Write.Method.STREAMING_INSERTS) {
    +      // set streaming for STREAMING_INSERTS write
    +      options.as(StreamingOptions.class).setStreaming(true);
    +    }
    +
    +    Pipeline pipeline = Pipeline.create(options);
         pipeline
             .apply("Read from source", Read.from(new SyntheticBoundedSource(sourceOptions)))
             .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, metricName)))
    @@ -185,19 +197,30 @@ private void testWrite(BigQueryIO.Write writeIO, String metricName) {
                                     new TableFieldSchema().setName("data").setType("BYTES")))));
     
         PipelineResult pipelineResult = pipeline.run();
    -    PipelineResult.State pipelineState = pipelineResult.waitUntilFinish();
    +    PipelineResult.State pipelineState =
    +        options.getPipelineTimeout() == null
    +            ? pipelineResult.waitUntilFinish()
    +            : pipelineResult.waitUntilFinish(
    +                Duration.standardSeconds(options.getPipelineTimeout()));
         extractAndPublishTime(pipelineResult, metricName);
         // Fail the test if pipeline failed.
         assertNotEquals(pipelineState, PipelineResult.State.FAILED);
    +
    +    // set back streaming
    +    options.as(StreamingOptions.class).setStreaming(false);
       }
     
       private void testRead() {
         Pipeline pipeline = Pipeline.create(options);
         pipeline
             .apply("Read from BQ", BigQueryIO.readTableRows().from(tableQualifier))
    -        .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)));
    +        .apply("Gather time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)))
    +        .apply("Counting element", ParDo.of(new CountingFn<>(NAMESPACE, READ_ELEMENT_METRIC_NAME)));
         PipelineResult result = pipeline.run();
         PipelineResult.State pipelineState = result.waitUntilFinish();
    +
    +    assertEquals(
    +        sourceOptions.numRecords, readElementMetric(result, NAMESPACE, READ_ELEMENT_METRIC_NAME));
         extractAndPublishTime(result, READ_TIME_METRIC_NAME);
         // Fail the test if pipeline failed.
         assertNotEquals(pipelineState, PipelineResult.State.FAILED);
    @@ -219,6 +242,11 @@ private static Function getMetricSupplier(String
         };
       }
     
    +  /** Reads the counter {@code name} through a {@link MetricsReader} for {@code namespace}. */
    +  private long readElementMetric(PipelineResult result, String namespace, String name) {
    +    return new MetricsReader(result, namespace).getCounterMetric(name);
    +  }
    +
       /** Options for this io performance test. */
       public interface BigQueryPerfTestOptions extends IOTestPipelineOptions {
         @Description("Synthetic source options")
    @@ -256,6 +284,11 @@ public interface BigQueryPerfTestOptions extends IOTestPipelineOptions {
     
         @Description("Write Avro or JSON to BQ")
         void setWriteFormat(String value);
    +
    +    /**
    +     * Optional limit, in seconds, on how long the write test waits for the pipeline. When this
    +     * is {@code null}, the write test waits until the pipeline finishes.
    +     */
    +    Integer getPipelineTimeout();
    +
    +    @Description("Time to wait for the events to be processed by the pipeline (in seconds)")
    +    void setPipelineTimeout(Integer writeTimeout);
       }
     
       private static class MapKVToV extends DoFn, byte[]> {
    @@ -265,6 +298,20 @@ public void process(ProcessContext context) {
         }
       }
     
    +  /** {@link DoFn} that increments a {@link Counter} for every element and emits no output. */
    +  private static class CountingFn<T> extends DoFn<T, Void> {
    +
    +    private final Counter elementCounter;
    +
    +    /** Creates the counter identified by {@code namespace} and {@code name}. */
    +    CountingFn(String namespace, String name) {
    +      elementCounter = Metrics.counter(namespace, name);
    +    }
    +
    +    @ProcessElement
    +    public void processElement() {
    +      // The element itself is not needed; only the number of invocations is counted.
    +      elementCounter.inc(1L);
    +    }
    +  }
    +
       private enum WriteFormat {
         AVRO,
         JSON
    diff --git a/sdks/java/io/cdap/build.gradle b/sdks/java/io/cdap/build.gradle
    index 1bcc0ece146b..3cfc01f79f7a 100644
    --- a/sdks/java/io/cdap/build.gradle
    +++ b/sdks/java/io/cdap/build.gradle
    @@ -52,14 +52,17 @@ dependencies {
         implementation library.java.cdap_plugin_zendesk
         implementation library.java.commons_lang3
         implementation library.java.guava
    +    implementation library.java.google_code_gson
         implementation library.java.hadoop_common
         implementation library.java.hadoop_mapreduce_client_core
         implementation library.java.jackson_core
         implementation library.java.jackson_databind
         implementation library.java.slf4j_api
    +    implementation library.java.spark_streaming
         implementation library.java.tephra
         implementation library.java.vendored_guava_26_0_jre
         implementation project(path: ":sdks:java:core", configuration: "shadow")
    +    implementation project(":sdks:java:io:sparkreceiver")
         implementation project(":sdks:java:io:hadoop-format")
         testImplementation library.java.cdap_plugin_service_now
         testImplementation library.java.cdap_etl_api
    diff --git a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/CdapIO.java b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/CdapIO.java
    index f2655507cf56..5590bb061654 100644
    --- a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/CdapIO.java
    +++ b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/CdapIO.java
    @@ -17,27 +17,163 @@
      */
     package org.apache.beam.sdk.io.cdap;
     
    -import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull;
    +import static org.apache.beam.sdk.io.cdap.MappingUtils.getOffsetFnForPluginClass;
    +import static org.apache.beam.sdk.io.cdap.MappingUtils.getPluginByClass;
    +import static org.apache.beam.sdk.io.cdap.MappingUtils.getReceiverBuilderByPluginClass;
    +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull;
     import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument;
     
     import com.google.auto.value.AutoValue;
     import io.cdap.cdap.api.plugin.PluginConfig;
    +import java.util.Map;
     import org.apache.beam.sdk.annotations.Experimental;
     import org.apache.beam.sdk.annotations.Experimental.Kind;
    +import org.apache.beam.sdk.coders.CannotProvideCoderException;
    +import org.apache.beam.sdk.coders.Coder;
     import org.apache.beam.sdk.io.hadoop.format.HDFSSynchronization;
     import org.apache.beam.sdk.io.hadoop.format.HadoopFormatIO;
    +import org.apache.beam.sdk.io.sparkreceiver.SparkReceiverIO;
    +import org.apache.beam.sdk.transforms.MapElements;
     import org.apache.beam.sdk.transforms.PTransform;
    +import org.apache.beam.sdk.transforms.SerializableFunction;
     import org.apache.beam.sdk.values.KV;
     import org.apache.beam.sdk.values.PBegin;
     import org.apache.beam.sdk.values.PCollection;
     import org.apache.beam.sdk.values.PDone;
    +import org.apache.beam.sdk.values.TypeDescriptor;
     import org.apache.commons.lang3.NotImplementedException;
     import org.apache.hadoop.conf.Configuration;
    +import org.apache.hadoop.mapreduce.InputFormat;
    +import org.apache.hadoop.mapreduce.OutputFormat;
     import org.checkerframework.checker.nullness.qual.Nullable;
     
     /**
    - * An unbounded/bounded sources and sinks from CDAP plugins.
    + * A {@link CdapIO} is a Transform for reading data from source or writing data to sink of a Cdap
    + * Plugin. It uses {@link HadoopFormatIO} for Batch and {@link SparkReceiverIO} for Streaming.
    + *
    + * 

    Read from Cdap Plugin Bounded Source

    + * + *

    To configure {@link CdapIO} source, you must specify Cdap {@link Plugin}, Cdap {@link + * PluginConfig}, key and value classes. + * + *

    {@link Plugin} is the Wrapper class for the Cdap Plugin. It contains main information about + * the Plugin. The object of the {@link Plugin} class can be created with the {@link + * Plugin#createBatch(Class, Class, Class)} method. Method requires the following parameters: + * + *

      + *
    • {@link io.cdap.cdap.etl.api.batch.BatchSource} class + *
    • {@link InputFormat} class + *
    • {@link io.cdap.cdap.api.data.batch.InputFormatProvider} class + *
    + * + *

    For more information about the InputFormat and InputFormatProvider, see {@link + * HadoopFormatIO}. + * + *

    Every Cdap Plugin has its {@link PluginConfig} class with necessary fields to configure the + * Plugin. You can set the {@link Map} of your parameters with the {@link + * ConfigWrapper#withParams(Map)} method where the key is the field name. + * + *

    For example, to create a basic {@link CdapIO#read()} transform: + * + *

    {@code
    + * Pipeline p = ...; // Create pipeline.
    + *
    + * // Create PluginConfig for specific plugin
    + * EmployeeConfig pluginConfig =
    + *         new ConfigWrapper<>(EmployeeConfig.class).withParams(TEST_EMPLOYEE_PARAMS_MAP).build();
    + *
    + * // Read using CDAP batch plugin
    + * p.apply("ReadBatch",
    + * CdapIO.read()
    + *             .withCdapPlugin(
    + *                 Plugin.createBatch(
    + *                     EmployeeBatchSource.class,
    + *                     EmployeeInputFormat.class,
    + *                     EmployeeInputFormatProvider.class))
    + *             .withPluginConfig(pluginConfig)
    + *             .withKeyClass(String.class)
    + *             .withValueClass(String.class));
    + * }
    + * + *

    Write to Cdap Plugin Bounded Sink

    + * + *

    To configure {@link CdapIO} sink, just as {@link CdapIO#read()} Cdap {@link Plugin}, Cdap + * {@link PluginConfig}, key, value classes must be specified. In addition, it's necessary to + * determine locks directory path {@link CdapIO.Write#withLocksDirPath(String)}. It's used for + * {@link HDFSSynchronization} configuration for {@link HadoopFormatIO}. More info can be found in + * {@link HadoopFormatIO} documentation. + * + *

    To create the object of the {@link Plugin} class with the {@link Plugin#createBatch(Class, + * Class, Class)} method, need to specify the following parameters: + * + *

      + *
    • {@link io.cdap.cdap.etl.api.batch.BatchSink} class + *
    • {@link OutputFormat} class + *
    • {@link io.cdap.cdap.api.data.batch.OutputFormatProvider} class + *
    + * + *

    For more information about the OutputFormat and OutputFormatProvider, see {@link + * HadoopFormatIO}. + * + *

    Example of {@link CdapIO#write()} usage: + * + *

    {@code
    + * Pipeline p = ...; // Create pipeline.
    + *
    + * // Get or create data to write
    + * PCollection> input = p.apply(Create.of(data));
    + *
    + * // Create PluginConfig for specific plugin
    + * EmployeeConfig pluginConfig =
    + *         new ConfigWrapper<>(EmployeeConfig.class).withParams(TEST_EMPLOYEE_PARAMS_MAP).build();
    + *
    + * // Write using CDAP batch plugin
    + * input.apply(
    + *         "WriteBatch",
    + *         CdapIO.write()
    + *             .withCdapPlugin(
    + *                 Plugin.createBatch(
    + *                     EmployeeBatchSink.class,
    + *                     EmployeeOutputFormat.class,
    + *                     EmployeeOutputFormatProvider.class))
    + *             .withPluginConfig(pluginConfig)
    + *             .withKeyClass(String.class)
    + *             .withValueClass(String.class)
    + *             .withLocksDirPath(tmpFolder.getRoot().getAbsolutePath()));
    + *     p.run();
    + * }
    + * + *

    Read from Cdap Plugin Streaming Source

    + * + *

    To configure {@link CdapIO} source, you must specify Cdap {@link Plugin}, Cdap {@link + * PluginConfig}, key and value classes. + * + *

    {@link Plugin} is the Wrapper class for the Cdap Plugin. It contains main information about + * the Plugin. The object of the {@link Plugin} class can be created with the {@link + * Plugin#createStreaming(Class)} method. Method requires {@link + * io.cdap.cdap.etl.api.streaming.StreamingSource} class parameter. + * + *

    Every Cdap Plugin has its {@link PluginConfig} class with necessary fields to configure the + * Plugin. You can set the {@link Map} of your parameters with the {@link + * ConfigWrapper#withParams(Map)} method where the key is the field name. + * + *

    For example, to create a basic {@link CdapIO#read()} transform: + * + *

    {@code
    + * Pipeline p = ...; // Create pipeline.
    + *
    + * // Create PluginConfig for specific plugin
    + * EmployeeConfig pluginConfig =
    + *         new ConfigWrapper<>(EmployeeConfig.class).withParams(TEST_EMPLOYEE_PARAMS_MAP).build();
    + *
    + * // Read using CDAP streaming plugin
    + * p.apply("ReadStreaming",
    + * CdapIO.read()
    + *             .withCdapPlugin(Plugin.createStreaming(EmployeeStreamingSource.class))
    + *             .withPluginConfig(pluginConfig)
    + *             .withKeyClass(String.class)
    + *             .withValueClass(String.class));
    + * }
    */ @Experimental(Kind.SOURCE_SINK) public class CdapIO { @@ -54,12 +190,25 @@ public static Write write() { @AutoValue @AutoValue.CopyAnnotations public abstract static class Read extends PTransform>> { + abstract @Nullable PluginConfig getPluginConfig(); abstract @Nullable Plugin getCdapPlugin(); + /** + * Depending on selected {@link HadoopFormatIO} type ({@link InputFormat} or {@link + * OutputFormat}), appropriate key class ("key.class") in Hadoop {@link Configuration} must be + * provided. If you set different Format key class than Format's actual key class then, it may + * result in an error. More info can be found in {@link HadoopFormatIO} documentation. + */ abstract @Nullable Class getKeyClass(); + /** + * Depending on selected {@link HadoopFormatIO} type ({@link InputFormat} or {@link + * OutputFormat}), appropriate value class ("value.class") in Hadoop {@link Configuration} must + * be provided. If you set different Format value class than Format's actual value class then, + * it may result in an error. More info can be found in {@link HadoopFormatIO} documentation. + */ abstract @Nullable Class getValueClass(); abstract Builder toBuilder(); @@ -79,27 +228,32 @@ abstract static class Builder { abstract Read build(); } + /** Sets a CDAP {@link Plugin}. */ public Read withCdapPlugin(Plugin plugin) { checkArgument(plugin != null, "Cdap plugin can not be null"); return toBuilder().setCdapPlugin(plugin).build(); } + /** Sets a CDAP Plugin class. */ public Read withCdapPluginClass(Class cdapPluginClass) { checkArgument(cdapPluginClass != null, "Cdap plugin class can not be null"); Plugin plugin = MappingUtils.getPluginByClass(cdapPluginClass); return toBuilder().setCdapPlugin(plugin).build(); } + /** Sets a {@link PluginConfig}. */ public Read withPluginConfig(PluginConfig pluginConfig) { checkArgument(pluginConfig != null, "Plugin config can not be null"); return toBuilder().setPluginConfig(pluginConfig).build(); } + /** Sets a key class. 
*/ public Read withKeyClass(Class keyClass) { checkArgument(keyClass != null, "Key class can not be null"); return toBuilder().setKeyClass(keyClass).build(); } + /** Sets a value class. */ public Read withValueClass(Class valueClass) { checkArgument(valueClass != null, "Value class can not be null"); return toBuilder().setValueClass(valueClass).build(); @@ -107,19 +261,38 @@ public Read withValueClass(Class valueClass) { @Override public PCollection> expand(PBegin input) { - Plugin plugin = checkArgumentNotNull(getCdapPlugin(), "withCdapPluginClass() is required"); - PluginConfig pluginConfig = - checkArgumentNotNull(getPluginConfig(), "withPluginConfig() is required"); - Class keyClass = checkArgumentNotNull(getKeyClass(), "withKeyClass() is required"); - Class valueClass = checkArgumentNotNull(getValueClass(), "withValueClass() is required"); - - plugin.withConfig(pluginConfig).withHadoopConfiguration(keyClass, valueClass).prepareRun(); - - if (plugin.isUnbounded()) { - // TODO: implement SparkReceiverIO.<~>read() - throw new NotImplementedException("Support for unbounded plugins is not implemented!"); + Plugin cdapPlugin = getCdapPlugin(); + checkStateNotNull(cdapPlugin, "withCdapPluginClass() is required"); + + PluginConfig pluginConfig = getPluginConfig(); + checkStateNotNull(pluginConfig, "withPluginConfig() is required"); + + Class valueClass = getValueClass(); + checkStateNotNull(valueClass, "withValueClass() is required"); + + Class keyClass = getKeyClass(); + checkStateNotNull(keyClass, "withKeyClass() is required"); + + cdapPlugin.withConfig(pluginConfig); + + if (cdapPlugin.isUnbounded()) { + SparkReceiverIO.Read reader = + SparkReceiverIO.read() + .withGetOffsetFn(getOffsetFnForPluginClass(cdapPlugin.getPluginClass(), valueClass)) + .withSparkReceiverBuilder( + getReceiverBuilderByPluginClass( + cdapPlugin.getPluginClass(), pluginConfig, valueClass)); + try { + Coder coder = input.getPipeline().getCoderRegistry().getCoder(valueClass); + PCollection 
values = input.apply(reader).setCoder(coder); + SerializableFunction> fn = input1 -> KV.of(null, input1); + return values.apply(MapElements.into(new TypeDescriptor>() {}).via(fn)); + } catch (CannotProvideCoderException e) { + throw new IllegalStateException("Could not get value Coder", e); + } } else { - Configuration hConf = plugin.getHadoopConfiguration(); + cdapPlugin.withHadoopConfiguration(keyClass, valueClass).prepareRun(); + Configuration hConf = cdapPlugin.getHadoopConfiguration(); HadoopFormatIO.Read readFromHadoop = HadoopFormatIO.read().withConfiguration(hConf); return input.apply(readFromHadoop); @@ -127,7 +300,7 @@ public PCollection> expand(PBegin input) { } } - /** A {@link PTransform} to read from CDAP source. */ + /** A {@link PTransform} to write to CDAP sink. */ @AutoValue @AutoValue.CopyAnnotations public abstract static class Write extends PTransform>, PDone> { @@ -136,10 +309,28 @@ public abstract static class Write extends PTransform abstract @Nullable Plugin getCdapPlugin(); + /** + * Depending on selected {@link HadoopFormatIO} type ({@link InputFormat} or {@link + * OutputFormat}), appropriate key class ("key.class") in Hadoop {@link Configuration} must be + * provided. If you set different Format key class than Format's actual key class then, it may + * result in an error. More info can be found in {@link HadoopFormatIO} documentation. + */ abstract @Nullable Class getKeyClass(); + /** + * Depending on selected {@link HadoopFormatIO} type ({@link InputFormat} or {@link + * OutputFormat}), appropriate value class ("value.class") in Hadoop {@link Configuration} must + * be provided. If you set different Format value class than Format's actual value class then, + * it may result in an error. More info can be found in {@link HadoopFormatIO} documentation. + */ abstract @Nullable Class getValueClass(); + /** + * Directory where locks will be stored. 
This directory MUST be different that directory which + * is possibly stored under FileOutputFormat.outputDir key. Used for {@link HDFSSynchronization} + * configuration for {@link HadoopFormatIO}. More info can be found in {@link HadoopFormatIO} + * documentation. + */ abstract @Nullable String getLocksDirPath(); abstract Builder toBuilder(); @@ -161,32 +352,38 @@ abstract static class Builder { abstract Write build(); } + /** Sets a CDAP {@link Plugin}. */ public Write withCdapPlugin(Plugin plugin) { checkArgument(plugin != null, "Cdap plugin can not be null"); return toBuilder().setCdapPlugin(plugin).build(); } + /** Sets a CDAP Plugin class. */ public Write withCdapPluginClass(Class cdapPluginClass) { checkArgument(cdapPluginClass != null, "Cdap plugin class can not be null"); - Plugin plugin = MappingUtils.getPluginByClass(cdapPluginClass); + Plugin plugin = getPluginByClass(cdapPluginClass); return toBuilder().setCdapPlugin(plugin).build(); } + /** Sets a {@link PluginConfig}. */ public Write withPluginConfig(PluginConfig pluginConfig) { checkArgument(pluginConfig != null, "Plugin config can not be null"); return toBuilder().setPluginConfig(pluginConfig).build(); } + /** Sets a key class. */ public Write withKeyClass(Class keyClass) { checkArgument(keyClass != null, "Key class can not be null"); return toBuilder().setKeyClass(keyClass).build(); } + /** Sets path to directory where locks will be stored. */ public Write withLocksDirPath(String locksDirPath) { checkArgument(locksDirPath != null, "Locks dir path can not be null"); return toBuilder().setLocksDirPath(locksDirPath).build(); } + /** Sets a value class. 
*/ public Write withValueClass(Class valueClass) { checkArgument(valueClass != null, "Value class can not be null"); return toBuilder().setValueClass(valueClass).build(); @@ -194,21 +391,30 @@ public Write withValueClass(Class valueClass) { @Override public PDone expand(PCollection> input) { - Plugin plugin = checkArgumentNotNull(getCdapPlugin(), "withKeyClass() is required"); - PluginConfig pluginConfig = - checkArgumentNotNull(getPluginConfig(), "withKeyClass() is required"); - Class keyClass = checkArgumentNotNull(getKeyClass(), "withKeyClass() is required"); - Class valueClass = checkArgumentNotNull(getValueClass(), "withValueClass() is required"); - String locksDirPath = - checkArgumentNotNull(getLocksDirPath(), "withLocksDirPath() is required"); + Plugin cdapPlugin = getCdapPlugin(); + checkStateNotNull(cdapPlugin, "withCdapPluginClass() is required"); + + PluginConfig pluginConfig = getPluginConfig(); + checkStateNotNull(pluginConfig, "withPluginConfig() is required"); + + Class keyClass = getKeyClass(); + checkStateNotNull(keyClass, "withKeyClass() is required"); + Class valueClass = getValueClass(); + checkStateNotNull(valueClass, "withValueClass() is required"); + + String locksDirPath = getLocksDirPath(); + checkStateNotNull(locksDirPath, "withLocksDirPath() is required"); - plugin.withConfig(pluginConfig).withHadoopConfiguration(keyClass, valueClass).prepareRun(); + cdapPlugin + .withConfig(pluginConfig) + .withHadoopConfiguration(keyClass, valueClass) + .prepareRun(); - if (plugin.isUnbounded()) { + if (cdapPlugin.isUnbounded()) { // TODO: implement SparkReceiverIO.<~>write() throw new NotImplementedException("Support for unbounded plugins is not implemented!"); } else { - Configuration hConf = plugin.getHadoopConfiguration(); + Configuration hConf = cdapPlugin.getHadoopConfiguration(); HadoopFormatIO.Write writeHadoop = HadoopFormatIO.write() .withConfiguration(hConf) diff --git 
a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/ConfigWrapper.java b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/ConfigWrapper.java index 9a2124e21b46..b073e275be38 100644 --- a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/ConfigWrapper.java +++ b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/ConfigWrapper.java @@ -41,6 +41,7 @@ public ConfigWrapper(Class configClass) { this.configClass = configClass; } + /** Gets {@link ConfigWrapper} by JSON string. */ public ConfigWrapper fromJsonString(String jsonString) throws IOException { TypeReference> typeRef = new TypeReference>() {}; @@ -53,6 +54,7 @@ public ConfigWrapper fromJsonString(String jsonString) throws IOException { return this; } + /** Gets {@link ConfigWrapper} by JSON file. */ public ConfigWrapper fromJsonFile(File jsonFile) throws IOException { TypeReference> typeRef = new TypeReference>() {}; @@ -65,11 +67,13 @@ public ConfigWrapper fromJsonFile(File jsonFile) throws IOException { return this; } + /** Sets a {@link Plugin} parameters {@link Map}. */ public ConfigWrapper withParams(Map paramsMap) { this.paramsMap = new HashMap<>(paramsMap); return this; } + /** Sets a {@link Plugin} single parameter. 
*/ public ConfigWrapper setParam(String paramName, Object param) { getParamsMap().put(paramName, param); return this; diff --git a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/MappingUtils.java b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/MappingUtils.java index f8c7ce5d7550..463cc501a982 100644 --- a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/MappingUtils.java +++ b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/MappingUtils.java @@ -17,14 +17,19 @@ */ package org.apache.beam.sdk.io.cdap; +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; +import com.google.gson.Gson; +import io.cdap.cdap.api.plugin.PluginConfig; import io.cdap.plugin.common.SourceInputFormatProvider; import io.cdap.plugin.hubspot.sink.batch.HubspotBatchSink; import io.cdap.plugin.hubspot.sink.batch.HubspotOutputFormat; import io.cdap.plugin.hubspot.source.batch.HubspotBatchSource; import io.cdap.plugin.hubspot.source.batch.HubspotInputFormat; import io.cdap.plugin.hubspot.source.batch.HubspotInputFormatProvider; +import io.cdap.plugin.hubspot.source.streaming.HubspotReceiver; +import io.cdap.plugin.hubspot.source.streaming.HubspotStreamingSource; import io.cdap.plugin.salesforce.plugin.source.batch.SalesforceBatchSource; import io.cdap.plugin.salesforce.plugin.source.batch.SalesforceInputFormat; import io.cdap.plugin.salesforce.plugin.source.batch.SalesforceInputFormatProvider; @@ -33,23 +38,118 @@ import io.cdap.plugin.zendesk.source.batch.ZendeskBatchSource; import io.cdap.plugin.zendesk.source.batch.ZendeskInputFormat; import io.cdap.plugin.zendesk.source.batch.ZendeskInputFormatProvider; +import java.util.HashMap; +import java.util.Map; +import org.apache.beam.sdk.io.sparkreceiver.ReceiverBuilder; +import org.apache.beam.sdk.transforms.SerializableFunction; +import 
org.apache.beam.vendor.guava.v26_0_jre.com.google.common.reflect.TypeToken; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +/** Util class for mapping plugins. */ public class MappingUtils { - public static Plugin getPluginByClass(Class pluginClass) { + private static final Logger LOG = LoggerFactory.getLogger(MappingUtils.class); + private static final String HUBSPOT_ID_FIELD = "vid"; + private static final Gson GSON = new Gson(); + + private static final Map< + Class, Pair, ReceiverBuilder>>> + REGISTERED_PLUGINS; + + static { + REGISTERED_PLUGINS = new HashMap<>(); + } + + /** Gets a {@link Plugin} by its class. */ + static Plugin getPluginByClass(Class pluginClass) { checkArgument(pluginClass != null, "Plugin class can not be null!"); if (pluginClass.equals(SalesforceBatchSource.class)) { - return Plugin.create( + return Plugin.createBatch( pluginClass, SalesforceInputFormat.class, SalesforceInputFormatProvider.class); } else if (pluginClass.equals(HubspotBatchSource.class)) { - return Plugin.create(pluginClass, HubspotInputFormat.class, HubspotInputFormatProvider.class); + return Plugin.createBatch( + pluginClass, HubspotInputFormat.class, HubspotInputFormatProvider.class); } else if (pluginClass.equals(ZendeskBatchSource.class)) { - return Plugin.create(pluginClass, ZendeskInputFormat.class, ZendeskInputFormatProvider.class); + return Plugin.createBatch( + pluginClass, ZendeskInputFormat.class, ZendeskInputFormatProvider.class); } else if (pluginClass.equals(HubspotBatchSink.class)) { - return Plugin.create(pluginClass, HubspotOutputFormat.class, SourceInputFormatProvider.class); + return Plugin.createBatch( + pluginClass, HubspotOutputFormat.class, SourceInputFormatProvider.class); } else if (pluginClass.equals(ServiceNowSource.class)) { - return Plugin.create( + return Plugin.createBatch( 
pluginClass, ServiceNowInputFormat.class, SourceInputFormatProvider.class); + } else if (pluginClass.equals(HubspotStreamingSource.class)) { + return Plugin.createStreaming(pluginClass); + } + throw new UnsupportedOperationException( + String.format("Given plugin class '%s' is not supported!", pluginClass.getName())); + } + + /** Gets a {@link ReceiverBuilder} by CDAP {@link Plugin} class. */ + @SuppressWarnings("unchecked") + static ReceiverBuilder> getReceiverBuilderByPluginClass( + Class pluginClass, PluginConfig pluginConfig, Class valueClass) { + checkArgument(pluginClass != null, "Plugin class can not be null!"); + checkArgument(pluginConfig != null, "Plugin config can not be null!"); + checkArgument(valueClass != null, "Value class can not be null!"); + if (pluginClass.equals(HubspotStreamingSource.class) && String.class.equals(valueClass)) { + ReceiverBuilder> receiverBuilder = + new ReceiverBuilder<>(HubspotReceiver.class).withConstructorArgs(pluginConfig); + return (ReceiverBuilder>) receiverBuilder; + } + if (REGISTERED_PLUGINS.containsKey(pluginClass)) { + return (ReceiverBuilder>) + REGISTERED_PLUGINS.get(pluginClass).getRight(); + } + throw new UnsupportedOperationException( + String.format("Given plugin class '%s' is not supported!", pluginClass.getName())); + } + + /** + * Register new CDAP Streaming {@link Plugin} class providing corresponding {@param getOffsetFn} + * and {@param receiverBuilder} params. 
+ */ + public static void registerStreamingPlugin( + Class pluginClass, + SerializableFunction getOffsetFn, + ReceiverBuilder> receiverBuilder) { + REGISTERED_PLUGINS.put(pluginClass, new ImmutablePair<>(getOffsetFn, receiverBuilder)); + } + + private static SerializableFunction getOffsetFnForHubspot() { + return input -> { + if (input != null) { + try { + HashMap json = + GSON.fromJson(input, new TypeToken>() {}.getType()); + checkArgumentNotNull(json, "Can not get JSON from Hubspot input string"); + Object id = json.get(HUBSPOT_ID_FIELD); + checkArgumentNotNull(id, "Can not get ID from Hubspot input string"); + return ((Integer) id).longValue(); + } catch (Exception e) { + LOG.error("Can not get offset from json", e); + } + } + return 0L; + }; + } + + /** + * Gets a {@link SerializableFunction} that defines how to get record offset for CDAP {@link + * Plugin} class. + */ + @SuppressWarnings("unchecked") + static SerializableFunction getOffsetFnForPluginClass( + Class pluginClass, Class valueClass) { + if (pluginClass.equals(HubspotStreamingSource.class) && String.class.equals(valueClass)) { + return (SerializableFunction) getOffsetFnForHubspot(); + } + if (REGISTERED_PLUGINS.containsKey(pluginClass)) { + return (SerializableFunction) REGISTERED_PLUGINS.get(pluginClass).getLeft(); } throw new UnsupportedOperationException( String.format("Given plugin class '%s' is not supported!", pluginClass.getName())); diff --git a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/Plugin.java b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/Plugin.java index 31deb9d258db..6da476b56f3e 100644 --- a/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/Plugin.java +++ b/sdks/java/io/cdap/src/main/java/org/apache/beam/sdk/io/cdap/Plugin.java @@ -26,6 +26,7 @@ import io.cdap.cdap.etl.api.batch.BatchSinkContext; import io.cdap.cdap.etl.api.batch.BatchSource; import io.cdap.cdap.etl.api.batch.BatchSourceContext; +import 
io.cdap.cdap.etl.api.streaming.StreamingSource; import java.lang.annotation.Annotation; import java.lang.reflect.Constructor; import java.lang.reflect.Method; @@ -37,6 +38,7 @@ import org.apache.beam.sdk.io.cdap.context.BatchContextImpl; import org.apache.beam.sdk.io.cdap.context.BatchSinkContextImpl; import org.apache.beam.sdk.io.cdap.context.BatchSourceContextImpl; +import org.apache.beam.sdk.io.cdap.context.StreamingSourceContextImpl; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.MRJobConfig; import org.slf4j.Logger; @@ -49,6 +51,7 @@ public abstract class Plugin { private static final Logger LOG = LoggerFactory.getLogger(Plugin.class); private static final String PREPARE_RUN_METHOD_NAME = "prepareRun"; + private static final String GET_STREAM_METHOD_NAME = "getStream"; protected @Nullable PluginConfig pluginConfig; protected @Nullable Configuration hadoopConfiguration; @@ -61,10 +64,10 @@ public abstract class Plugin { public abstract Class getPluginClass(); /** Gets InputFormat or OutputFormat class for a plugin. */ - public abstract Class getFormatClass(); + public @Nullable abstract Class getFormatClass(); /** Gets InputFormatProvider or OutputFormatProvider class for a plugin. */ - public abstract Class getFormatProviderClass(); + public @Nullable abstract Class getFormatProviderClass(); /** Sets a plugin config. */ public Plugin withConfig(PluginConfig pluginConfig) { @@ -83,46 +86,57 @@ public Plugin withConfig(PluginConfig pluginConfig) { * validating connection to the CDAP sink/source and performing initial tuning. 
*/ public void prepareRun() { - PluginConfig pluginConfig = getPluginConfig(); - checkStateNotNull(pluginConfig, "PluginConfig should be not null!"); + if (isUnbounded()) { + // Not needed for unbounded plugins + return; + } if (cdapPluginObj == null) { - try { - Constructor constructor = - getPluginClass().getDeclaredConstructor(pluginConfig.getClass()); - constructor.setAccessible(true); - cdapPluginObj = (SubmitterLifecycle) constructor.newInstance(pluginConfig); - } catch (Exception e) { - LOG.error("Can not instantiate CDAP plugin class", e); - throw new IllegalStateException("Can not call prepareRun"); - } + instantiateCdapPluginObj(); } + checkStateNotNull(cdapPluginObj, "Cdap Plugin object can't be null!"); try { cdapPluginObj.prepareRun(getContext()); - if (getPluginType().equals(PluginConstants.PluginType.SOURCE)) { - for (Map.Entry entry : - getContext().getInputFormatProvider().getInputFormatConfiguration().entrySet()) { - getHadoopConfiguration().set(entry.getKey(), entry.getValue()); - } - } else { - for (Map.Entry entry : - getContext().getOutputFormatProvider().getOutputFormatConfiguration().entrySet()) { - getHadoopConfiguration().set(entry.getKey(), entry.getValue()); - } - getHadoopConfiguration().set(MRJobConfig.ID, String.valueOf(1)); - } } catch (Exception e) { LOG.error("Error while prepareRun", e); throw new IllegalStateException("Error while prepareRun"); } + if (getPluginType().equals(PluginConstants.PluginType.SOURCE)) { + for (Map.Entry entry : + getContext().getInputFormatProvider().getInputFormatConfiguration().entrySet()) { + getHadoopConfiguration().set(entry.getKey(), entry.getValue()); + } + } else { + for (Map.Entry entry : + getContext().getOutputFormatProvider().getOutputFormatConfiguration().entrySet()) { + getHadoopConfiguration().set(entry.getKey(), entry.getValue()); + } + getHadoopConfiguration().set(MRJobConfig.ID, String.valueOf(1)); + } + } + + /** Creates an instance of {@link #cdapPluginObj} using {@link 
#pluginConfig}. */ + private void instantiateCdapPluginObj() { + PluginConfig pluginConfig = getPluginConfig(); + checkStateNotNull(pluginConfig, "PluginConfig should be not null!"); + try { + Constructor constructor = getPluginClass().getDeclaredConstructor(pluginConfig.getClass()); + constructor.setAccessible(true); + cdapPluginObj = (SubmitterLifecycle) constructor.newInstance(pluginConfig); + } catch (Exception e) { + LOG.error("Can not instantiate CDAP plugin class", e); + throw new IllegalStateException("Can not call prepareRun"); + } } /** Sets a plugin Hadoop configuration. */ public Plugin withHadoopConfiguration(Class formatKeyClass, Class formatValueClass) { + Class formatClass = getFormatClass(); + checkStateNotNull(formatClass, "Format class can't be null!"); PluginConstants.Format formatType = getFormatType(); PluginConstants.Hadoop hadoopType = getHadoopType(); getHadoopConfiguration() - .setClass(hadoopType.getFormatClass(), getFormatClass(), formatType.getFormatClass()); + .setClass(hadoopType.getFormatClass(), formatClass, formatType.getFormatClass()); getHadoopConfiguration().setClass(hadoopType.getKeyClass(), formatKeyClass, Object.class); getHadoopConfiguration().setClass(hadoopType.getValueClass(), formatValueClass, Object.class); @@ -163,7 +177,8 @@ private PluginConstants.Hadoop getHadoopType() { /** Gets value of a plugin type. 
*/ public static PluginConstants.PluginType initPluginType(Class pluginClass) throws IllegalArgumentException { - if (BatchSource.class.isAssignableFrom(pluginClass)) { + if (StreamingSource.class.isAssignableFrom(pluginClass) + || BatchSource.class.isAssignableFrom(pluginClass)) { return PluginConstants.PluginType.SOURCE; } else if (BatchSink.class.isAssignableFrom(pluginClass)) { return PluginConstants.PluginType.SINK; @@ -188,6 +203,8 @@ public static BatchContextImpl initContext(Class cdapPluginClass) { } else if (contextClass.equals(BatchSinkContext.class)) { return new BatchSinkContextImpl(); } + } else if (method.getName().equals(GET_STREAM_METHOD_NAME)) { + return new StreamingSourceContextImpl(); } } throw new IllegalStateException("Cannot determine context class"); @@ -209,8 +226,8 @@ public Boolean isUnbounded() { return isUnbounded; } - /** Creates a plugin instance. */ - public static Plugin create( + /** Creates a batch plugin instance. */ + public static Plugin createBatch( Class newPluginClass, Class newFormatClass, Class newFormatProviderClass) { return builder() .setPluginClass(newPluginClass) @@ -221,6 +238,15 @@ public static Plugin create( .build(); } + /** Creates a streaming plugin instance. */ + public static Plugin createStreaming(Class newPluginClass) { + return builder() + .setPluginClass(newPluginClass) + .setPluginType(Plugin.initPluginType(newPluginClass)) + .setContext(Plugin.initContext(newPluginClass)) + .build(); + } + /** Creates a plugin builder instance. 
*/ public static Builder builder() { return new AutoValue_Plugin.Builder(); diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOIT.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOIT.java index bb5f205fc517..8f2a987a5cda 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOIT.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOIT.java @@ -162,7 +162,8 @@ private CdapIO.Write writeToDB(Mapwrite() .withCdapPlugin( - Plugin.create(DBBatchSink.class, DBOutputFormat.class, DBOutputFormatProvider.class)) + Plugin.createBatch( + DBBatchSink.class, DBOutputFormat.class, DBOutputFormatProvider.class)) .withPluginConfig(pluginConfig) .withKeyClass(TestRowDBWritable.class) .withValueClass(NullWritable.class) @@ -174,7 +175,8 @@ private CdapIO.Read readFromDB(Mapread() .withCdapPlugin( - Plugin.create(DBBatchSource.class, DBInputFormat.class, DBInputFormatProvider.class)) + Plugin.createBatch( + DBBatchSource.class, DBInputFormat.class, DBInputFormatProvider.class)) .withPluginConfig(pluginConfig) .withKeyClass(LongWritable.class) .withValueClass(TestRowDBWritable.class); diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOTest.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOTest.java index e978f5b8fcad..e18126e69acf 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOTest.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/CdapIOTest.java @@ -27,18 +27,34 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.beam.runners.direct.DirectOptions; +import org.apache.beam.runners.direct.DirectRunner; +import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.coders.KvCoder; +import org.apache.beam.sdk.coders.NullableCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.cdap.batch.EmployeeBatchSink; 
+import org.apache.beam.sdk.io.cdap.batch.EmployeeBatchSource; +import org.apache.beam.sdk.io.cdap.batch.EmployeeInputFormat; +import org.apache.beam.sdk.io.cdap.batch.EmployeeInputFormatProvider; +import org.apache.beam.sdk.io.cdap.batch.EmployeeOutputFormat; +import org.apache.beam.sdk.io.cdap.batch.EmployeeOutputFormatProvider; import org.apache.beam.sdk.io.cdap.context.BatchSinkContextImpl; import org.apache.beam.sdk.io.cdap.context.BatchSourceContextImpl; +import org.apache.beam.sdk.io.cdap.streaming.EmployeeReceiver; +import org.apache.beam.sdk.io.cdap.streaming.EmployeeStreamingSource; +import org.apache.beam.sdk.io.sparkreceiver.ReceiverBuilder; +import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PBegin; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; import org.apache.hadoop.mapreduce.OutputCommitter; +import org.joda.time.Duration; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -74,7 +90,7 @@ public void testReadBuildsCorrectly() { CdapIO.Read read = CdapIO.read() .withCdapPlugin( - Plugin.create( + Plugin.createBatch( EmployeeBatchSource.class, EmployeeInputFormat.class, EmployeeInputFormatProvider.class)) @@ -125,7 +141,7 @@ public void testReadObjectCreationFailsIfValueClassIsNull() { public void testReadExpandingFailsMissingCdapPluginClass() { PBegin testPBegin = PBegin.in(TestPipeline.create()); CdapIO.Read read = CdapIO.read(); - assertThrows(IllegalArgumentException.class, () -> read.expand(testPBegin)); + assertThrows(IllegalStateException.class, () -> read.expand(testPBegin)); } @Test @@ -136,13 +152,13 @@ public void 
testReadObjectCreationFailsIfCdapPluginClassIsNotSupported() { } @Test - public void testReadingData() { + public void testReadFromCdapBatchPlugin() { EmployeeConfig pluginConfig = new ConfigWrapper<>(EmployeeConfig.class).withParams(TEST_EMPLOYEE_PARAMS_MAP).build(); CdapIO.Read read = CdapIO.read() .withCdapPlugin( - Plugin.create( + Plugin.createBatch( EmployeeBatchSource.class, EmployeeInputFormat.class, EmployeeInputFormatProvider.class)) @@ -154,11 +170,43 @@ public void testReadingData() { for (int i = 1; i < EmployeeInputFormat.NUM_OF_TEST_EMPLOYEE_RECORDS; i++) { expected.add(KV.of(String.valueOf(i), EmployeeInputFormat.EMPLOYEE_NAME_PREFIX + i)); } - PCollection> actual = p.apply("ReadTest", read); + PCollection> actual = p.apply("ReadBatchTest", read); PAssert.that(actual).containsInAnyOrder(expected); p.run(); } + @Test + public void testReadFromCdapStreamingPlugin() { + DirectOptions options = PipelineOptionsFactory.as(DirectOptions.class); + options.setBlockOnRun(false); + options.setRunner(DirectRunner.class); + Pipeline p = Pipeline.create(options); + + EmployeeConfig pluginConfig = + new ConfigWrapper<>(EmployeeConfig.class).withParams(TEST_EMPLOYEE_PARAMS_MAP).build(); + MappingUtils.registerStreamingPlugin( + EmployeeStreamingSource.class, + Long::valueOf, + new ReceiverBuilder<>(EmployeeReceiver.class).withConstructorArgs(pluginConfig)); + + CdapIO.Read read = + CdapIO.read() + .withCdapPlugin(Plugin.createStreaming(EmployeeStreamingSource.class)) + .withPluginConfig(pluginConfig) + .withKeyClass(String.class) + .withValueClass(String.class); + + List storedRecords = EmployeeReceiver.getStoredRecords(); + + PCollection actual = + p.apply("ReadStreamingTest", read) + .setCoder(KvCoder.of(NullableCoder.of(StringUtf8Coder.of()), StringUtf8Coder.of())) + .apply(Values.create()); + + PAssert.that(actual).containsInAnyOrder(storedRecords); + p.run().waitUntilFinish(Duration.standardSeconds(15)); + } + @Test public void testWriteBuildsCorrectly() { 
EmployeeConfig pluginConfig = @@ -167,7 +215,7 @@ public void testWriteBuildsCorrectly() { CdapIO.Write write = CdapIO.write() .withCdapPlugin( - Plugin.create( + Plugin.createBatch( EmployeeBatchSink.class, EmployeeOutputFormat.class, EmployeeOutputFormatProvider.class)) @@ -230,7 +278,7 @@ public void testWriteExpandingFailsMissingCdapPluginClass() { PCollection> testPCollection = Create.empty(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())).expand(testPBegin); CdapIO.Write write = CdapIO.write(); - assertThrows(IllegalArgumentException.class, () -> write.expand(testPCollection)); + assertThrows(IllegalStateException.class, () -> write.expand(testPCollection)); } @Test @@ -241,7 +289,7 @@ public void testWriteObjectCreationFailsIfCdapPluginClassIsNotSupported() { } @Test - public void testWritingData() throws IOException { + public void testWriteWithCdapBatchSinkPlugin() throws IOException { List> data = new ArrayList<>(); for (int i = 0; i < EmployeeInputFormat.NUM_OF_TEST_EMPLOYEE_RECORDS; i++) { data.add(KV.of(String.valueOf(i), EmployeeInputFormat.EMPLOYEE_NAME_PREFIX + i)); @@ -254,10 +302,10 @@ public void testWritingData() throws IOException { "Write", CdapIO.write() .withCdapPlugin( - Plugin.create( + Plugin.createBatch( EmployeeBatchSink.class, EmployeeOutputFormat.class, - EmployeeInputFormatProvider.class)) + EmployeeOutputFormatProvider.class)) .withPluginConfig(pluginConfig) .withKeyClass(String.class) .withValueClass(String.class) diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeConfig.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeConfig.java index d02f4548cd3a..547af887fc5d 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeConfig.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeConfig.java @@ -25,10 +25,13 @@ import io.cdap.plugin.common.ReferencePluginConfig; import java.util.HashSet; import java.util.Set; +import 
org.apache.beam.sdk.io.cdap.batch.EmployeeBatchSink; +import org.apache.beam.sdk.io.cdap.batch.EmployeeBatchSource; /** * {@link io.cdap.cdap.api.plugin.PluginConfig} for {@link EmployeeBatchSource} and {@link - * EmployeeBatchSink} CDAP plugins. Used to test {@link CdapIO#read()} and {@link CdapIO#write()}. + * EmployeeBatchSink} CDAP plugins. Used to test {@link org.apache.beam.sdk.io.cdap.CdapIO#read()} + * and {@link org.apache.beam.sdk.io.cdap.CdapIO#write()}. */ public class EmployeeConfig extends ReferencePluginConfig { diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/PluginTest.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/PluginTest.java index 501c91b6cdaf..2fcfe6f36c0b 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/PluginTest.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/PluginTest.java @@ -65,7 +65,7 @@ public class PluginTest { public void testBuildingSourcePluginWithCDAPClasses() { try { Plugin serviceNowSourcePlugin = - Plugin.create( + Plugin.createBatch( ServiceNowSource.class, ServiceNowInputFormat.class, SourceInputFormatProvider.class) @@ -93,7 +93,7 @@ public void testBuildingSourcePluginWithCDAPClasses() { @Test public void testSettingPluginType() { Plugin serviceNowSourcePlugin = - Plugin.create( + Plugin.createBatch( ServiceNowSource.class, ServiceNowInputFormat.class, SourceInputFormatProvider.class) @@ -108,7 +108,7 @@ public void testSettingPluginType() { public void testSettingPluginTypeFailed() { try { Plugin serviceNowSourcePlugin = - Plugin.create(Object.class, Object.class, Object.class) + Plugin.createBatch(Object.class, Object.class, Object.class) .withConfig(serviceNowSourceConfig) .withHadoopConfiguration(Schema.class, MapWritable.class); fail("This should have thrown an exception"); diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSink.java 
b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSink.java similarity index 95% rename from sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSink.java rename to sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSink.java index 1e0b835fac77..052d9ab0f6a8 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSink.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSink.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.beam.sdk.io.cdap; +package org.apache.beam.sdk.io.cdap.batch; import io.cdap.cdap.api.annotation.Description; import io.cdap.cdap.api.annotation.Name; @@ -28,6 +28,8 @@ import io.cdap.cdap.etl.api.PipelineConfigurer; import io.cdap.cdap.etl.api.batch.BatchSink; import io.cdap.cdap.etl.api.batch.BatchSinkContext; +import org.apache.beam.sdk.io.cdap.CdapIO; +import org.apache.beam.sdk.io.cdap.EmployeeConfig; /** Imitation of CDAP {@link BatchSink} plugin. Used to test {@link CdapIO#write()}. */ @Plugin(type = BatchSink.PLUGIN_TYPE) diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSource.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSource.java similarity index 94% rename from sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSource.java rename to sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSource.java index 27494c8ce9c8..3daf2fb69b98 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeBatchSource.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeBatchSource.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.sdk.io.cdap; +package org.apache.beam.sdk.io.cdap.batch; import io.cdap.cdap.api.annotation.Description; import io.cdap.cdap.api.annotation.Name; @@ -32,6 +32,8 @@ import io.cdap.plugin.common.IdUtils; import io.cdap.plugin.common.LineageRecorder; import java.util.stream.Collectors; +import org.apache.beam.sdk.io.cdap.CdapIO; +import org.apache.beam.sdk.io.cdap.EmployeeConfig; /** Imitation of CDAP {@link BatchSource} plugin. Used to test {@link CdapIO#read()}. */ @Plugin(type = BatchSource.PLUGIN_TYPE) @@ -41,7 +43,7 @@ public class EmployeeBatchSource extends BatchSource()); } - static List> getWrittenOutput() { + public static List> getWrittenOutput() { return output; } - static OutputCommitter getOutputCommitter() { + public static OutputCommitter getOutputCommitter() { return outputCommitter; } } diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeOutputFormatProvider.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeOutputFormatProvider.java similarity index 93% rename from sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeOutputFormatProvider.java rename to sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeOutputFormatProvider.java index 826b3177d302..a42c0c89aca1 100644 --- a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/EmployeeOutputFormatProvider.java +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/batch/EmployeeOutputFormatProvider.java @@ -15,12 +15,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.beam.sdk.io.cdap; +package org.apache.beam.sdk.io.cdap.batch; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import io.cdap.cdap.api.data.batch.OutputFormatProvider; import java.util.Map; +import org.apache.beam.sdk.io.cdap.CdapIO; +import org.apache.beam.sdk.io.cdap.EmployeeConfig; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; /** diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeReceiver.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeReceiver.java new file mode 100644 index 000000000000..fcd0fa7b8d76 --- /dev/null +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeReceiver.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.cdap.streaming; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.sdk.io.cdap.EmployeeConfig; +import org.apache.beam.sdk.io.sparkreceiver.HasOffset; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Imitation of Spark {@link Receiver} for {@link EmployeeStreamingSource} CDAP plugin. Used to test + * {@link org.apache.beam.sdk.io.cdap.CdapIO#read()}. + */ +public class EmployeeReceiver extends Receiver implements HasOffset { + + public static final int RECORDS_COUNT = 20; + + private static final Logger LOG = LoggerFactory.getLogger(EmployeeReceiver.class); + private static final int TIMEOUT_MS = 500; + private static final List STORED_RECORDS = new ArrayList<>(); + private final EmployeeConfig config; + private Long startOffset; + + EmployeeReceiver(EmployeeConfig config) { + super(StorageLevel.MEMORY_AND_DISK_2()); + this.config = config; + LOG.info("Created EmployeeReceiver with objectType = {}", this.config.objectType); + } + + @Override + public void setStartOffset(Long startOffset) { + if (startOffset != null) { + this.startOffset = startOffset; + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public void onStart() { + Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().build()).submit(this::receive); + } + + @Override + public void onStop() {} + + @Override + public Long getEndOffset() { + return Long.MAX_VALUE; + } + + private void receive() { + Long currentOffset = startOffset; + while (!isStopped()) { + if (currentOffset <= RECORDS_COUNT) { + STORED_RECORDS.add(currentOffset.toString()); + store((currentOffset++).toString()); + } + try { + 
TimeUnit.MILLISECONDS.sleep(TIMEOUT_MS); + } catch (InterruptedException e) { + LOG.error("Interrupted", e); + } + } + } + + public static List getStoredRecords() { + return STORED_RECORDS; + } +} diff --git a/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeStreamingSource.java b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeStreamingSource.java new file mode 100644 index 000000000000..e73688e97725 --- /dev/null +++ b/sdks/java/io/cdap/src/test/java/org/apache/beam/sdk/io/cdap/streaming/EmployeeStreamingSource.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.cdap.streaming; + +import io.cdap.cdap.api.annotation.Description; +import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.annotation.Plugin; +import io.cdap.cdap.api.data.format.StructuredRecord; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.cdap.etl.api.PipelineConfigurer; +import io.cdap.cdap.etl.api.streaming.StreamingContext; +import io.cdap.cdap.etl.api.streaming.StreamingSource; +import java.io.IOException; +import org.apache.beam.sdk.io.cdap.CdapIO; +import org.apache.beam.sdk.io.cdap.EmployeeConfig; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; + +/** Imitation of CDAP {@link StreamingSource} plugin. Used to test {@link CdapIO#read()}. */ +@Plugin(type = StreamingSource.PLUGIN_TYPE) +@Name(EmployeeStreamingSource.NAME) +@Description("Plugin reads Employee in streaming") +public class EmployeeStreamingSource extends StreamingSource { + + public static final String NAME = "EmployeeStreamingSource"; + + private final EmployeeConfig config; + + public EmployeeStreamingSource(EmployeeConfig config) { + this.config = config; + } + + @Override + public void configurePipeline(PipelineConfigurer pipelineConfigurer) { + FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector(); + config.validate(collector); // validate when macros are not substituted + collector.getOrThrowException(); + + pipelineConfigurer.getStageConfigurer().setOutputSchema(config.getSchema()); + } + + @Override + public JavaDStream getStream(StreamingContext streamingContext) + throws IOException { + FailureCollector collector = streamingContext.getFailureCollector(); + config.validate(collector); // validate when macros are substituted + collector.getOrThrowException(); + + JavaStreamingContext jssc = streamingContext.getSparkStreamingContext(); + + return jssc.receiverStream(new EmployeeReceiver(config)) + .map(jsonString 
-> transform(jsonString, config)); + } + + public static StructuredRecord transform(String value, EmployeeConfig config) { + StructuredRecord.Builder builder = StructuredRecord.builder(config.getSchema()); + builder.set("id", value); + builder.set("name", "Employee " + value); + return builder.build(); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java index 953d1237d9c9..53cb27136412 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryOptions.java @@ -150,4 +150,10 @@ public interface BigQueryOptions Integer getStorageApiAppendThresholdRecordCount(); void setStorageApiAppendThresholdRecordCount(Integer value); + + @Description("Maximum request size allowed by the storage write API. 
") + @Default.Long(10 * 1000 * 1000) + Long getStorageWriteApiMaxRequestSize(); + + void setStorageWriteApiMaxRequestSize(Long value); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java index 9df75a5be943..5ec5549ed4ca 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryServicesImpl.java @@ -29,7 +29,6 @@ import com.google.api.client.util.ExponentialBackOff; import com.google.api.client.util.Sleeper; import com.google.api.core.ApiFuture; -import com.google.api.gax.core.ExecutorProvider; import com.google.api.gax.core.FixedCredentialsProvider; import com.google.api.gax.rpc.ApiException; import com.google.api.gax.rpc.FixedHeaderProvider; @@ -106,7 +105,6 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.Executors; import java.util.concurrent.Future; -import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -122,7 +120,6 @@ import org.apache.beam.sdk.extensions.gcp.util.Transport; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; -import org.apache.beam.sdk.options.ExecutorOptions; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.util.FluentBackoff; @@ -1209,8 +1206,14 @@ long insertAll( } rowsToPublish = retryRows; idsToPublish = retryIds; + // print first 5 failures + int numErrorToLog = Math.min(allErrors.size(), 5); + LOG.info( + "Retrying {} failed inserts to BigQuery. 
First {} fails: {}", + rowsToPublish.size(), + numErrorToLog, + allErrors.subList(0, numErrorToLog)); allErrors.clear(); - LOG.info("Retrying {} failed inserts to BigQuery", rowsToPublish.size()); } if (successfulRows != null) { for (int i = 0; i < rowsToPublish.size(); i++) { @@ -1488,36 +1491,12 @@ private static BigQueryWriteClient newBigQueryWriteClient(BigQueryOptions option return BigQueryWriteClient.create( BigQueryWriteSettings.newBuilder() .setCredentialsProvider(() -> options.as(GcpOptions.class).getGcpCredential()) - .setBackgroundExecutorProvider(new OptionsExecutionProvider(options)) .build()); } catch (Exception e) { throw new RuntimeException(e); } } - /** - * OptionsExecutionProvider is a utility class used to wrap the Pipeline-wide {@link - * ScheduledExecutorService} into a supplier for the {@link BigQueryWriteClient}. - */ - private static class OptionsExecutionProvider implements ExecutorProvider { - - private final BigQueryOptions options; - - public OptionsExecutionProvider(BigQueryOptions options) { - this.options = options; - } - - @Override - public boolean shouldAutoClose() { - return false; - } - - @Override - public ScheduledExecutorService getExecutor() { - return options.as(ExecutorOptions.class).getScheduledExecutorService(); - } - } - public static CustomHttpErrors createBigQueryClientCustomErrors() { CustomHttpErrors.Builder builder = new CustomHttpErrors.Builder(); // 403 errors, to list tables, matching this URL: diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageTableSource.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageTableSource.java index 017fcd6c7e7d..26a9bed20c72 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageTableSource.java +++ 
b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryStorageTableSource.java @@ -185,7 +185,7 @@ protected Table getTargetTable(BigQueryOptions options) throws Exception { : options.getBigQueryProject()); } try (DatasetService datasetService = bqServices.getDatasetService(options)) { - Table table = bqServices.getDatasetService(options).getTable(tableReference); + Table table = datasetService.getTable(tableReference); if (table == null) { throw new IllegalArgumentException("Table not found" + table); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java index e48b9a196902..20ab251c9c0c 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiLoads.java @@ -24,6 +24,7 @@ import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.schemas.NoSuchSchemaException; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.Flatten; import org.apache.beam.sdk.transforms.GroupIntoBatches; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; @@ -32,6 +33,7 @@ import org.apache.beam.sdk.util.ShardedKey; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionList; import org.apache.beam.sdk.values.PCollectionTuple; import org.apache.beam.sdk.values.TupleTag; import org.joda.time.Duration; @@ -101,7 +103,7 @@ public WriteResult expandInconsistent( PCollection> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.into(new GlobalWindows())); - PCollectionTuple convertedRecords = + PCollectionTuple 
convertMessagesResult = inputInGlobalWindow .apply( "CreateTables", @@ -116,20 +118,23 @@ public WriteResult expandInconsistent( successfulRowsTag, BigQueryStorageApiInsertErrorCoder.of(), successCoder)); - convertedRecords - .get(successfulRowsTag) - .apply( - "StorageApiWriteInconsistent", - new StorageApiWriteRecordsInconsistent<>(dynamicDestinations, bqServices)); + PCollectionTuple writeRecordsResult = + convertMessagesResult + .get(successfulRowsTag) + .apply( + "StorageApiWriteInconsistent", + new StorageApiWriteRecordsInconsistent<>( + dynamicDestinations, + bqServices, + failedRowsTag, + BigQueryStorageApiInsertErrorCoder.of())); + + PCollection insertErrors = + PCollectionList.of(convertMessagesResult.get(failedRowsTag)) + .and(writeRecordsResult.get(failedRowsTag)) + .apply("flattenErrors", Flatten.pCollections()); return WriteResult.in( - input.getPipeline(), - null, - null, - null, - null, - null, - failedRowsTag, - convertedRecords.get(failedRowsTag)); + input.getPipeline(), null, null, null, null, null, failedRowsTag, insertErrors); } public WriteResult expandTriggered( @@ -139,7 +144,7 @@ public WriteResult expandTriggered( // Handle triggered, low-latency loads into BigQuery. 
PCollection> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.into(new GlobalWindows())); - PCollectionTuple result = + PCollectionTuple convertMessagesResult = inputInGlobalWindow .apply( "CreateTables", @@ -159,7 +164,7 @@ public WriteResult expandTriggered( if (this.allowAutosharding) { groupedRecords = - result + convertMessagesResult .get(successfulRowsTag) .apply( "GroupIntoBatches", @@ -171,7 +176,7 @@ public WriteResult expandTriggered( } else { PCollection, StorageApiWritePayload>> shardedRecords = - createShardedKeyValuePairs(result) + createShardedKeyValuePairs(convertMessagesResult) .setCoder(KvCoder.of(ShardedKey.Coder.of(destinationCoder), payloadCoder)); groupedRecords = shardedRecords.apply( @@ -181,20 +186,25 @@ public WriteResult expandTriggered( (StorageApiWritePayload e) -> (long) e.getPayload().length) .withMaxBufferingDuration(triggeringFrequency)); } - groupedRecords.apply( - "StorageApiWriteSharded", - new StorageApiWritesShardedRecords<>( - dynamicDestinations, createDisposition, kmsKey, bqServices, destinationCoder)); + PCollectionTuple writeRecordsResult = + groupedRecords.apply( + "StorageApiWriteSharded", + new StorageApiWritesShardedRecords<>( + dynamicDestinations, + createDisposition, + kmsKey, + bqServices, + destinationCoder, + BigQueryStorageApiInsertErrorCoder.of(), + failedRowsTag)); + + PCollection insertErrors = + PCollectionList.of(convertMessagesResult.get(failedRowsTag)) + .and(writeRecordsResult.get(failedRowsTag)) + .apply("flattenErrors", Flatten.pCollections()); return WriteResult.in( - input.getPipeline(), - null, - null, - null, - null, - null, - failedRowsTag, - result.get(failedRowsTag)); + input.getPipeline(), null, null, null, null, null, failedRowsTag, insertErrors); } private PCollection, StorageApiWritePayload>> @@ -232,7 +242,7 @@ public WriteResult expandUntriggered( PCollection> inputInGlobalWindow = input.apply( "rewindowIntoGlobal", Window.>into(new GlobalWindows())); - PCollectionTuple 
convertedRecords = + PCollectionTuple convertMessagesResult = inputInGlobalWindow .apply( "CreateTables", @@ -247,20 +257,24 @@ public WriteResult expandUntriggered( successfulRowsTag, BigQueryStorageApiInsertErrorCoder.of(), successCoder)); - convertedRecords - .get(successfulRowsTag) - .apply( - "StorageApiWriteUnsharded", - new StorageApiWriteUnshardedRecords<>(dynamicDestinations, bqServices)); + + PCollectionTuple writeRecordsResult = + convertMessagesResult + .get(successfulRowsTag) + .apply( + "StorageApiWriteUnsharded", + new StorageApiWriteUnshardedRecords<>( + dynamicDestinations, + bqServices, + failedRowsTag, + BigQueryStorageApiInsertErrorCoder.of())); + + PCollection insertErrors = + PCollectionList.of(convertMessagesResult.get(failedRowsTag)) + .and(writeRecordsResult.get(failedRowsTag)) + .apply("flattenErrors", Flatten.pCollections()); return WriteResult.in( - input.getPipeline(), - null, - null, - null, - null, - null, - failedRowsTag, - convertedRecords.get(failedRowsTag)); + input.getPipeline(), null, null, null, null, null, failedRowsTag, insertErrors); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java index 35b3ddfd080a..190525925aec 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteRecordsInconsistent.java @@ -17,12 +17,14 @@ */ package org.apache.beam.sdk.io.gcp.bigquery; -import org.apache.beam.sdk.coders.VoidCoder; -import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.KV; 
import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; /** * A transform to write sharded records to BigQuery using the Storage API. This transform uses the @@ -32,34 +34,46 @@ */ @SuppressWarnings("FutureReturnValueIgnored") public class StorageApiWriteRecordsInconsistent - extends PTransform>, PCollection> { + extends PTransform>, PCollectionTuple> { private final StorageApiDynamicDestinations dynamicDestinations; private final BigQueryServices bqServices; + private final TupleTag failedRowsTag; + private final TupleTag> finalizeTag = new TupleTag<>("finalizeTag"); + private final Coder failedRowsCoder; public StorageApiWriteRecordsInconsistent( StorageApiDynamicDestinations dynamicDestinations, - BigQueryServices bqServices) { + BigQueryServices bqServices, + TupleTag failedRowsTag, + Coder failedRowsCoder) { this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; + this.failedRowsTag = failedRowsTag; + this.failedRowsCoder = failedRowsCoder; } @Override - public PCollection expand(PCollection> input) { + public PCollectionTuple expand(PCollection> input) { String operationName = input.getName() + "/" + getName(); BigQueryOptions bigQueryOptions = input.getPipeline().getOptions().as(BigQueryOptions.class); // Append records to the Storage API streams. 
- input.apply( - "Write Records", - ParDo.of( - new StorageApiWriteUnshardedRecords.WriteRecordsDoFn<>( - operationName, - dynamicDestinations, - bqServices, - true, - bigQueryOptions.getStorageApiAppendThresholdBytes(), - bigQueryOptions.getStorageApiAppendThresholdRecordCount(), - bigQueryOptions.getNumStorageWriteApiStreamAppendClients())) - .withSideInputs(dynamicDestinations.getSideInputs())); - return input.getPipeline().apply("voids", Create.empty(VoidCoder.of())); + PCollectionTuple result = + input.apply( + "Write Records", + ParDo.of( + new StorageApiWriteUnshardedRecords.WriteRecordsDoFn<>( + operationName, + dynamicDestinations, + bqServices, + true, + bigQueryOptions.getStorageApiAppendThresholdBytes(), + bigQueryOptions.getStorageApiAppendThresholdRecordCount(), + bigQueryOptions.getNumStorageWriteApiStreamAppendClients(), + finalizeTag, + failedRowsTag)) + .withOutputTags(finalizeTag, TupleTagList.of(failedRowsTag)) + .withSideInputs(dynamicDestinations.getSideInputs())); + result.get(failedRowsTag).setCoder(failedRowsCoder); + return result; } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java index 871fc73698af..0f86b8871f0e 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWriteUnshardedRecords.java @@ -20,26 +20,31 @@ import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.core.ApiFuture; +import com.google.api.core.ApiFutures; +import com.google.api.services.bigquery.model.TableRow; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; +import 
com.google.cloud.bigquery.storage.v1.Exceptions; import com.google.cloud.bigquery.storage.v1.ProtoRows; import com.google.cloud.bigquery.storage.v1.WriteStream.Type; import com.google.protobuf.ByteString; import com.google.protobuf.DynamicMessage; +import com.google.protobuf.InvalidProtocolBufferException; import java.io.IOException; import java.time.Instant; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; -import java.util.stream.StreamSupport; +import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.KvCoder; import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.StreamAppendClient; -import org.apache.beam.sdk.io.gcp.bigquery.RetryManager.Operation.Context; import org.apache.beam.sdk.io.gcp.bigquery.RetryManager.RetryType; import org.apache.beam.sdk.io.gcp.bigquery.StorageApiDynamicDestinations.DescriptorWrapper; import org.apache.beam.sdk.io.gcp.bigquery.StorageApiDynamicDestinations.MessageConverter; @@ -51,14 +56,18 @@ import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Reshuffle; -import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.GlobalWindow; import org.apache.beam.sdk.util.Preconditions; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings; import 
org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.Cache; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.CacheBuilder; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.RemovalNotification; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps; import org.checkerframework.checker.nullness.qual.NonNull; @@ -75,11 +84,14 @@ */ @SuppressWarnings({"FutureReturnValueIgnored"}) public class StorageApiWriteUnshardedRecords - extends PTransform>, PCollection> { + extends PTransform>, PCollectionTuple> { private static final Logger LOG = LoggerFactory.getLogger(StorageApiWriteUnshardedRecords.class); private final StorageApiDynamicDestinations dynamicDestinations; private final BigQueryServices bqServices; + private final TupleTag failedRowsTag; + private final TupleTag> finalizeTag = new TupleTag<>("finalizeTag"); + private final Coder failedRowsCoder; private static final ExecutorService closeWriterExecutor = Executors.newCachedThreadPool(); /** @@ -87,6 +99,8 @@ public class StorageApiWriteUnshardedRecords * StreamAppendClient after looking up the cache, and we must ensure that the cache is not * accessed in between the lookup and the pin (any access of the cache could trigger element * expiration). Therefore most used of APPEND_CLIENTS should synchronize. + * + *

    TODO(reuvenlax); Once all uses of StreamWriter are using */ private static final Cache APPEND_CLIENTS = CacheBuilder.newBuilder() @@ -122,20 +136,24 @@ private static void runAsyncIgnoreFailure(ExecutorService executor, ThrowingRunn public StorageApiWriteUnshardedRecords( StorageApiDynamicDestinations dynamicDestinations, - BigQueryServices bqServices) { + BigQueryServices bqServices, + TupleTag failedRowsTag, + Coder failedRowsCoder) { this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; + this.failedRowsTag = failedRowsTag; + this.failedRowsCoder = failedRowsCoder; } @Override - public PCollection expand(PCollection> input) { + public PCollectionTuple expand(PCollection> input) { String operationName = input.getName() + "/" + getName(); BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class); org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument( !options.getUseStorageApiConnectionPool(), "useStorageApiConnectionPool only supported " + "when using STORAGE_API_AT_LEAST_ONCE"); - return input - .apply( + PCollectionTuple writeResults = + input.apply( "Write Records", ParDo.of( new WriteRecordsDoFn<>( @@ -145,19 +163,39 @@ public PCollection expand(PCollection extends DoFn, KV> { private final Counter forcedFlushes = Metrics.counter(WriteRecordsDoFn.class, "forcedFlushes"); + private final TupleTag> finalizeTag; + private final TupleTag failedRowsTag; + + static class AppendRowsContext extends RetryManager.Operation.Context { + long offset; + ProtoRows protoRows; + + public AppendRowsContext(long offset, ProtoRows protoRows) { + this.offset = offset; + this.protoRows = protoRows; + } + } class DestinationState { private final String tableUrn; @@ -175,11 +213,17 @@ class DestinationState { Metrics.counter(WriteRecordsDoFn.class, "schemaMismatches"); private final Distribution inflightWaitSecondsDistribution = Metrics.distribution(WriteRecordsDoFn.class, 
"streamWriterWaitSeconds"); + private final Counter rowsSentToFailedRowsCollection = + Metrics.counter( + StorageApiWritesShardedRecords.WriteRecordsDoFn.class, + "rowsSentToFailedRowsCollection"); + private final boolean useDefaultStream; private DescriptorWrapper descriptorWrapper; private Instant nextCacheTickle = Instant.MAX; private final int clientNumber; private final boolean usingMultiplexing; + private final long maxRequestSize; public DestinationState( String tableUrn, @@ -187,7 +231,8 @@ public DestinationState( DatasetService datasetService, boolean useDefaultStream, int streamAppendClientCount, - BigQueryOptions bigQueryOptions) { + boolean usingMultiplexing, + long maxRequestSize) { this.tableUrn = tableUrn; this.messageConverter = messageConverter; this.pendingMessages = Lists.newArrayList(); @@ -195,7 +240,8 @@ public DestinationState( this.useDefaultStream = useDefaultStream; this.descriptorWrapper = messageConverter.getSchemaDescriptor(); this.clientNumber = new Random().nextInt(streamAppendClientCount); - this.usingMultiplexing = bigQueryOptions.getUseStorageApiConnectionPool(); + this.usingMultiplexing = usingMultiplexing; + this.maxRequestSize = maxRequestSize; } void teardown() { @@ -217,7 +263,7 @@ String getStreamAppendClientCacheEntryKey() { return this.streamName; } - String createStreamIfNeeded() { + String getOrCreateStreamName() { try { if (!useDefaultStream) { this.streamName = @@ -242,7 +288,7 @@ StreamAppendClient generateClient() throws Exception { StreamAppendClient getStreamAppendClient(boolean lookupCache) { try { if (this.streamAppendClient == null) { - createStreamIfNeeded(); + getOrCreateStreamName(); final StreamAppendClient newStreamAppendClient; synchronized (APPEND_CLIENTS) { if (lookupCache) { @@ -313,7 +359,8 @@ void addMessage(StorageApiWritePayload payload) throws Exception { invalidateWriteStream(); if (useDefaultStream) { // Since the default stream client is shared across many bundles and threads, we can't - // 
simply look it upfrom the cache, as another thread may have recreated it with the old + // simply look it up from the cache, as another thread may have recreated it with the + // old // schema. getStreamAppendClient(false); } @@ -328,29 +375,62 @@ void addMessage(StorageApiWritePayload payload) throws Exception { pendingMessages.add(ByteString.copyFrom(payload.getPayload())); } - void flush(RetryManager> retryManager) + long flush( + RetryManager retryManager, + OutputReceiver failedRowsReceiver) throws Exception { if (pendingMessages.isEmpty()) { - return; + return 0; } - final ProtoRows.Builder inserts = ProtoRows.newBuilder(); - inserts.addAllSerializedRows(pendingMessages); - ProtoRows protoRows = inserts.build(); + final ProtoRows.Builder insertsBuilder = ProtoRows.newBuilder(); + insertsBuilder.addAllSerializedRows(pendingMessages); + final ProtoRows inserts = insertsBuilder.build(); pendingMessages.clear(); + // Handle the case where the request is too large. + if (inserts.getSerializedSize() >= maxRequestSize) { + if (inserts.getSerializedRowsCount() > 1) { + // TODO(reuvenlax): Is it worth trying to handle this case by splitting the protoRows? + // Given that we split + // the ProtoRows iterable at 2MB and the max request size is 10MB, this scenario seems + // nearly impossible. + LOG.error( + "A request containing more than one row is over the request size limit of " + + maxRequestSize + + ". This is unexpected. All rows in the request will be sent to the failed-rows PCollection."); + } + for (ByteString rowBytes : inserts.getSerializedRowsList()) { + TableRow failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom(descriptorWrapper.descriptor, rowBytes)); + failedRowsReceiver.output( + new BigQueryStorageApiInsertError( + failedRow, "Row payload too large. 
Maximum size " + maxRequestSize)); + } + return 0; + } + + long offset = -1; + if (!this.useDefaultStream) { + offset = this.currentOffset; + this.currentOffset += inserts.getSerializedRowsCount(); + } + AppendRowsContext appendRowsContext = new AppendRowsContext(offset, inserts); + retryManager.addOperation( c -> { + if (c.protoRows.getSerializedRowsCount() == 0) { + // This might happen if all rows in a batch failed and were sent to the failed-rows + // PCollection. + return ApiFutures.immediateFuture(AppendRowsResponse.newBuilder().build()); + } try { StreamAppendClient writeStream = getStreamAppendClient(true); - long offset = -1; - if (!this.useDefaultStream) { - offset = this.currentOffset; - this.currentOffset += inserts.getSerializedRowsCount(); - } - ApiFuture response = writeStream.appendRows(offset, protoRows); + ApiFuture response = + writeStream.appendRows(c.offset, c.protoRows); + inflightWaitSecondsDistribution.update(writeStream.getInflightWaitSeconds()); if (!usingMultiplexing) { - inflightWaitSecondsDistribution.update(writeStream.getInflightWaitSeconds()); if (writeStream.getInflightWaitSeconds() > 5) { LOG.warn( "Storage Api write delay more than {} seconds.", @@ -363,33 +443,78 @@ void flush(RetryManager> retryMa } }, contexts -> { + AppendRowsContext failedContext = + Preconditions.checkStateNotNull(Iterables.getFirst(contexts, null)); + if (failedContext.getError() != null + && failedContext.getError() instanceof Exceptions.AppendSerializtionError) { + Exceptions.AppendSerializtionError error = + Preconditions.checkStateNotNull( + (Exceptions.AppendSerializtionError) failedContext.getError()); + Set failedRowIndices = error.getRowIndexToErrorMessage().keySet(); + for (int failedIndex : failedRowIndices) { + // Convert the message to a TableRow and send it to the failedRows collection. 
+ ByteString protoBytes = failedContext.protoRows.getSerializedRows(failedIndex); + try { + TableRow failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom(descriptorWrapper.descriptor, protoBytes)); + new BigQueryStorageApiInsertError( + failedRow, error.getRowIndexToErrorMessage().get(failedIndex)); + failedRowsReceiver.output( + new BigQueryStorageApiInsertError( + failedRow, error.getRowIndexToErrorMessage().get(failedIndex))); + } catch (InvalidProtocolBufferException e) { + LOG.error("Failed to insert row and could not parse the result!"); + } + } + rowsSentToFailedRowsCollection.inc(failedRowIndices.size()); + + // Remove the failed row from the payload, so we retry the batch without the failed + // rows. + ProtoRows.Builder retryRows = ProtoRows.newBuilder(); + for (int i = 0; i < failedContext.protoRows.getSerializedRowsCount(); ++i) { + if (!failedRowIndices.contains(i)) { + ByteString rowBytes = failedContext.protoRows.getSerializedRows(i); + retryRows.addSerializedRows(rowBytes); + } + } + failedContext.protoRows = retryRows.build(); + + // Since we removed rows, we need to update the insert offsets for all remaining + // rows. + long newOffset = failedContext.offset; + for (AppendRowsContext context : contexts) { + context.offset = newOffset; + newOffset += context.protoRows.getSerializedRowsCount(); + } + this.currentOffset = newOffset; + return RetryType.RETRY_ALL_OPERATIONS; + } + LOG.warn( "Append to stream {} by client #{} failed with error, operations will be retried. 
Details: {}", streamName, clientNumber, - retrieveErrorDetails(contexts)); + retrieveErrorDetails(failedContext)); invalidateWriteStream(); appendFailures.inc(); return RetryType.RETRY_ALL_OPERATIONS; }, - response -> { - recordsAppended.inc(protoRows.getSerializedRowsCount()); + c -> { + recordsAppended.inc(c.protoRows.getSerializedRowsCount()); }, - new Context<>()); + appendRowsContext); maybeTickleCache(); + return inserts.getSerializedRowsCount(); } - String retrieveErrorDetails(Iterable> contexts) { - return StreamSupport.stream(contexts.spliterator(), false) - .<@Nullable Throwable>map(ctx -> ctx.getError()) - .map( - err -> - (err == null) - ? "no error" - : Lists.newArrayList(err.getStackTrace()).stream() - .map(se -> se.toString()) - .collect(Collectors.joining("\n"))) - .collect(Collectors.joining(",")); + String retrieveErrorDetails(AppendRowsContext failedContext) { + return (failedContext.getError() != null) + ? Arrays.stream( + Preconditions.checkStateNotNull(failedContext.getError()).getStackTrace()) + .map(StackTraceElement::toString) + .collect(Collectors.joining("\n")) + : "no execption"; } } @@ -412,7 +537,9 @@ String retrieveErrorDetails(Iterable> contexts) { boolean useDefaultStream, int flushThresholdBytes, int flushThresholdCount, - int streamAppendClientCount) { + int streamAppendClientCount, + TupleTag> finalizeTag, + TupleTag failedRowsTag) { this.messageConverters = new TwoLevelMessageConverterCache<>(operationName); this.dynamicDestinations = dynamicDestinations; this.bqServices = bqServices; @@ -420,31 +547,47 @@ String retrieveErrorDetails(Iterable> contexts) { this.flushThresholdBytes = flushThresholdBytes; this.flushThresholdCount = flushThresholdCount; this.streamAppendClientCount = streamAppendClientCount; + this.finalizeTag = finalizeTag; + this.failedRowsTag = failedRowsTag; } boolean shouldFlush() { return numPendingRecords > flushThresholdCount || numPendingRecordBytes > flushThresholdBytes; } - void flushIfNecessary() throws 
Exception { + void flushIfNecessary(OutputReceiver failedRowsReceiver) + throws Exception { if (shouldFlush()) { forcedFlushes.inc(); // Too much memory being used. Flush the state and wait for it to drain out. // TODO(reuvenlax): Consider waiting for memory usage to drop instead of waiting for all the // appends to finish. - flushAll(); + flushAll(failedRowsReceiver); } } - void flushAll() throws Exception { - RetryManager> - retryManager = - new RetryManager<>(Duration.standardSeconds(1), Duration.standardSeconds(10), 1000); - Preconditions.checkStateNotNull(destinations); - for (DestinationState destinationState : destinations.values()) { - destinationState.flush(retryManager); + void flushAll(OutputReceiver failedRowsReceiver) + throws Exception { + List> retryManagers = + Lists.newArrayListWithCapacity(Preconditions.checkStateNotNull(destinations).size()); + long numRowsWritten = 0; + for (DestinationState destinationState : + Preconditions.checkStateNotNull(destinations).values()) { + RetryManager retryManager = + new RetryManager<>(Duration.standardSeconds(1), Duration.standardSeconds(10), 1000); + retryManagers.add(retryManager); + numRowsWritten += destinationState.flush(retryManager, failedRowsReceiver); + retryManager.run(false); + } + if (numRowsWritten > 0) { + // TODO(reuvenlax): Can we await in parallel instead? Failure retries aren't triggered until + // await is called, so + // this approach means that if one call fais, it has to wait for all prior calls to complete + // before a retry happens. 
+ for (RetryManager retryManager : retryManagers) { + retryManager.await(); + } } - retryManager.run(true); numPendingRecords = 0; numPendingRecordBytes = 0; } @@ -488,14 +631,16 @@ DestinationState createDestinationState( datasetService, useDefaultStream, streamAppendClientCount, - bigQueryOptions); + bigQueryOptions.getUseStorageApiConnectionPool(), + bigQueryOptions.getStorageWriteApiMaxRequestSize()); } @ProcessElement public void process( ProcessContext c, PipelineOptions pipelineOptions, - @Element KV element) + @Element KV element, + MultiOutputReceiver o) throws Exception { DatasetService initializedDatasetService = initializeDatasetService(pipelineOptions); dynamicDestinations.setSideInputAccessorFromProcessContext(c); @@ -506,7 +651,7 @@ public void process( k -> createDestinationState( c, k, initializedDatasetService, pipelineOptions.as(BigQueryOptions.class))); - flushIfNecessary(); + flushIfNecessary(o.get(failedRowsTag)); state.addMessage(element.getValue()); ++numPendingRecords; numPendingRecordBytes += element.getValue().getPayload().length; @@ -514,14 +659,28 @@ public void process( @FinishBundle public void finishBundle(FinishBundleContext context) throws Exception { - flushAll(); + flushAll( + new OutputReceiver() { + @Override + public void output(BigQueryStorageApiInsertError output) { + outputWithTimestamp(output, GlobalWindow.INSTANCE.maxTimestamp()); + } + + @Override + public void outputWithTimestamp( + BigQueryStorageApiInsertError output, org.joda.time.Instant timestamp) { + context.output(failedRowsTag, output, timestamp, GlobalWindow.INSTANCE); + } + }); + final Map destinations = Preconditions.checkStateNotNull(this.destinations); for (DestinationState state : destinations.values()) { - if (!useDefaultStream) { + if (!useDefaultStream && !Strings.isNullOrEmpty(state.streamName)) { context.output( + finalizeTag, KV.of(state.tableUrn, state.streamName), - BoundedWindow.TIMESTAMP_MAX_VALUE.minus(Duration.millis(1)), + 
GlobalWindow.INSTANCE.maxTimestamp(), GlobalWindow.INSTANCE); } state.teardown(); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java index c8bb805b6e8f..af0ae5169bc9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiWritesShardedRecords.java @@ -20,16 +20,23 @@ import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; import com.google.api.core.ApiFuture; +import com.google.api.core.ApiFutures; +import com.google.api.services.bigquery.model.TableRow; import com.google.cloud.bigquery.storage.v1.AppendRowsResponse; +import com.google.cloud.bigquery.storage.v1.Exceptions; import com.google.cloud.bigquery.storage.v1.Exceptions.StreamFinalizedException; import com.google.cloud.bigquery.storage.v1.ProtoRows; import com.google.cloud.bigquery.storage.v1.WriteStream.Type; +import com.google.protobuf.ByteString; +import com.google.protobuf.DynamicMessage; +import com.google.protobuf.InvalidProtocolBufferException; import io.grpc.Status; import io.grpc.Status.Code; import java.io.IOException; import java.time.Instant; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -74,6 +81,9 @@ import org.apache.beam.sdk.util.ShardedKey; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.PCollectionTuple; +import org.apache.beam.sdk.values.TupleTag; +import org.apache.beam.sdk.values.TupleTagList; import org.apache.beam.sdk.values.TypeDescriptor; 
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.MoreObjects; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings; @@ -99,7 +109,7 @@ public class StorageApiWritesShardedRecords extends PTransform< PCollection, Iterable>>, - PCollection> { + PCollectionTuple> { private static final Logger LOG = LoggerFactory.getLogger(StorageApiWritesShardedRecords.class); private static final Duration DEFAULT_STREAM_IDLE_TIME = Duration.standardHours(1); @@ -108,7 +118,10 @@ public class StorageApiWritesShardedRecords destinationCoder; + private final Coder failedRowsCoder; private final Duration streamIdleTime = DEFAULT_STREAM_IDLE_TIME; + private final TupleTag failedRowsTag; + private final TupleTag> flushTag = new TupleTag<>("flushTag"); private static final ExecutorService closeWriterExecutor = Executors.newCachedThreadPool(); private static final Cache APPEND_CLIENTS = @@ -147,24 +160,29 @@ public StorageApiWritesShardedRecords( CreateDisposition createDisposition, String kmsKey, BigQueryServices bqServices, - Coder destinationCoder) { + Coder destinationCoder, + Coder failedRowsCoder, + TupleTag failedRowsTag) { this.dynamicDestinations = dynamicDestinations; this.createDisposition = createDisposition; this.kmsKey = kmsKey; this.bqServices = bqServices; this.destinationCoder = destinationCoder; + this.failedRowsCoder = failedRowsCoder; + this.failedRowsTag = failedRowsTag; } @Override - public PCollection expand( + public PCollectionTuple expand( PCollection, Iterable>> input) { String operationName = input.getName() + "/" + getName(); // Append records to the Storage API streams. 
- PCollection> written = + PCollectionTuple writeRecordsResult = input.apply( "Write Records", ParDo.of(new WriteRecordsDoFn(operationName, streamIdleTime)) - .withSideInputs(dynamicDestinations.getSideInputs())); + .withSideInputs(dynamicDestinations.getSideInputs()) + .withOutputTags(flushTag, TupleTagList.of(failedRowsTag))); SchemaCoder operationCoder; try { @@ -180,7 +198,8 @@ public PCollection expand( } // Send all successful writes to be flushed. - return written + writeRecordsResult + .get(flushTag) .setCoder(KvCoder.of(StringUtf8Coder.of(), operationCoder)) .apply( Window.>configure() @@ -192,6 +211,8 @@ public PCollection expand( .apply("maxFlushPosition", Combine.perKey(Max.naturalOrder(new Operation(-1, false)))) .apply( "Flush and finalize writes", ParDo.of(new StorageApiFlushAndFinalizeDoFn(bqServices))); + writeRecordsResult.get(failedRowsTag).setCoder(failedRowsCoder); + return writeRecordsResult; } class WriteRecordsDoFn @@ -215,6 +236,8 @@ class WriteRecordsDoFn Metrics.distribution(WriteRecordsDoFn.class, "appendSizeDistribution"); private final Distribution appendSplitDistribution = Metrics.distribution(WriteRecordsDoFn.class, "appendSplitDistribution"); + private final Counter rowsSentToFailedRowsCollection = + Metrics.counter(WriteRecordsDoFn.class, "rowsSentToFailedRowsCollection"); private TwoLevelMessageConverterCache messageConverters; @@ -297,8 +320,10 @@ public void process( final @AlwaysFetched @StateId("streamName") ValueState streamName, final @AlwaysFetched @StateId("streamOffset") ValueState streamOffset, @TimerId("idleTimer") Timer idleTimer, - final OutputReceiver> o) + final MultiOutputReceiver o) throws Exception { + BigQueryOptions bigQueryOptions = pipelineOptions.as(BigQueryOptions.class); + dynamicDestinations.setSideInputAccessorFromProcessContext(c); TableDestination tableDestination = destinations.computeIfAbsent( @@ -323,7 +348,7 @@ public void process( // Each ProtoRows object contains at most 1MB of rows. 
// TODO: Push messageFromTableRow up to top level. That we we cans skip TableRow entirely if // already proto or already schema. - final long oneMb = 1024 * 1024; + final long splitSize = bigQueryOptions.getStorageApiAppendThresholdBytes(); // Called if the schema does not match. Function updateSchemaHash = (Long expectedHash) -> { @@ -343,7 +368,7 @@ public void process( } }; Iterable messages = - new SplittingIterable(element.getValue(), oneMb, descriptor.get(), updateSchemaHash); + new SplittingIterable(element.getValue(), splitSize, descriptor.get(), updateSchemaHash); class AppendRowsContext extends RetryManager.Operation.Context { final ShardedKey key; @@ -352,9 +377,11 @@ class AppendRowsContext extends RetryManager.Operation.Context key) { + AppendRowsContext(ShardedKey key, ProtoRows protoRows) { this.key = key; + this.protoRows = protoRows; } @Override @@ -396,7 +423,7 @@ public String toString() { context.client = appendClient; context.offset = streamOffset.read(); ++context.tryIteration; - streamOffset.write(context.offset + context.numRows); + streamOffset.write(context.offset + context.protoRows.getSerializedRowsCount()); } } catch (Exception e) { throw new RuntimeException(e); @@ -415,114 +442,200 @@ public String toString() { } }; - Instant now = Instant.now(); - List contexts = Lists.newArrayList(); - RetryManager retryManager = - new RetryManager<>(Duration.standardSeconds(1), Duration.standardSeconds(10), 1000); - int numSplits = 0; - for (ProtoRows protoRows : messages) { - ++numSplits; - Function> run = - context -> { - try { - StreamAppendClient appendClient = - APPEND_CLIENTS.get( - context.streamName, - () -> - datasetService.getStreamAppendClient( - context.streamName, descriptor.get().descriptor, false)); - return appendClient.appendRows(context.offset, protoRows); - } catch (Exception e) { - throw new RuntimeException(e); + Function> runOperation = + context -> { + if (context.protoRows.getSerializedRowsCount() == 0) { + // This might 
happen if all rows in a batch failed and were sent to the failed-rows + // PCollection. + return ApiFutures.immediateFuture(AppendRowsResponse.newBuilder().build()); + } + try { + StreamAppendClient appendClient = + APPEND_CLIENTS.get( + context.streamName, + () -> + datasetService.getStreamAppendClient( + context.streamName, descriptor.get().descriptor, false)); + return appendClient.appendRows(context.offset, context.protoRows); + } catch (Exception e) { + throw new RuntimeException(e); + } + }; + + Function, RetryType> onError = + failedContexts -> { + // The first context is always the one that fails. + AppendRowsContext failedContext = + Preconditions.checkStateNotNull(Iterables.getFirst(failedContexts, null)); + + // AppendSerializationError means that BigQuery detected errors on individual rows, e.g. + // a row not conforming + // to bigQuery invariants. These errors are persistent, so we redirect those rows to the + // failedInserts + // PCollection, and retry with the remaining rows. + if (failedContext.getError() != null + && failedContext.getError() instanceof Exceptions.AppendSerializtionError) { + Exceptions.AppendSerializtionError error = + Preconditions.checkArgumentNotNull( + (Exceptions.AppendSerializtionError) failedContext.getError()); + Set failedRowIndices = error.getRowIndexToErrorMessage().keySet(); + for (int failedIndex : failedRowIndices) { + // Convert the message to a TableRow and send it to the failedRows collection. 
+ ByteString protoBytes = failedContext.protoRows.getSerializedRows(failedIndex); + try { + TableRow failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom(descriptor.get().descriptor, protoBytes)); + new BigQueryStorageApiInsertError( + failedRow, error.getRowIndexToErrorMessage().get(failedIndex)); + o.get(failedRowsTag) + .output( + new BigQueryStorageApiInsertError( + failedRow, error.getRowIndexToErrorMessage().get(failedIndex))); + } catch (InvalidProtocolBufferException e) { + LOG.error("Failed to insert row and could not parse the result!"); + } } - }; - - // RetryManager - Function, RetryType> onError = - failedContexts -> { - // The first context is always the one that fails. - AppendRowsContext failedContext = - Preconditions.checkStateNotNull(Iterables.getFirst(failedContexts, null)); - // Invalidate the StreamWriter and force a new one to be created. - LOG.error( - "Got error " + failedContext.getError() + " closing " + failedContext.streamName); - clearClients.accept(contexts); - appendFailures.inc(); - - boolean explicitStreamFinalized = - failedContext.getError() instanceof StreamFinalizedException; - Throwable error = Preconditions.checkStateNotNull(failedContext.getError()); - Status.Code statusCode = Status.fromThrowable(error).getCode(); - // This means that the offset we have stored does not match the current end of - // the stream in the Storage API. Usually this happens because a crash or a bundle - // failure - // happened after an append but before the worker could checkpoint it's - // state. The records that were appended in a failed bundle will be retried, - // meaning that the unflushed tail of the stream must be discarded to prevent - // duplicates. - boolean offsetMismatch = - statusCode.equals(Code.OUT_OF_RANGE) || statusCode.equals(Code.ALREADY_EXISTS); - // This implies that the stream doesn't exist or has already been finalized. In this - // case we have no choice but to create a new stream. 
- boolean streamDoesNotExist = - explicitStreamFinalized - || statusCode.equals(Code.INVALID_ARGUMENT) - || statusCode.equals(Code.NOT_FOUND) - || statusCode.equals(Code.FAILED_PRECONDITION); - if (offsetMismatch || streamDoesNotExist) { - appendOffsetFailures.inc(); - LOG.warn( - "Append to " - + failedContext - + " failed with " - + failedContext.getError() - + " Will retry with a new stream"); - // Finalize the stream and clear streamName so a new stream will be created. - o.output( - KV.of(failedContext.streamName, new Operation(failedContext.offset - 1, true))); - // Reinitialize all contexts with the new stream and new offsets. - initializeContexts.accept(failedContexts, true); - - // Offset failures imply that all subsequent parallel appends will also fail. - // Retry them all. - return RetryType.RETRY_ALL_OPERATIONS; + rowsSentToFailedRowsCollection.inc(failedRowIndices.size()); + + // Remove the failed row from the payload, so we retry the batch without the failed + // rows. + ProtoRows.Builder retryRows = ProtoRows.newBuilder(); + for (int i = 0; i < failedContext.protoRows.getSerializedRowsCount(); ++i) { + if (!failedRowIndices.contains(i)) { + ByteString rowBytes = failedContext.protoRows.getSerializedRows(i); + retryRows.addSerializedRows(rowBytes); + } } + failedContext.protoRows = retryRows.build(); + // Since we removed rows, we need to update the insert offsets for all remaining rows. 
+ long offset = failedContext.offset; + for (AppendRowsContext context : failedContexts) { + context.offset = offset; + offset += context.protoRows.getSerializedRowsCount(); + } + streamOffset.write(offset); return RetryType.RETRY_ALL_OPERATIONS; - }; + } - Consumer onSuccess = - context -> { - o.output( - KV.of( - context.streamName, - new Operation(context.offset + context.numRows - 1, false))); - flushesScheduled.inc(protoRows.getSerializedRowsCount()); - }; - - AppendRowsContext context = new AppendRowsContext(element.getKey()); - context.numRows = protoRows.getSerializedRowsCount(); - contexts.add(context); - retryManager.addOperation(run, onError, onSuccess, context); - recordsAppended.inc(protoRows.getSerializedRowsCount()); - appendSizeDistribution.update(context.numRows); - } - initializeContexts.accept(contexts, false); + // Invalidate the StreamWriter and force a new one to be created. + LOG.error( + "Got error " + failedContext.getError() + " closing " + failedContext.streamName); + clearClients.accept(failedContexts); + appendFailures.inc(); + + boolean explicitStreamFinalized = + failedContext.getError() instanceof StreamFinalizedException; + Throwable error = Preconditions.checkStateNotNull(failedContext.getError()); + Status.Code statusCode = Status.fromThrowable(error).getCode(); + // This means that the offset we have stored does not match the current end of + // the stream in the Storage API. Usually this happens because a crash or a bundle + // failure + // happened after an append but before the worker could checkpoint it's + // state. The records that were appended in a failed bundle will be retried, + // meaning that the unflushed tail of the stream must be discarded to prevent + // duplicates. + boolean offsetMismatch = + statusCode.equals(Code.OUT_OF_RANGE) || statusCode.equals(Code.ALREADY_EXISTS); + // This implies that the stream doesn't exist or has already been finalized. In this + // case we have no choice but to create a new stream. 
+ boolean streamDoesNotExist = + explicitStreamFinalized + || statusCode.equals(Code.INVALID_ARGUMENT) + || statusCode.equals(Code.NOT_FOUND) + || statusCode.equals(Code.FAILED_PRECONDITION); + if (offsetMismatch || streamDoesNotExist) { + appendOffsetFailures.inc(); + LOG.warn( + "Append to " + + failedContext + + " failed with " + + failedContext.getError() + + " Will retry with a new stream"); + // Finalize the stream and clear streamName so a new stream will be created. + o.get(flushTag) + .output( + KV.of( + failedContext.streamName, new Operation(failedContext.offset - 1, true))); + // Reinitialize all contexts with the new stream and new offsets. + initializeContexts.accept(failedContexts, true); + + // Offset failures imply that all subsequent parallel appends will also fail. + // Retry them all. + return RetryType.RETRY_ALL_OPERATIONS; + } - try { - retryManager.run(true); - } finally { - // Make sure that all pins are removed. - for (AppendRowsContext context : contexts) { - if (context.client != null) { - runAsyncIgnoreFailure(closeWriterExecutor, context.client::unpin); + return RetryType.RETRY_ALL_OPERATIONS; + }; + + Consumer onSuccess = + context -> { + o.get(flushTag) + .output( + KV.of( + context.streamName, + new Operation( + context.offset + context.protoRows.getSerializedRowsCount() - 1, + false))); + flushesScheduled.inc(context.protoRows.getSerializedRowsCount()); + }; + long maxRequestSize = bigQueryOptions.getStorageWriteApiMaxRequestSize(); + Instant now = Instant.now(); + List contexts = Lists.newArrayList(); + RetryManager retryManager = + new RetryManager<>(Duration.standardSeconds(1), Duration.standardSeconds(10), 1000); + int numAppends = 0; + for (ProtoRows protoRows : messages) { + // Handle the case of a row that is too large. + if (protoRows.getSerializedSize() >= maxRequestSize) { + if (protoRows.getSerializedRowsCount() > 1) { + // TODO(reuvenlax): Is it worth trying to handle this case by splitting the protoRows? 
+ // Given that we split + // the ProtoRows iterable at 2MB and the max request size is 10MB, this scenario seems + // nearly impossible. + LOG.error( + "A request containing more than one row is over the request size limit of " + + maxRequestSize + + ". This is unexpected. All rows in the request will be sent to the failed-rows PCollection."); + } + for (ByteString rowBytes : protoRows.getSerializedRowsList()) { + TableRow failedRow = + TableRowToStorageApiProto.tableRowFromMessage( + DynamicMessage.parseFrom(descriptor.get().descriptor, rowBytes)); + o.get(failedRowsTag) + .output( + new BigQueryStorageApiInsertError( + failedRow, "Row payload too large. Maximum size " + maxRequestSize)); } + } else { + ++numAppends; + // RetryManager + AppendRowsContext context = new AppendRowsContext(element.getKey(), protoRows); + contexts.add(context); + retryManager.addOperation(runOperation, onError, onSuccess, context); + recordsAppended.inc(protoRows.getSerializedRowsCount()); + appendSizeDistribution.update(context.protoRows.getSerializedRowsCount()); } } - appendSplitDistribution.update(numSplits); - java.time.Duration timeElapsed = java.time.Duration.between(now, Instant.now()); - appendLatencyDistribution.update(timeElapsed.toMillis()); + if (numAppends > 0) { + initializeContexts.accept(contexts, false); + try { + retryManager.run(true); + } finally { + // Make sure that all pins are removed. 
+ for (AppendRowsContext context : contexts) { + if (context.client != null) { + runAsyncIgnoreFailure(closeWriterExecutor, context.client::unpin); + } + } + } + appendSplitDistribution.update(numAppends); + + java.time.Duration timeElapsed = java.time.Duration.between(now, Instant.now()); + appendLatencyDistribution.update(timeElapsed.toMillis()); + } idleTimer.offset(streamIdleTime).withNoOutputTimestamp().setRelative(); } @@ -530,15 +643,16 @@ public String toString() { private void finalizeStream( @AlwaysFetched @StateId("streamName") ValueState streamName, @AlwaysFetched @StateId("streamOffset") ValueState streamOffset, - OutputReceiver> o, + MultiOutputReceiver o, org.joda.time.Instant finalizeElementTs) { String stream = MoreObjects.firstNonNull(streamName.read(), ""); if (!Strings.isNullOrEmpty(stream)) { // Finalize the stream long nextOffset = MoreObjects.firstNonNull(streamOffset.read(), 0L); - o.outputWithTimestamp( - KV.of(stream, new Operation(nextOffset - 1, true)), finalizeElementTs); + o.get(flushTag) + .outputWithTimestamp( + KV.of(stream, new Operation(nextOffset - 1, true)), finalizeElementTs); streamName.clear(); streamOffset.clear(); // Make sure that the stream object is closed. @@ -550,7 +664,7 @@ private void finalizeStream( public void onTimer( @AlwaysFetched @StateId("streamName") ValueState streamName, @AlwaysFetched @StateId("streamOffset") ValueState streamOffset, - OutputReceiver> o, + MultiOutputReceiver o, BoundedWindow window) { // Stream is idle - clear it. // Note: this is best effort. We are explicitly emiting a timestamp that is before @@ -566,7 +680,7 @@ public void onTimer( public void onWindowExpiration( @AlwaysFetched @StateId("streamName") ValueState streamName, @AlwaysFetched @StateId("streamOffset") ValueState streamOffset, - OutputReceiver> o, + MultiOutputReceiver o, BoundedWindow window) { // Window is done - usually because the pipeline has been drained. Make sure to clean up // streams so that they are not leaked. 
diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java index 06b7c5292fd5..6a7dc725fbc3 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java @@ -247,7 +247,7 @@ public class DatastoreV1 { /** * When choosing the number of updates in a single RPC, do not go below this value. The actual * number of entities per request may be lower when we flush for the end of a bundle or if we hit - * {@link DatastoreV1.DATASTORE_BATCH_UPDATE_BYTES_LIMIT}. + * {@link #DATASTORE_BATCH_UPDATE_BYTES_LIMIT}. */ @VisibleForTesting static final int DATASTORE_BATCH_UPDATE_ENTITIES_MIN = 5; @@ -398,7 +398,7 @@ private static long queryLatestStatisticsTimestamp( throw new NoSuchElementException("Datastore total statistics unavailable"); } Entity entity = batch.getEntityResults(0).getEntity(); - return entity.getProperties().get("timestamp").getTimestampValue().getSeconds() * 1000000; + return entity.getPropertiesOrThrow("timestamp").getTimestampValue().getSeconds() * 1000000; } /** @@ -451,7 +451,7 @@ static long getEstimatedSizeBytes( throws DatastoreException { String ourKind = query.getKind(0).getName(); Entity entity = getLatestTableStats(ourKind, namespace, datastore, readTime); - return entity.getProperties().get("entity_bytes").getIntegerValue(); + return entity.getPropertiesOrThrow("entity_bytes").getIntegerValue(); } private static PartitionId.Builder forNamespace(@Nullable String namespace) { @@ -684,7 +684,7 @@ public long getNumEntities( options, v1Options.getProjectId(), v1Options.getLocalhost()); Entity entity = getLatestTableStats(ourKind, namespace, datastore, getReadTime()); - return entity.getProperties().get("count").getIntegerValue(); + 
return entity.getPropertiesOrThrow("count").getIntegerValue(); } catch (Exception e) { return -1; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/BigqueryClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/BigqueryClient.java index 6224729aa91a..f5752797acd6 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/BigqueryClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/BigqueryClient.java @@ -288,7 +288,8 @@ private QueryResponse getTypedTableRows(QueryResponse response) { /** Performs a query without flattening results. */ @Nonnull - public List queryUnflattened(String query, String projectId, boolean typed) + public List queryUnflattened( + String query, String projectId, boolean typed, boolean useStandardSql) throws IOException, InterruptedException { Random rnd = new Random(System.currentTimeMillis()); String temporaryDatasetId = "_dataflow_temporary_dataset_" + rnd.nextInt(1000000); @@ -308,6 +309,7 @@ public List queryUnflattened(String query, String projectId, boolean t .setFlattenResults(false) .setAllowLargeResults(true) .setDestinationTable(tempTableReference) + .setUseLegacySql(!useStandardSql) .setQuery(query); JobConfiguration jc = new JobConfiguration().setQuery(jcQuery); diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java index 44f73bd56cb2..948c75cb756d 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/testing/FakeDatasetService.java @@ -32,6 +32,7 @@ import com.google.api.services.bigquery.model.TableSchema; import 
com.google.cloud.bigquery.storage.v1.AppendRowsResponse; import com.google.cloud.bigquery.storage.v1.BatchCommitWriteStreamsResponse; +import com.google.cloud.bigquery.storage.v1.Exceptions; import com.google.cloud.bigquery.storage.v1.FinalizeWriteStreamResponse; import com.google.cloud.bigquery.storage.v1.FlushRowsResponse; import com.google.cloud.bigquery.storage.v1.ProtoRows; @@ -43,6 +44,7 @@ import com.google.protobuf.Descriptors.Descriptor; import com.google.protobuf.DynamicMessage; import com.google.protobuf.Timestamp; +import com.google.rpc.Code; import java.io.IOException; import java.io.Serializable; import java.util.HashMap; @@ -50,6 +52,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import java.util.regex.Pattern; import javax.annotation.Nullable; import org.apache.beam.sdk.annotations.Internal; @@ -148,6 +151,8 @@ void commit() { } } + Function shouldFailRow = + (Function & Serializable) tr -> false; Map> insertErrors = Maps.newHashMap(); // The counter for the number of insertions performed. 
@@ -162,6 +167,10 @@ public static void setUp() { } } + public void setShouldFailRow(Function shouldFailRow) { + this.shouldFailRow = shouldFailRow; + } + @Override public Table getTable(TableReference tableRef) throws InterruptedException, IOException { if (tableRef.getProjectId() == null) { @@ -504,6 +513,7 @@ public StreamAppendClient getStreamAppendClient( @Override public ApiFuture appendRows(long offset, ProtoRows rows) throws Exception { + AppendRowsResponse.Builder responseBuilder = AppendRowsResponse.newBuilder(); synchronized (FakeDatasetService.class) { Stream stream = writeStreams.get(streamName); if (stream == null) { @@ -511,18 +521,32 @@ public ApiFuture appendRows(long offset, ProtoRows rows) } List tableRows = Lists.newArrayListWithExpectedSize(rows.getSerializedRowsCount()); - for (ByteString bytes : rows.getSerializedRowsList()) { + Map rowIndexToErrorMessage = Maps.newHashMap(); + for (int i = 0; i < rows.getSerializedRowsCount(); ++i) { + ByteString bytes = rows.getSerializedRows(i); DynamicMessage msg = DynamicMessage.parseFrom(protoDescriptor, bytes); if (msg.getUnknownFields() != null && !msg.getUnknownFields().asMap().isEmpty()) { throw new RuntimeException("Unknown fields set in append! 
" + msg.getUnknownFields()); } - tableRows.add( + TableRow tableRow = TableRowToStorageApiProto.tableRowFromMessage( - DynamicMessage.parseFrom(protoDescriptor, bytes))); + DynamicMessage.parseFrom(protoDescriptor, bytes)); + if (shouldFailRow.apply(tableRow)) { + rowIndexToErrorMessage.put(i, "Failing row " + tableRow.toPrettyString()); + } + tableRows.add(tableRow); + } + if (!rowIndexToErrorMessage.isEmpty()) { + return ApiFutures.immediateFailedFuture( + new Exceptions.AppendSerializtionError( + Code.INVALID_ARGUMENT.getNumber(), + "Append serialization failed for writer: " + streamName, + stream.streamName, + rowIndexToErrorMessage)); } stream.appendRows(offset, tableRows); } - return ApiFutures.immediateFuture(AppendRowsResponse.newBuilder().build()); + return ApiFutures.immediateFuture(responseBuilder.build()); } @Override diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java index 7f529bfa3489..1e1749e8569a 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryIOWriteTest.java @@ -64,6 +64,7 @@ import java.util.Set; import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadLocalRandom; +import java.util.function.Function; import java.util.function.LongFunction; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -2583,11 +2584,15 @@ public void testStorageApiErrors() throws Exception { TableRow goodNested = new TableRow().set("number", "42"); TableRow badNested = new TableRow().set("number", "nAn"); + final String failValue = "failme"; List goodRows = ImmutableList.of( new TableRow().set("name", "n1").set("number", "1"), + new TableRow().set("name", failValue).set("number", 
"1"), new TableRow().set("name", "n2").set("number", "2"), - new TableRow().set("name", "parent1").set("nested", goodNested)); + new TableRow().set("name", failValue).set("number", "2"), + new TableRow().set("name", "parent1").set("nested", goodNested), + new TableRow().set("name", failValue).set("number", "1")); List badRows = ImmutableList.of( // Unknown field. @@ -2614,6 +2619,11 @@ public void testStorageApiErrors() throws Exception { // Invalid nested row new TableRow().set("name", "parent2").set("nested", badNested)); + Function shouldFailRow = + (Function & Serializable) + tr -> tr.containsKey("name") && tr.get("name").equals(failValue); + fakeDatasetService.setShouldFailRow(shouldFailRow); + WriteResult result = p.apply(Create.of(Iterables.concat(goodRows, badRows))) .apply( @@ -2632,12 +2642,17 @@ public void testStorageApiErrors() throws Exception { .apply( MapElements.into(TypeDescriptor.of(TableRow.class)) .via(BigQueryStorageApiInsertError::getRow)); - PAssert.that(deadRows).containsInAnyOrder(badRows); + + PAssert.that(deadRows) + .containsInAnyOrder( + Iterables.concat(badRows, Iterables.filter(goodRows, shouldFailRow::apply))); p.run(); assertThat( fakeDatasetService.getAllRows("project-id", "dataset-id", "table"), - containsInAnyOrder(Iterables.toArray(goodRows, TableRow.class))); + containsInAnyOrder( + Iterables.toArray( + Iterables.filter(goodRows, r -> !shouldFailRow.apply(r)), TableRow.class))); } @Test diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryNestedRecordsIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryNestedRecordsIT.java index 698ef660293c..b85dc62c5fe9 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryNestedRecordsIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/BigQueryNestedRecordsIT.java @@ -97,12 +97,13 @@ private 
static void runPipeline(Options options) throws Exception { TableRow queryUnflattened = bigQueryClient - .queryUnflattened(options.getInput(), bigQueryOptions.getProject(), true) + .queryUnflattened(options.getInput(), bigQueryOptions.getProject(), true, false) .get(0); TableRow queryUnflattenable = bigQueryClient - .queryUnflattened(options.getUnflattenableInput(), bigQueryOptions.getProject(), true) + .queryUnflattened( + options.getUnflattenableInput(), bigQueryOptions.getProject(), true, false) .get(0); // Verify that the results are the same. diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java new file mode 100644 index 000000000000..465bebbf1389 --- /dev/null +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/StorageApiSinkFailedRowsIT.java @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.beam.sdk.io.gcp.bigquery; + +import static org.hamcrest.MatcherAssert.assertThat; + +import com.google.api.services.bigquery.model.Table; +import com.google.api.services.bigquery.model.TableFieldSchema; +import com.google.api.services.bigquery.model.TableReference; +import com.google.api.services.bigquery.model.TableRow; +import com.google.api.services.bigquery.model.TableSchema; +import java.io.IOException; +import java.util.List; +import org.apache.beam.sdk.Pipeline; +import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; +import org.apache.beam.sdk.io.gcp.testing.BigqueryClient; +import org.apache.beam.sdk.testing.PAssert; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.transforms.Create; +import org.apache.beam.sdk.transforms.MapElements; +import org.apache.beam.sdk.values.PCollection; +import org.apache.beam.sdk.values.TypeDescriptor; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists; +import org.hamcrest.Matchers; +import org.joda.time.Duration; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** Integration test for failed-rows handling when using the storage API. 
*/ +@RunWith(Parameterized.class) +public class StorageApiSinkFailedRowsIT { + @Parameterized.Parameters + public static Iterable data() { + return ImmutableList.of( + new Object[] {true, false, false}, + new Object[] {false, true, false}, + new Object[] {false, false, true}, + new Object[] {true, false, true}); + } + + @Parameterized.Parameter(0) + public boolean useStreamingExactlyOnce; + + @Parameterized.Parameter(1) + public boolean useAtLeastOnce; + + @Parameterized.Parameter(2) + public boolean useBatch; + + private static final Logger LOG = LoggerFactory.getLogger(StorageApiSinkFailedRowsIT.class); + private static final BigqueryClient BQ_CLIENT = new BigqueryClient("StorageApiSinkFailedRowsIT"); + private static final String PROJECT = + TestPipeline.testingPipelineOptions().as(GcpOptions.class).getProject(); + private static final String BIG_QUERY_DATASET_ID = + "storage_api_sink_failed_rows" + System.nanoTime(); + + private static final List FIELDS = + ImmutableList.builder() + .add(new TableFieldSchema().setType("STRING").setName("str")) + .add(new TableFieldSchema().setType("INT64").setName("i64")) + .add(new TableFieldSchema().setType("DATE").setName("date")) + .add(new TableFieldSchema().setType("STRING").setMaxLength(1L).setName("strone")) + .add(new TableFieldSchema().setType("BYTES").setName("bytes")) + .add(new TableFieldSchema().setType("JSON").setName("json")) + .add( + new TableFieldSchema() + .setType("STRING") + .setMaxLength(1L) + .setMode("REPEATED") + .setName("stronearray")) + .build(); + + private static final TableSchema BASE_TABLE_SCHEMA = + new TableSchema() + .setFields( + ImmutableList.builder() + .addAll(FIELDS) + .add(new TableFieldSchema().setType("STRUCT").setFields(FIELDS).setName("inner")) + .build()); + + private static final byte[] BIG_BYTES = new byte[11 * 1024 * 1024]; + + private BigQueryIO.Write.Method getMethod() { + return useAtLeastOnce + ? 
BigQueryIO.Write.Method.STORAGE_API_AT_LEAST_ONCE + : BigQueryIO.Write.Method.STORAGE_WRITE_API; + } + + @BeforeClass + public static void setUpTestEnvironment() throws IOException, InterruptedException { + // Create one BQ dataset for all test cases. + BQ_CLIENT.createNewDataset(PROJECT, BIG_QUERY_DATASET_ID); + } + + @AfterClass + public static void cleanup() { + LOG.info("Start to clean up tables and datasets."); + BQ_CLIENT.deleteDataset(PROJECT, BIG_QUERY_DATASET_ID); + } + + @Test + public void testSchemaMismatchCaughtByBeam() throws IOException, InterruptedException { + String tableSpec = createTable(BASE_TABLE_SCHEMA); + TableRow good1 = new TableRow().set("str", "foo").set("i64", "42"); + TableRow good2 = new TableRow().set("str", "foo").set("i64", "43"); + Iterable goodRows = + ImmutableList.of( + good1.clone().set("inner", new TableRow()), + good2.clone().set("inner", new TableRow()), + new TableRow().set("inner", good1), + new TableRow().set("inner", good2)); + + TableRow bad1 = new TableRow().set("str", "foo").set("i64", "baad"); + TableRow bad2 = new TableRow().set("str", "foo").set("i64", "42").set("unknown", "foobar"); + Iterable badRows = + ImmutableList.of( + bad1, bad2, new TableRow().set("inner", bad1), new TableRow().set("inner", bad2)); + + runPipeline( + getMethod(), + useStreamingExactlyOnce, + tableSpec, + Iterables.concat(goodRows, badRows), + badRows); + assertGoodRowsWritten(tableSpec, goodRows); + } + + @Test + public void testInvalidRowCaughtByBigquery() throws IOException, InterruptedException { + String tableSpec = createTable(BASE_TABLE_SCHEMA); + + TableRow good1 = + new TableRow() + .set("str", "foo") + .set("i64", "42") + .set("date", "2022-08-16") + .set("stronearray", Lists.newArrayList()); + TableRow good2 = + new TableRow().set("str", "foo").set("i64", "43").set("stronearray", Lists.newArrayList()); + Iterable goodRows = + ImmutableList.of( + good1.clone().set("inner", new TableRow().set("stronearray", Lists.newArrayList())), 
+ good2.clone().set("inner", new TableRow().set("stronearray", Lists.newArrayList())), + new TableRow().set("inner", good1).set("stronearray", Lists.newArrayList()), + new TableRow().set("inner", good2).set("stronearray", Lists.newArrayList())); + + TableRow bad1 = new TableRow().set("str", "foo").set("i64", "42").set("date", "10001-08-16"); + TableRow bad2 = new TableRow().set("str", "foo").set("i64", "42").set("strone", "ab"); + TableRow bad3 = new TableRow().set("str", "foo").set("i64", "42").set("json", "BAADF00D"); + TableRow bad4 = + new TableRow() + .set("str", "foo") + .set("i64", "42") + .set("stronearray", Lists.newArrayList("toolong")); + TableRow bad5 = new TableRow().set("bytes", BIG_BYTES); + Iterable badRows = + ImmutableList.of( + bad1, + bad2, + bad3, + bad4, + bad5, + new TableRow().set("inner", bad1), + new TableRow().set("inner", bad2), + new TableRow().set("inner", bad3)); + + runPipeline( + getMethod(), + useStreamingExactlyOnce, + tableSpec, + Iterables.concat(goodRows, badRows), + badRows); + assertGoodRowsWritten(tableSpec, goodRows); + } + + private static String createTable(TableSchema tableSchema) + throws IOException, InterruptedException { + String table = "table" + System.nanoTime(); + BQ_CLIENT.deleteTable(PROJECT, BIG_QUERY_DATASET_ID, table); + BQ_CLIENT.createNewTable( + PROJECT, + BIG_QUERY_DATASET_ID, + new Table() + .setSchema(tableSchema) + .setTableReference( + new TableReference() + .setTableId(table) + .setDatasetId(BIG_QUERY_DATASET_ID) + .setProjectId(PROJECT))); + return PROJECT + "." + BIG_QUERY_DATASET_ID + "." 
+ table; + } + + private void assertGoodRowsWritten(String tableSpec, Iterable goodRows) + throws IOException, InterruptedException { + TableRow queryResponse = + Iterables.getOnlyElement( + BQ_CLIENT.queryUnflattened( + String.format("SELECT COUNT(*) FROM %s", tableSpec), PROJECT, true, true)); + int numRowsWritten = Integer.parseInt((String) queryResponse.get("f0_")); + if (useAtLeastOnce) { + assertThat(numRowsWritten, Matchers.greaterThanOrEqualTo(Iterables.size(goodRows))); + } else { + assertThat(numRowsWritten, Matchers.equalTo(Iterables.size(goodRows))); + } + } + + private static void runPipeline( + BigQueryIO.Write.Method method, + boolean triggered, + String tableSpec, + Iterable tableRows, + Iterable expectedFailedRows) { + Pipeline p = Pipeline.create(); + + BigQueryIO.Write write = + BigQueryIO.writeTableRows() + .to(tableSpec) + .withSchema(BASE_TABLE_SCHEMA) + .withMethod(method) + .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER); + if (method == BigQueryIO.Write.Method.STORAGE_WRITE_API) { + write = write.withNumStorageWriteApiStreams(1); + if (triggered) { + write = write.withTriggeringFrequency(Duration.standardSeconds(1)); + } + } + PCollection input = p.apply("Create test cases", Create.of(tableRows)); + if (triggered) { + input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED); + } + WriteResult result = input.apply("Write using Storage Write API", write); + + PCollection failedRows = + result + .getFailedStorageApiInserts() + .apply( + MapElements.into(TypeDescriptor.of(TableRow.class)) + .via(BigQueryStorageApiInsertError::getRow)); + + PAssert.that(failedRows).containsInAnyOrder(expectedFailedRows); + + p.run().waitUntilFinish(); + } +} diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java index 
b2d9e04ffe22..5f488da0210b 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/bigquery/TableRowToStorageApiProtoIT.java @@ -337,7 +337,8 @@ public void testBaseTableRow() throws IOException, InterruptedException { runPipeline(tableSpec, Collections.singleton(BASE_TABLE_ROW)); List actualTableRows = - BQ_CLIENT.queryUnflattened(String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true); + BQ_CLIENT.queryUnflattened( + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); assertEquals(1, actualTableRows.size()); assertEquals(BASE_TABLE_ROW_EXPECTED, actualTableRows.get(0)); @@ -362,7 +363,8 @@ public void testNestedRichTypesAndNull() throws IOException, InterruptedExceptio runPipeline(tableSpec, Collections.singleton(tableRow)); List actualTableRows = - BQ_CLIENT.queryUnflattened(String.format("SELECT * FROM [%s]", tableSpec), PROJECT, true); + BQ_CLIENT.queryUnflattened( + String.format("SELECT * FROM %s", tableSpec), PROJECT, true, true); assertEquals(1, actualTableRows.size()); assertEquals(BASE_TABLE_ROW_EXPECTED, actualTableRows.get(0).get("nestedValue1")); @@ -391,7 +393,7 @@ private static String createTable(TableSchema tableSchema) .setTableId(table) .setDatasetId(BIG_QUERY_DATASET_ID) .setProjectId(PROJECT))); - return PROJECT + ":" + BIG_QUERY_DATASET_ID + "." + table; + return PROJECT + "." + BIG_QUERY_DATASET_ID + "." 
+ table; } private static void runPipeline(String tableSpec, Iterable tableRows) { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/it/SpannerChangeStreamIT.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/it/SpannerChangeStreamIT.java index 202179bd9152..de837a173bcd 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/it/SpannerChangeStreamIT.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/it/SpannerChangeStreamIT.java @@ -26,6 +26,7 @@ import com.google.cloud.spanner.ErrorCode; import com.google.cloud.spanner.Key; import com.google.cloud.spanner.Mutation; +import com.google.cloud.spanner.Options; import com.google.cloud.spanner.ResultSet; import com.google.cloud.spanner.SpannerException; import com.google.cloud.spanner.Statement; @@ -33,6 +34,10 @@ import java.util.Collections; import java.util.Map; import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import org.apache.beam.sdk.io.gcp.spanner.SpannerConfig; import org.apache.beam.sdk.io.gcp.spanner.SpannerIO; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.DataChangeRecord; @@ -40,10 +45,12 @@ import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.Filter; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.values.PCollection; import org.apache.commons.lang3.tuple.Pair; import org.joda.time.Instant; +import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.ClassRule; @@ -141,6 +148,83 @@ public void testReadSpannerChangeStream() { assertMetadataTableHasBeenDropped(); } + @Test + 
public void testReadSpannerChangeStreamFilteredByTransactionTag() { + // Defines how many rows are going to be inserted / updated / deleted in the test + final int numRows = 5; + // Inserts numRows rows and uses the first commit timestamp as the startAt for reading the + // change stream + final Pair insertTimestamps = insertRows(numRows); + final Timestamp startAt = insertTimestamps.getLeft(); + // Updates the created rows + updateRows(numRows); + // Delete the created rows and uses the last commit timestamp as the endAt for reading the + // change stream + final Pair deleteTimestamps = deleteRows(numRows); + final Timestamp endAt = deleteTimestamps.getRight(); + + final SpannerConfig spannerConfig = + SpannerConfig.create() + .withProjectId(projectId) + .withInstanceId(instanceId) + .withDatabaseId(databaseId); + + // Filter records to only those from transactions with tag "app=beam;action=update" + final PCollection tokens = + pipeline + .apply( + SpannerIO.readChangeStream() + .withSpannerConfig(spannerConfig) + .withChangeStreamName(changeStreamName) + .withMetadataDatabase(databaseId) + .withMetadataTable(metadataTableName) + .withInclusiveStartAt(startAt) + .withInclusiveEndAt(endAt)) + .apply( + Filter.by( + record -> + !record.isSystemTransaction() + && record + .getTransactionTag() + .equalsIgnoreCase("app=beam;action=update"))) + .apply(ParDo.of(new ModsToString())); + + // Each row is composed by the following data + // + PAssert.that(tokens) + .satisfies( + stringTokens -> { + Set setTokens = + StreamSupport.stream(stringTokens.spliterator(), false) + .collect(Collectors.toSet()); + Assert.assertTrue( + Stream.of( + "UPDATE,1,First Name 1,Last Name 1,Updated First Name 1,Updated Last Name 1", + "UPDATE,2,First Name 2,Last Name 2,Updated First Name 2,Updated Last Name 2", + "UPDATE,3,First Name 3,Last Name 3,Updated First Name 3,Updated Last Name 3", + "UPDATE,4,First Name 4,Last Name 4,Updated First Name 4,Updated Last Name 4", + "UPDATE,5,First Name 
5,Last Name 5,Updated First Name 5,Updated Last Name 5") + .allMatch(setTokens::contains)); + Assert.assertTrue( + Stream.of( + "INSERT,1,null,null,First Name 1,Last Name 1", + "INSERT,2,null,null,First Name 2,Last Name 2", + "INSERT,3,null,null,First Name 3,Last Name 3", + "INSERT,4,null,null,First Name 4,Last Name 4", + "INSERT,5,null,null,First Name 5,Last Name 5", + "DELETE,1,Updated First Name 1,Updated Last Name 1,null,null", + "DELETE,2,Updated First Name 2,Updated Last Name 2,null,null", + "DELETE,3,Updated First Name 3,Updated Last Name 3,null,null", + "DELETE,4,Updated First Name 4,Updated Last Name 4,null,null", + "DELETE,5,Updated First Name 5,Updated Last Name 5,null,null") + .noneMatch(setTokens::contains)); + return null; + }); + pipeline.run().waitUntilFinish(); + + assertMetadataTableHasBeenDropped(); + } + private static void assertMetadataTableHasBeenDropped() { try (ResultSet resultSet = databaseClient @@ -187,34 +271,43 @@ private static Pair deleteRows(int n) { } private static Timestamp insertRow(int singerId) { - return databaseClient.write( - Collections.singletonList( - Mutation.newInsertBuilder(changeStreamTableName) - .set("SingerId") - .to(singerId) - .set("FirstName") - .to("First Name " + singerId) - .set("LastName") - .to("Last Name " + singerId) - .build())); + return databaseClient + .writeWithOptions( + Collections.singletonList( + Mutation.newInsertBuilder(changeStreamTableName) + .set("SingerId") + .to(singerId) + .set("FirstName") + .to("First Name " + singerId) + .set("LastName") + .to("Last Name " + singerId) + .build()), + Options.tag("app=beam;action=insert")) + .getCommitTimestamp(); } private static Timestamp updateRow(int singerId) { - return databaseClient.write( - Collections.singletonList( - Mutation.newUpdateBuilder(changeStreamTableName) - .set("SingerId") - .to(singerId) - .set("FirstName") - .to("Updated First Name " + singerId) - .set("LastName") - .to("Updated Last Name " + singerId) - .build())); + return 
databaseClient + .writeWithOptions( + Collections.singletonList( + Mutation.newUpdateBuilder(changeStreamTableName) + .set("SingerId") + .to(singerId) + .set("FirstName") + .to("Updated First Name " + singerId) + .set("LastName") + .to("Updated Last Name " + singerId) + .build()), + Options.tag("app=beam;action=update")) + .getCommitTimestamp(); } private static Timestamp deleteRow(int singerId) { - return databaseClient.write( - Collections.singletonList(Mutation.delete(changeStreamTableName, Key.of(singerId)))); + return databaseClient + .writeWithOptions( + Collections.singletonList(Mutation.delete(changeStreamTableName, Key.of(singerId))), + Options.tag("app=beam;action=delete")) + .getCommitTimestamp(); } private static class ModsToString extends DoFn { diff --git a/sdks/java/io/hadoop-format/build.gradle b/sdks/java/io/hadoop-format/build.gradle index 702d37175d53..ec70824a5ab8 100644 --- a/sdks/java/io/hadoop-format/build.gradle +++ b/sdks/java/io/hadoop-format/build.gradle @@ -40,14 +40,6 @@ hadoopVersions.each {kv -> configurations.create("hadoopVersion$kv.key")} def elastic_search_version = "7.12.0" -configurations.create("sparkRunner") -configurations.sparkRunner { - // Ban certain dependencies to prevent a StackOverflow within Spark - // because JUL -> SLF4J -> JUL, and similarly JDK14 -> SLF4J -> JDK14 - exclude group: "org.slf4j", module: "jul-to-slf4j" - exclude group: "org.slf4j", module: "slf4j-jdk14" -} - // Ban dependencies from the test runtime classpath configurations.testRuntimeClasspath { // Prevent a StackOverflow because of wiring LOG4J -> SLF4J -> LOG4J @@ -115,15 +107,6 @@ dependencies { testRuntimeOnly library.java.slf4j_jdk14 testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadow") - delegate.add("sparkRunner", project(path: ":sdks:java:io:hadoop-format", configuration: "testRuntimeMigration")) - - sparkRunner project(path: ":examples:java", configuration: "testRuntimeMigration") - sparkRunner project(path: 
":examples:java:twitter", configuration: "testRuntimeMigration") - sparkRunner project(":runners:spark:2") - sparkRunner project(":sdks:java:io:hadoop-file-system") - sparkRunner library.java.spark_streaming - sparkRunner library.java.spark_core - hadoopVersions.each {kv -> "hadoopVersion$kv.key" "org.apache.hadoop:hadoop-common:$kv.value" "hadoopVersion$kv.key" "org.apache.hadoop:hadoop-mapreduce-client-core:$kv.value" @@ -169,29 +152,6 @@ task createTargetDirectoryForCassandra() { } test.dependsOn createTargetDirectoryForCassandra -def runnerClass = "org.apache.beam.runners.spark.TestSparkRunner" -task sparkRunner(type: Test) { - group = "Verification" - def beamTestPipelineOptions = [ - "--project=hadoop-format", - "--tempRoot=/tmp/hadoop-format/", - "--streaming=false", - "--runner=" + runnerClass, - "--enableSparkMetricSinks=false", - ] - classpath = configurations.sparkRunner - include "**/HadoopFormatIOSequenceFileTest.class" - useJUnit { - includeCategories 'org.apache.beam.sdk.testing.ValidatesRunner' - } - forkEvery 1 - maxParallelForks 4 - systemProperty "spark.ui.enabled", "false" - systemProperty "spark.ui.showConsoleProgress", "false" - systemProperty "beam.spark.test.reuseSparkContext", "true" - systemProperty "beamTestPipelineOptions", JsonOutput.toJson(beamTestPipelineOptions) -} - task hadoopVersionsTest(group: "Verification") { description = "Runs Hadoop format tests with different Hadoop versions" dependsOn createTaskNames(hadoopVersions, "Test") diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcUtil.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcUtil.java index 01894c621ca4..5a675eb5f9f1 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcUtil.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/JdbcUtil.java @@ -193,7 +193,7 @@ static JdbcIO.PreparedStatementSetCaller getPreparedStatementSetCaller( String logicalTypeName = 
fieldType.getLogicalType().getIdentifier(); // Special case of Timestamp and Numeric which are logical types in Portable framework - // but has their own fieldType in Java. + // but have their own fieldType in Java. if (logicalTypeName.equals(MicrosInstant.IDENTIFIER)) { // Process timestamp of MicrosInstant kind, which should only be passed from other type // systems such as SQL and other Beam SDKs. @@ -207,50 +207,46 @@ static JdbcIO.PreparedStatementSetCaller getPreparedStatementSetCaller( return (element, ps, i, fieldWithIndex) -> { ps.setBigDecimal(i + 1, element.getDecimal(fieldWithIndex.getIndex())); }; - } - - JDBCType jdbcType = JDBCType.valueOf(logicalTypeName); - switch (jdbcType) { - case DATE: - return (element, ps, i, fieldWithIndex) -> { - ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); - ps.setDate( - i + 1, - value == null - ? null - : new Date( - getDateOrTimeOnly(value.toDateTime(), true).getTime().getTime())); - }; - case TIME: - return (element, ps, i, fieldWithIndex) -> { - ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); - ps.setTime( - i + 1, - value == null - ? 
null - : new Time( - getDateOrTimeOnly( - element.getDateTime(fieldWithIndex.getIndex()).toDateTime(), - false) - .getTime() - .getTime())); - }; - case TIMESTAMP_WITH_TIMEZONE: - return (element, ps, i, fieldWithIndex) -> { - ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); - if (value == null) { - ps.setTimestamp(i + 1, null); - } else { - Calendar calendar = withTimestampAndTimezone(value.toDateTime()); - ps.setTimestamp(i + 1, new Timestamp(calendar.getTime().getTime()), calendar); - } - }; - case OTHER: - return (element, ps, i, fieldWithIndex) -> - ps.setObject( - i + 1, element.getValue(fieldWithIndex.getIndex()), java.sql.Types.OTHER); - default: - return getPreparedStatementSetCaller(fieldType.getLogicalType().getBaseType()); + } else if (logicalTypeName.equals("DATE")) { + return (element, ps, i, fieldWithIndex) -> { + ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); + ps.setDate( + i + 1, + value == null + ? null + : new Date(getDateOrTimeOnly(value.toDateTime(), true).getTime().getTime())); + }; + } else if (logicalTypeName.equals("TIME")) { + return (element, ps, i, fieldWithIndex) -> { + ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); + ps.setTime( + i + 1, + value == null + ? 
null + : new Time( + getDateOrTimeOnly( + element.getDateTime(fieldWithIndex.getIndex()).toDateTime(), + false) + .getTime() + .getTime())); + }; + } else if (logicalTypeName.equals("TIMESTAMP_WITH_TIMEZONE")) { + return (element, ps, i, fieldWithIndex) -> { + ReadableDateTime value = element.getDateTime(fieldWithIndex.getIndex()); + if (value == null) { + ps.setTimestamp(i + 1, null); + } else { + Calendar calendar = withTimestampAndTimezone(value.toDateTime()); + ps.setTimestamp(i + 1, new Timestamp(calendar.getTime().getTime()), calendar); + } + }; + } else if (logicalTypeName.equals("OTHER")) { + return (element, ps, i, fieldWithIndex) -> + ps.setObject( + i + 1, element.getValue(fieldWithIndex.getIndex()), java.sql.Types.OTHER); + } else { + // generic beam logic type (such as portable logical types) + return getPreparedStatementSetCaller(fieldType.getLogicalType().getBaseType()); } } default: diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/LogicalTypes.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/LogicalTypes.java index e61295889590..f21aa5b6f299 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/LogicalTypes.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/LogicalTypes.java @@ -17,20 +17,20 @@ */ package org.apache.beam.sdk.io.jdbc; -import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; - import java.sql.JDBCType; import java.time.Instant; -import java.util.Arrays; import java.util.Objects; -import org.apache.beam.repackaged.core.org.apache.commons.lang3.StringUtils; import org.apache.beam.sdk.annotations.Experimental; import org.apache.beam.sdk.annotations.Experimental.Kind; import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; +import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; import org.apache.beam.sdk.schemas.logicaltypes.FixedPrecisionNumeric; +import 
org.apache.beam.sdk.schemas.logicaltypes.FixedString; import org.apache.beam.sdk.schemas.logicaltypes.PassThroughLogicalType; import org.apache.beam.sdk.schemas.logicaltypes.UuidLogicalType; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting; import org.checkerframework.checker.nullness.qual.Nullable; @@ -78,22 +78,22 @@ class LogicalTypes { @VisibleForTesting static Schema.FieldType fixedLengthString(JDBCType jdbcType, int length) { - return Schema.FieldType.logicalType(FixedLengthString.of(jdbcType.getName(), length)); + return Schema.FieldType.logicalType(FixedString.of(jdbcType.getName(), length)); } @VisibleForTesting static Schema.FieldType fixedLengthBytes(JDBCType jdbcType, int length) { - return Schema.FieldType.logicalType(FixedLengthBytes.of(jdbcType.getName(), length)); + return Schema.FieldType.logicalType(FixedBytes.of(jdbcType.getName(), length)); } @VisibleForTesting static Schema.FieldType variableLengthString(JDBCType jdbcType, int length) { - return Schema.FieldType.logicalType(VariableLengthString.of(jdbcType.getName(), length)); + return Schema.FieldType.logicalType(VariableString.of(jdbcType.getName(), length)); } @VisibleForTesting static Schema.FieldType variableLengthBytes(JDBCType jdbcType, int length) { - return Schema.FieldType.logicalType(VariableLengthBytes.of(jdbcType.getName(), length)); + return Schema.FieldType.logicalType(VariableBytes.of(jdbcType.getName(), length)); } @VisibleForTesting @@ -101,6 +101,21 @@ static Schema.FieldType numeric(int precision, int scale) { return Schema.FieldType.logicalType(FixedPrecisionNumeric.of(precision, scale)); } + /** + * Returns a {@link FixedBytes}, or {@link VariableBytes} when length is Integer.MAX_VALUE. + * + *

    In some database, certain variable bytes type (e.g. bytea in postgresql) also returns BINARY + * jdbc type. This helper method make BINARY(Integer.MAX_VALUE) returns a variable bytes logical + * type thus avoid out-of-memory due to padding in fixed-length bytes. + */ + static Schema.LogicalType fixedOrVariableBytes(String name, int length) { + if (length == Integer.MAX_VALUE) { + return VariableBytes.of(name, length); + } else { + return FixedBytes.of(name, length); + } + } + /** Base class for JDBC logical types. */ abstract static class JdbcLogicalType implements Schema.LogicalType { protected final String identifier; @@ -164,88 +179,4 @@ public int hashCode() { return Objects.hash(identifier, baseType, argument); } } - - /** Fixed length string types such as CHAR. */ - static final class FixedLengthString extends JdbcLogicalType { - private final int length; - - static FixedLengthString of(String identifier, int length) { - return new FixedLengthString(identifier, length); - } - - private FixedLengthString(String identifier, int length) { - super(identifier, FieldType.INT32, Schema.FieldType.STRING, length); - this.length = length; - } - - @Override - public String toInputType(String base) { - checkArgument(base == null || base.length() <= length); - return StringUtils.rightPad(base, length); - } - } - - /** Fixed length byte types such as BINARY. 
*/ - static final class FixedLengthBytes extends JdbcLogicalType { - private final int length; - - static FixedLengthBytes of(String identifier, int length) { - return new FixedLengthBytes(identifier, length); - } - - private FixedLengthBytes(String identifier, int length) { - super(identifier, FieldType.INT32, Schema.FieldType.BYTES, length); - this.length = length; - } - - @Override - public byte[] toInputType(byte[] base) { - checkArgument(base == null || base.length <= length); - if (base == null || base.length == length) { - return base; - } else { - return Arrays.copyOf(base, length); - } - } - } - - /** Variable length string types such as VARCHAR and LONGVARCHAR. */ - static final class VariableLengthString extends JdbcLogicalType { - private final int maxLength; - - static VariableLengthString of(String identifier, int maxLength) { - return new VariableLengthString(identifier, maxLength); - } - - private VariableLengthString(String identifier, int maxLength) { - super(identifier, FieldType.INT32, Schema.FieldType.STRING, maxLength); - this.maxLength = maxLength; - } - - @Override - public String toInputType(String base) { - checkArgument(base == null || base.length() <= maxLength); - return base; - } - } - - /** Variable length bytes types such as VARBINARY and LONGVARBINARY. 
*/ - static final class VariableLengthBytes extends JdbcLogicalType { - private final int maxLength; - - static VariableLengthBytes of(String identifier, int maxLength) { - return new VariableLengthBytes(identifier, maxLength); - } - - private VariableLengthBytes(String identifier, int maxLength) { - super(identifier, FieldType.INT32, Schema.FieldType.BYTES, maxLength); - this.maxLength = maxLength; - } - - @Override - public byte[] toInputType(byte[] base) { - checkArgument(base == null || base.length <= maxLength); - return base; - } - } } diff --git a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/SchemaUtil.java b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/SchemaUtil.java index a34ef7847ef9..b466564cfeca 100644 --- a/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/SchemaUtil.java +++ b/sdks/java/io/jdbc/src/main/java/org/apache/beam/sdk/io/jdbc/SchemaUtil.java @@ -53,6 +53,9 @@ import org.apache.beam.sdk.schemas.Schema; import org.apache.beam.sdk.schemas.Schema.FieldType; import org.apache.beam.sdk.schemas.logicaltypes.FixedPrecisionNumeric; +import org.apache.beam.sdk.schemas.logicaltypes.FixedString; +import org.apache.beam.sdk.schemas.logicaltypes.VariableBytes; +import org.apache.beam.sdk.schemas.logicaltypes.VariableString; import org.apache.beam.sdk.values.Row; import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; import org.joda.time.DateTime; @@ -73,7 +76,6 @@ class SchemaUtil { interface ResultSetFieldExtractor extends Serializable { Object extract(ResultSet rs, Integer index) throws SQLException; } - // ResultSetExtractors for primitive schema types (excluding arrays, structs and logical types). 
private static final EnumMap RESULTSET_FIELD_EXTRACTORS = @@ -114,13 +116,13 @@ private static BeamFieldConverter jdbcTypeToBeamFieldConverter( case BIGINT: return beamFieldOfType(Schema.FieldType.INT64); case BINARY: - return beamLogicalField(BINARY.getName(), LogicalTypes.FixedLengthBytes::of); + return beamLogicalField(BINARY.getName(), LogicalTypes::fixedOrVariableBytes); case BIT: return beamFieldOfType(LogicalTypes.JDBC_BIT_TYPE); case BOOLEAN: return beamFieldOfType(Schema.FieldType.BOOLEAN); case CHAR: - return beamLogicalField(CHAR.getName(), LogicalTypes.FixedLengthString::of); + return beamLogicalField(CHAR.getName(), FixedString::of); case DATE: return beamFieldOfType(LogicalTypes.JDBC_DATE_TYPE); case DECIMAL: @@ -132,17 +134,17 @@ private static BeamFieldConverter jdbcTypeToBeamFieldConverter( case INTEGER: return beamFieldOfType(Schema.FieldType.INT32); case LONGNVARCHAR: - return beamLogicalField(LONGNVARCHAR.getName(), LogicalTypes.VariableLengthString::of); + return beamLogicalField(LONGNVARCHAR.getName(), VariableString::of); case LONGVARBINARY: - return beamLogicalField(LONGVARBINARY.getName(), LogicalTypes.VariableLengthBytes::of); + return beamLogicalField(LONGVARBINARY.getName(), VariableBytes::of); case LONGVARCHAR: - return beamLogicalField(LONGVARCHAR.getName(), LogicalTypes.VariableLengthString::of); + return beamLogicalField(LONGVARCHAR.getName(), VariableString::of); case NCHAR: - return beamLogicalField(NCHAR.getName(), LogicalTypes.FixedLengthString::of); + return beamLogicalField(NCHAR.getName(), FixedString::of); case NUMERIC: return beamLogicalNumericField(); case NVARCHAR: - return beamLogicalField(NVARCHAR.getName(), LogicalTypes.VariableLengthString::of); + return beamLogicalField(NVARCHAR.getName(), VariableString::of); case REAL: return beamFieldOfType(Schema.FieldType.FLOAT); case SMALLINT: @@ -156,9 +158,9 @@ private static BeamFieldConverter jdbcTypeToBeamFieldConverter( case TINYINT: return 
beamFieldOfType(Schema.FieldType.BYTE); case VARBINARY: - return beamLogicalField(VARBINARY.getName(), LogicalTypes.VariableLengthBytes::of); + return beamLogicalField(VARBINARY.getName(), VariableBytes::of); case VARCHAR: - return beamLogicalField(VARCHAR.getName(), LogicalTypes.VariableLengthString::of); + return beamLogicalField(VARCHAR.getName(), VariableString::of); case BLOB: return beamFieldOfType(FieldType.BYTES); case CLOB: @@ -290,26 +292,17 @@ private static ResultSetFieldExtractor createLogicalTypeExtracto final Schema.LogicalType fieldType) { String logicalTypeName = fieldType.getIdentifier(); - JDBCType underlyingType; - if (Objects.equals(fieldType, LogicalTypes.JDBC_UUID_TYPE.getLogicalType())) { return OBJECT_EXTRACTOR; - } else if (Objects.equals(logicalTypeName, FixedPrecisionNumeric.IDENTIFIER)) { - underlyingType = JDBCType.NUMERIC; + } else if (logicalTypeName.equals("DATE")) { + return DATE_EXTRACTOR; + } else if (logicalTypeName.equals("TIME")) { + return TIME_EXTRACTOR; + } else if (logicalTypeName.equals("TIMESTAMP_EXTRACTOR")) { + return TIMESTAMP_EXTRACTOR; } else { - underlyingType = JDBCType.valueOf(logicalTypeName); - } - - switch (underlyingType) { - case DATE: - return DATE_EXTRACTOR; - case TIME: - return TIME_EXTRACTOR; - case TIMESTAMP_WITH_TIMEZONE: - return TIMESTAMP_EXTRACTOR; - default: - ResultSetFieldExtractor extractor = createFieldExtractor(fieldType.getBaseType()); - return (rs, index) -> fieldType.toInputType((BaseT) extractor.extract(rs, index)); + ResultSetFieldExtractor extractor = createFieldExtractor(fieldType.getBaseType()); + return (rs, index) -> fieldType.toInputType((BaseT) extractor.extract(rs, index)); } } diff --git a/sdks/java/io/neo4j/src/test/java/org/apache/beam/sdk/io/neo4j/Neo4jIOIT.java b/sdks/java/io/neo4j/src/test/java/org/apache/beam/sdk/io/neo4j/Neo4jIOIT.java index fc8d712b6cca..e5f606642a5f 100644 --- a/sdks/java/io/neo4j/src/test/java/org/apache/beam/sdk/io/neo4j/Neo4jIOIT.java +++ 
b/sdks/java/io/neo4j/src/test/java/org/apache/beam/sdk/io/neo4j/Neo4jIOIT.java @@ -70,7 +70,7 @@ public static void setup() throws Exception { new Neo4jContainer<>(DockerImageName.parse("neo4j").withTag(Neo4jTestUtil.NEO4J_VERSION)) .withStartupAttempts(1) .withAdminPassword(Neo4jTestUtil.NEO4J_PASSWORD) - .withEnv("NEO4J_dbms_default_listen_address", "0.0.0.0") + .withEnv("dbms_default_listen_address", "0.0.0.0") .withNetworkAliases(Neo4jTestUtil.NEO4J_NETWORK_ALIAS) .withSharedMemorySize(256 * 1024 * 1024L); // 256MB @@ -88,7 +88,7 @@ public static void setup() throws Exception { Neo4jTestUtil.executeOnNeo4j( containerHostname, containerPort, - "CREATE CONSTRAINT something_id_unique ON (n:Something) ASSERT n.id IS UNIQUE", + "CREATE CONSTRAINT something_id_unique FOR (n:Something) REQUIRE n.id IS UNIQUE", true); } diff --git a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java index ee67989c5f42..433a53a20fe1 100644 --- a/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java +++ b/sdks/java/io/parquet/src/main/java/org/apache/beam/sdk/io/parquet/ParquetIO.java @@ -46,9 +46,7 @@ import org.apache.beam.sdk.coders.StringUtf8Coder; import org.apache.beam.sdk.io.FileIO; import org.apache.beam.sdk.io.FileIO.ReadableFile; -import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.hadoop.SerializableConfiguration; -import org.apache.beam.sdk.io.parquet.ParquetIO.ReadFiles.ReadFn; import org.apache.beam.sdk.io.parquet.ParquetIO.ReadFiles.SplitReadFn; import org.apache.beam.sdk.io.range.OffsetRange; import org.apache.beam.sdk.options.ValueProvider; @@ -77,7 +75,6 @@ import org.apache.parquet.filter2.compat.FilterCompat; import org.apache.parquet.filter2.compat.FilterCompat.Filter; import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetReader; import 
org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.api.InitContext; import org.apache.parquet.hadoop.api.ReadSupport; @@ -136,18 +133,8 @@ * PCollection output = files.apply(ParquetIO.readFiles(SCHEMA)); * }

    * - *

    Splittable reading can be enabled by allowing the use of Splittable DoFn. It initially split - * the files into blocks of 64MB and may dynamically split further for higher read efficiency. It - * can be enabled by using {@link ParquetIO.Read#withSplit()}. - * - *

    For example: - * - *

    {@code
    - * PCollection records = pipeline.apply(ParquetIO.read(SCHEMA).from("/foo/bar").withSplit());
    - * ...
    - * }
    - * - *

    Since Beam version 2.35.0 the splittable reading is enabled by default. + *

    ParquetIO leverages splittable reading by using Splittable DoFn. It initially splits the files + * into the blocks of 64MB and may dynamically split further for higher read efficiency. * *

    Reading with projection can be enabled with the projection schema as following. Splittable * reading is enabled when reading with projection. The projection_schema contains only the column @@ -271,7 +258,6 @@ public static Read read(Schema schema) { return new AutoValue_ParquetIO_Read.Builder() .setSchema(schema) .setInferBeamSchema(false) - .setSplittable(true) .build(); } @@ -283,7 +269,6 @@ public static ReadFiles readFiles(Schema schema) { return new AutoValue_ParquetIO_ReadFiles.Builder() .setSchema(schema) .setInferBeamSchema(false) - .setSplittable(true) .build(); } @@ -292,10 +277,7 @@ public static ReadFiles readFiles(Schema schema) { * pattern) and converts to user defined type using provided parseFn. */ public static Parse parseGenericRecords(SerializableFunction parseFn) { - return new AutoValue_ParquetIO_Parse.Builder() - .setParseFn(parseFn) - .setSplittable(true) - .build(); + return new AutoValue_ParquetIO_Parse.Builder().setParseFn(parseFn).build(); } /** @@ -304,10 +286,7 @@ public static Parse parseGenericRecords(SerializableFunction ParseFiles parseFilesGenericRecords( SerializableFunction parseFn) { - return new AutoValue_ParquetIO_ParseFiles.Builder() - .setParseFn(parseFn) - .setSplittable(true) - .build(); + return new AutoValue_ParquetIO_ParseFiles.Builder().setParseFn(parseFn).build(); } /** Implementation of {@link #read(Schema)}. */ @@ -328,8 +307,6 @@ public abstract static class Read extends PTransform filepattern); abstract Builder setSchema(Schema schema); @@ -367,7 +342,6 @@ public Read from(String filepattern) { public Read withProjection(Schema projectionSchema, Schema encoderSchema) { return toBuilder() .setProjectionSchema(projectionSchema) - .setSplittable(true) .setEncoderSchema(encoderSchema) .build(); } @@ -389,28 +363,6 @@ public Read withBeamSchemas(boolean inferBeamSchema) { return toBuilder().setInferBeamSchema(inferBeamSchema).build(); } - /** - * Enable the Splittable reading. 
- * - * @deprecated as of version 2.35.0. Splittable reading is enabled by default. - */ - @Deprecated - public Read withSplit() { - return toBuilder().setSplittable(true).build(); - } - - /** - * Disable the Splittable reading. - * - * @deprecated This method may currently be used to opt-out of the default, splittable, - * behavior. However, this will be removed in a future release assuming no issues are - * discovered. - */ - @Deprecated - public Read withoutSplit() { - return toBuilder().setSplittable(false).build(); - } - /** * Define the Avro data model; see {@link AvroParquetReader.Builder#withDataModel(GenericData)}. */ @@ -431,10 +383,8 @@ public PCollection expand(PBegin input) { ReadFiles readFiles = readFiles(getSchema()) .withBeamSchemas(getInferBeamSchema()) - .withAvroDataModel(getAvroDataModel()); - if (isSplittable()) { - readFiles = readFiles.withSplit().withProjection(getProjectionSchema(), getEncoderSchema()); - } + .withAvroDataModel(getAvroDataModel()) + .withProjection(getProjectionSchema(), getEncoderSchema()); if (getConfiguration() != null) { readFiles = readFiles.withConfiguration(getConfiguration().get()); } @@ -452,7 +402,6 @@ public void populateDisplayData(DisplayData.Builder builder) { .add( DisplayData.item("inferBeamSchema", getInferBeamSchema()) .withLabel("Infer Beam Schema")) - .add(DisplayData.item("splittable", isSplittable())) .addIfNotNull(DisplayData.item("projectionSchema", String.valueOf(getProjectionSchema()))) .addIfNotNull(DisplayData.item("avroDataModel", String.valueOf(getAvroDataModel()))); if (this.getConfiguration() != null) { @@ -477,8 +426,6 @@ public abstract static class Parse extends PTransform> abstract @Nullable SerializableConfiguration getConfiguration(); - abstract boolean isSplittable(); - abstract Builder toBuilder(); @AutoValue.Builder @@ -491,8 +438,6 @@ abstract static class Builder { abstract Builder setConfiguration(SerializableConfiguration configuration); - abstract Builder setSplittable(boolean 
splittable); - abstract Parse build(); } @@ -521,28 +466,6 @@ public Parse withConfiguration(Configuration configuration) { return toBuilder().setConfiguration(new SerializableConfiguration(configuration)).build(); } - /** - * Enable the Splittable reading. - * - * @deprecated as of version 2.35.0. Splittable reading is enabled by default. - */ - @Deprecated - public Parse withSplit() { - return toBuilder().setSplittable(true).build(); - } - - /** - * Disable the Splittable reading. - * - * @deprecated This method may currently be used to opt-out of the default, splittable, - * behavior. However, this will be removed in a future release assuming no issues are - * discovered. - */ - @Deprecated - public Parse withoutSplit() { - return toBuilder().setSplittable(false).build(); - } - @Override public PCollection expand(PBegin input) { checkNotNull(getFilepattern(), "Filepattern cannot be null."); @@ -554,7 +477,6 @@ public PCollection expand(PBegin input) { parseFilesGenericRecords(getParseFn()) .toBuilder() .setCoder(getCoder()) - .setSplittable(isSplittable()) .setConfiguration(getConfiguration()) .build()); } @@ -565,7 +487,6 @@ public void populateDisplayData(DisplayData.Builder builder) { builder .addIfNotNull( DisplayData.item("filePattern", getFilepattern()).withLabel("Input File Pattern")) - .add(DisplayData.item("splittable", isSplittable())) .add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")); if (this.getCoder() != null) { builder.add(DisplayData.item("coder", getCoder().getClass())); @@ -592,8 +513,6 @@ public abstract static class ParseFiles abstract @Nullable SerializableConfiguration getConfiguration(); - abstract boolean isSplittable(); - abstract Builder toBuilder(); @AutoValue.Builder @@ -604,8 +523,6 @@ abstract static class Builder { abstract Builder setConfiguration(SerializableConfiguration configuration); - abstract Builder setSplittable(boolean split); - abstract ParseFiles build(); } @@ -626,43 +543,19 @@ 
public ParseFiles withConfiguration(Configuration configuration) { return toBuilder().setConfiguration(new SerializableConfiguration(configuration)).build(); } - /** - * Enable the Splittable reading. - * - * @deprecated as of version 2.35.0. Splittable reading is enabled by default. - */ - @Deprecated - public ParseFiles withSplit() { - return toBuilder().setSplittable(true).build(); - } - - /** - * Disable the Splittable reading. - * - * @deprecated This method may currently be used to opt-out of the default, splittable, - * behavior. However, this will be removed in a future release assuming no issues are - * discovered. - */ - @Deprecated - public ParseFiles withoutSplit() { - return toBuilder().setSplittable(false).build(); - } - @Override public PCollection expand(PCollection input) { checkArgument(!isGenericRecordOutput(), "Parse can't be used for reading as GenericRecord."); return input - .apply(ParDo.of(buildFileReadingFn())) + .apply(ParDo.of(new SplitReadFn<>(null, null, getParseFn(), getConfiguration()))) .setCoder(inferCoder(input.getPipeline().getCoderRegistry())); } @Override public void populateDisplayData(DisplayData.Builder builder) { super.populateDisplayData(builder); - builder - .add(DisplayData.item("splittable", isSplittable())) - .add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")); + builder.add(DisplayData.item("parseFn", getParseFn().getClass()).withLabel("Parse function")); if (this.getCoder() != null) { builder.add(DisplayData.item("coder", getCoder().getClass())); } @@ -676,13 +569,6 @@ public void populateDisplayData(DisplayData.Builder builder) { } } - /** Returns Splittable or normal Parquet file reading DoFn. */ - private DoFn buildFileReadingFn() { - return isSplittable() - ? new SplitReadFn<>(null, null, getParseFn(), getConfiguration()) - : new ReadFn<>(null, getParseFn(), getConfiguration()); - } - /** Returns true if expected output is {@code PCollection}. 
*/ private boolean isGenericRecordOutput() { String outputType = TypeDescriptors.outputOf(getParseFn()).getType().getTypeName(); @@ -735,8 +621,6 @@ public abstract static class ReadFiles abstract boolean getInferBeamSchema(); - abstract boolean isSplittable(); - abstract Builder toBuilder(); @AutoValue.Builder @@ -753,8 +637,6 @@ abstract static class Builder { abstract Builder setInferBeamSchema(boolean inferBeamSchema); - abstract Builder setSplittable(boolean split); - abstract ReadFiles build(); } @@ -769,7 +651,6 @@ public ReadFiles withProjection(Schema projectionSchema, Schema encoderSchema) { return toBuilder() .setProjectionSchema(projectionSchema) .setEncoderSchema(encoderSchema) - .setSplittable(true) .build(); } @@ -790,32 +671,18 @@ public ReadFiles withBeamSchemas(boolean inferBeamSchema) { return toBuilder().setInferBeamSchema(inferBeamSchema).build(); } - /** - * Enable the Splittable reading. - * - * @deprecated as of version 2.35.0. Splittable reading is enabled by default. - */ - @Deprecated - public ReadFiles withSplit() { - return toBuilder().setSplittable(true).build(); - } - - /** - * Disable the Splittable reading. - * - * @deprecated This method may currently be used to opt-out of the default, splittable, - * behavior. However, this will be removed in a future release assuming no issues are - * discovered. 
- */ - @Deprecated - public ReadFiles withoutSplit() { - return toBuilder().setSplittable(false).build(); - } - @Override public PCollection expand(PCollection input) { checkNotNull(getSchema(), "Schema can not be null"); - return input.apply(ParDo.of(getReaderFn())).setCoder(getCollectionCoder()); + return input + .apply( + ParDo.of( + new SplitReadFn<>( + getAvroDataModel(), + getProjectionSchema(), + GenericRecordPassthroughFn.create(), + getConfiguration()))) + .setCoder(getCollectionCoder()); } @Override @@ -826,7 +693,6 @@ public void populateDisplayData(DisplayData.Builder builder) { .add( DisplayData.item("inferBeamSchema", getInferBeamSchema()) .withLabel("Infer Beam Schema")) - .add(DisplayData.item("splittable", isSplittable())) .addIfNotNull(DisplayData.item("projectionSchema", String.valueOf(getProjectionSchema()))) .addIfNotNull(DisplayData.item("avroDataModel", String.valueOf(getAvroDataModel()))); if (this.getConfiguration() != null) { @@ -839,26 +705,13 @@ public void populateDisplayData(DisplayData.Builder builder) { } } - /** Returns Parquet file reading function based on {@link #isSplittable()}. */ - private DoFn getReaderFn() { - return isSplittable() - ? new SplitReadFn<>( - getAvroDataModel(), - getProjectionSchema(), - GenericRecordPassthroughFn.create(), - getConfiguration()) - : new ReadFn<>( - getAvroDataModel(), GenericRecordPassthroughFn.create(), getConfiguration()); - } - /** * Returns {@link org.apache.beam.sdk.schemas.SchemaCoder} when using Beam schemas, {@link * AvroCoder} when not using Beam schema. */ @Experimental(Kind.SCHEMAS) private Coder getCollectionCoder() { - Schema coderSchema = - getProjectionSchema() != null && isSplittable() ? getEncoderSchema() : getSchema(); + Schema coderSchema = getProjectionSchema() != null ? getEncoderSchema() : getSchema(); return getInferBeamSchema() ? 
AvroUtils.schemaCoder(coderSchema) : AvroCoder.of(coderSchema); } @@ -1126,59 +979,6 @@ public Progress getProgress() { } } - /** - * @deprecated as of version 2.35.0. Splittable reading with {@link SplitReadFn} should be used - * instead. - */ - @Deprecated - static class ReadFn extends DoFn { - - private final Class modelClass; - - private final SerializableFunction parseFn; - - private final SerializableConfiguration configuration; - - ReadFn( - GenericData model, - SerializableFunction parseFn, - SerializableConfiguration configuration) { - this.modelClass = model != null ? model.getClass() : null; - this.parseFn = checkNotNull(parseFn, "GenericRecord parse function is null"); - this.configuration = configuration; - } - - @ProcessElement - public void processElement(@Element ReadableFile file, OutputReceiver receiver) - throws Exception { - if (!file.getMetadata().isReadSeekEfficient()) { - ResourceId filename = file.getMetadata().resourceId(); - throw new RuntimeException(String.format("File has to be seekable: %s", filename)); - } - - SeekableByteChannel seekableByteChannel = file.openSeekable(); - - AvroParquetReader.Builder builder = - (AvroParquetReader.Builder) - AvroParquetReader.builder( - new BeamParquetInputFile(seekableByteChannel)) - .withConf(SerializableConfiguration.newConfiguration(configuration)); - if (modelClass != null) { - // all GenericData implementations have a static get method - builder = builder.withDataModel(buildModelObject(modelClass)); - } - - try (ParquetReader reader = builder.build()) { - GenericRecord read; - while ((read = reader.read()) != null) { - receiver.output(parseFn.apply(read)); - } - } - - seekableByteChannel.close(); - } - } - private static class BeamParquetInputFile implements InputFile { private final SeekableByteChannel seekableByteChannel; diff --git a/sdks/java/io/parquet/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOTest.java 
b/sdks/java/io/parquet/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOTest.java index 5576be2a59f0..6dd67e3e511c 100644 --- a/sdks/java/io/parquet/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOTest.java +++ b/sdks/java/io/parquet/src/test/java/org/apache/beam/sdk/io/parquet/ParquetIOTest.java @@ -184,7 +184,6 @@ public void testWriteAndRead() { mainPipeline.run().waitUntilFinish(); ParquetIO.Read read = ParquetIO.read(SCHEMA); - assertTrue(read.isSplittable()); PCollection readBack = readPipeline.apply(read.from(temporaryFolder.getRoot().getAbsolutePath() + "/*")); @@ -210,27 +209,6 @@ public void testWriteWithRowGroupSizeAndRead() { readPipeline.run().waitUntilFinish(); } - @Test - public void testWriteAndReadWithoutSplit() { - List records = generateGenericRecords(1000); - - mainPipeline - .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA))) - .apply( - FileIO.write() - .via(ParquetIO.sink(SCHEMA)) - .to(temporaryFolder.getRoot().getAbsolutePath())); - mainPipeline.run().waitUntilFinish(); - - PCollection readBackWithSplit = - readPipeline.apply( - ParquetIO.read(SCHEMA) - .from(temporaryFolder.getRoot().getAbsolutePath() + "/*") - .withoutSplit()); - PAssert.that(readBackWithSplit).containsInAnyOrder(records); - readPipeline.run().waitUntilFinish(); - } - @Test public void testWriteAndReadWithBeamSchema() { List records = generateGenericRecords(1000); @@ -255,7 +233,7 @@ public void testWriteAndReadWithBeamSchema() { } @Test - public void testWriteAndReadFilesAsJsonForWithSplitForUnknownSchema() { + public void testWriteAndReadFilesAsJsonForUnknownSchema() { List records = generateGenericRecords(1000); mainPipeline @@ -266,13 +244,12 @@ public void testWriteAndReadFilesAsJsonForWithSplitForUnknownSchema() { .to(temporaryFolder.getRoot().getAbsolutePath())); mainPipeline.run().waitUntilFinish(); - PCollection readBackAsJsonWithSplit = + PCollection readBackAsJson = readPipeline.apply( 
ParquetIO.parseGenericRecords(ParseGenericRecordAsJsonFn.create()) - .from(temporaryFolder.getRoot().getAbsolutePath() + "/*") - .withSplit()); + .from(temporaryFolder.getRoot().getAbsolutePath() + "/*")); - PAssert.that(readBackAsJsonWithSplit).containsInAnyOrder(convertRecordsToJson(records)); + PAssert.that(readBackAsJson).containsInAnyOrder(convertRecordsToJson(records)); readPipeline.run().waitUntilFinish(); } @@ -281,7 +258,6 @@ public void testWriteAndReadFiles() { List records = generateGenericRecords(1000); ParquetIO.ReadFiles readFiles = ParquetIO.readFiles(SCHEMA); - assertTrue(readFiles.isSplittable()); PCollection writeThenRead = mainPipeline @@ -308,7 +284,6 @@ public void testReadFilesAsJsonForUnknownSchemaFiles() { ParquetIO.ParseFiles parseFiles = ParquetIO.parseFilesGenericRecords(ParseGenericRecordAsJsonFn.create()); - assertTrue(parseFiles.isSplittable()); PCollection writeThenRead = mainPipeline @@ -401,7 +376,6 @@ public void testReadDisplayData() { DisplayData.from( ParquetIO.read(SCHEMA) .from("foo.parquet") - .withSplit() .withProjection(REQUESTED_SCHEMA, SCHEMA) .withAvroDataModel(GenericData.get()) .withConfiguration(configuration)); @@ -409,7 +383,6 @@ public void testReadDisplayData() { assertThat(displayData, hasDisplayItem("filePattern", "foo.parquet")); assertThat(displayData, hasDisplayItem("schema", SCHEMA.toString())); assertThat(displayData, hasDisplayItem("inferBeamSchema", false)); - assertThat(displayData, hasDisplayItem("splittable", true)); assertThat(displayData, hasDisplayItem("projectionSchema", REQUESTED_SCHEMA.toString())); assertThat(displayData, hasDisplayItem("avroDataModel", GenericData.get().toString())); assertThat(displayData, hasDisplayItem("parquet.foo", "foo")); @@ -445,29 +418,6 @@ public void testWriteAndReadUsingReflectDataSchemaWithoutDataModelThrowsExceptio readPipeline.run().waitUntilFinish(); } - @Test(expected = org.apache.beam.sdk.Pipeline.PipelineExecutionException.class) - public void 
testWriteAndReadWithSplitUsingReflectDataSchemaWithoutDataModelThrowsException() { - Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class); - - List records = generateGenericRecords(1000); - mainPipeline - .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema))) - .apply( - FileIO.write() - .via(ParquetIO.sink(testRecordSchema)) - .to(temporaryFolder.getRoot().getAbsolutePath())); - mainPipeline.run().waitUntilFinish(); - - PCollection readBack = - readPipeline.apply( - ParquetIO.read(testRecordSchema) - .withSplit() - .from(temporaryFolder.getRoot().getAbsolutePath() + "/*")); - - PAssert.that(readBack).containsInAnyOrder(records); - readPipeline.run().waitUntilFinish(); - } - @Test public void testWriteAndReadUsingReflectDataSchemaWithDataModel() { Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class); @@ -491,30 +441,6 @@ public void testWriteAndReadUsingReflectDataSchemaWithDataModel() { readPipeline.run().waitUntilFinish(); } - @Test - public void testWriteAndReadWithSplitUsingReflectDataSchemaWithDataModel() { - Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class); - - List records = generateGenericRecords(1000); - mainPipeline - .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema))) - .apply( - FileIO.write() - .via(ParquetIO.sink(testRecordSchema)) - .to(temporaryFolder.getRoot().getAbsolutePath())); - mainPipeline.run().waitUntilFinish(); - - PCollection readBack = - readPipeline.apply( - ParquetIO.read(testRecordSchema) - .withSplit() - .withAvroDataModel(GenericData.get()) - .from(temporaryFolder.getRoot().getAbsolutePath() + "/*")); - - PAssert.that(readBack).containsInAnyOrder(records); - readPipeline.run().waitUntilFinish(); - } - @Test public void testWriteAndReadUsingGenericDataSchemaWithDataModel() { Schema schema = new Schema.Parser().parse(SCHEMA_STRING); @@ -538,30 +464,6 @@ public void testWriteAndReadUsingGenericDataSchemaWithDataModel() { 
readPipeline.run().waitUntilFinish(); } - @Test - public void testWriteAndReadwithSplitUsingGenericDataSchemaWithDataModel() { - Schema schema = new Schema.Parser().parse(SCHEMA_STRING); - - List records = generateGenericRecords(1000); - mainPipeline - .apply(Create.of(records).withCoder(AvroCoder.of(schema))) - .apply( - FileIO.write() - .via(ParquetIO.sink(schema).withAvroDataModel(GenericData.get())) - .to(temporaryFolder.getRoot().getAbsolutePath())); - mainPipeline.run().waitUntilFinish(); - - PCollection readBack = - readPipeline.apply( - ParquetIO.read(schema) - .withSplit() - .withAvroDataModel(GenericData.get()) - .from(temporaryFolder.getRoot().getAbsolutePath() + "/*")); - - PAssert.that(readBack).containsInAnyOrder(records); - readPipeline.run().waitUntilFinish(); - } - @Test public void testWriteAndReadWithConfiguration() { List records = generateGenericRecords(10); @@ -583,8 +485,7 @@ public void testWriteAndReadWithConfiguration() { readPipeline.apply( ParquetIO.read(SCHEMA) .from(temporaryFolder.getRoot().getAbsolutePath() + "/*") - .withConfiguration(configuration) - .withSplit()); + .withConfiguration(configuration)); PAssert.that(readBack).containsInAnyOrder(expectedRecords); readPipeline.run().waitUntilFinish(); } diff --git a/sdks/java/io/sparkreceiver/README.md b/sdks/java/io/sparkreceiver/README.md new file mode 100644 index 000000000000..6ce48efd58fe --- /dev/null +++ b/sdks/java/io/sparkreceiver/README.md @@ -0,0 +1,38 @@ + + +SparkReceiverIO contains I/O transforms which allow you to read messages from Spark Receiver (org.apache.spark.streaming.receiver.Receiver). + +## Dependencies + +To use SparkReceiverIO you must first add a dependency on `beam-sdks-java-io-sparkreceiver`. + +```maven + + org.apache.beam + beam-sdks-java-io-sparkreceiver + ... + +``` + +## Documentation + +The documentation is maintained in JavaDoc for SparkReceiverIO class. It includes +usage examples and primary concepts. 
+- [SparkReceiverIO.java](src/main/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIO.java) diff --git a/sdks/java/io/sparkreceiver/build.gradle b/sdks/java/io/sparkreceiver/build.gradle index 8d4b96f298cd..52c6a6340499 100644 --- a/sdks/java/io/sparkreceiver/build.gradle +++ b/sdks/java/io/sparkreceiver/build.gradle @@ -33,9 +33,10 @@ ext.summary = """Apache Beam SDK provides a simple, Java-based interface for streaming integration with CDAP plugins.""" configurations.all { - exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'ch.qos.logback', module: 'logback-classic' exclude group: 'org.slf4j', module: 'slf4j-jdk14' - exclude group: 'org.slf4j', module: 'slf4j-simple' + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.slf4j', module: 'slf4j-reload4j' } dependencies { @@ -47,8 +48,11 @@ dependencies { implementation library.java.vendored_guava_26_0_jre implementation project(path: ":sdks:java:core", configuration: "shadow") compileOnly "org.scala-lang:scala-library:2.11.12" - testImplementation project(path: ":sdks:java:io:cdap", configuration: "testRuntimeMigration") testImplementation library.java.junit + testImplementation library.java.testcontainers_rabbitmq testImplementation project(path: ":runners:direct-java", configuration: "shadow") - testImplementation project(path: ":examples:java", configuration: "testRuntimeMigration") + testImplementation project(":sdks:java:io:synthetic") + testImplementation project(path: ":sdks:java:io:common", configuration: "testRuntimeMigration") + testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") + testImplementation "com.rabbitmq:amqp-client:5.16.0" } diff --git a/sdks/java/io/sparkreceiver/src/main/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFn.java b/sdks/java/io/sparkreceiver/src/main/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFn.java index 
c51a5168ce39..8b2fdcb01ad1 100644 --- a/sdks/java/io/sparkreceiver/src/main/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFn.java +++ b/sdks/java/io/sparkreceiver/src/main/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFn.java @@ -19,6 +19,9 @@ import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; +import java.math.BigDecimal; +import java.math.MathContext; +import java.nio.ByteBuffer; import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.TimeUnit; @@ -30,15 +33,19 @@ import org.apache.beam.sdk.transforms.splittabledofn.ManualWatermarkEstimator; import org.apache.beam.sdk.transforms.splittabledofn.OffsetRangeTracker; import org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker; +import org.apache.beam.sdk.transforms.splittabledofn.SplitResult; import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimator; import org.apache.beam.sdk.transforms.splittabledofn.WatermarkEstimators; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; +import org.apache.commons.lang3.SerializationUtils; import org.apache.spark.SparkConf; import org.apache.spark.streaming.receiver.Receiver; import org.checkerframework.checker.nullness.qual.Nullable; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import scala.collection.Iterator; +import scala.collection.mutable.ArrayBuffer; /** * A SplittableDoFn which reads from {@link Receiver} that implements {@link HasOffset}. By default, @@ -56,7 +63,7 @@ class ReadFromSparkReceiverWithOffsetDoFn extends DoFn { LoggerFactory.getLogger(ReadFromSparkReceiverWithOffsetDoFn.class); /** Constant waiting time after the {@link Receiver} starts. 
Required to prepare for polling */ - private static final int START_POLL_TIMEOUT_MS = 1000; + private static final int START_POLL_TIMEOUT_MS = 2000; private final SerializableFunction> createWatermarkEstimatorFn; @@ -104,10 +111,68 @@ public double getSize(@Element byte[] element, @Restriction OffsetRange offsetRa return restrictionTracker(element, offsetRange).getProgress().getWorkRemaining(); } + /** + * {@link OffsetRangeTracker} that performs basic split only in {@link + * OffsetRangeTracker#checkDone}. This behavior allows reading from primary range until resume, + * and then split to {alreadyReadRange, residualRange}. + */ + private static class CustomOffsetRangeTracker extends OffsetRangeTracker { + + public CustomOffsetRangeTracker(OffsetRange range) { + super(range); + } + + @SuppressWarnings("nullness") // Base method can return null + @Override + public SplitResult trySplit(double fractionOfRemainder) { + if (lastAttemptedOffset != null) { + if (range.getTo() == Long.MAX_VALUE) { + // Do not split, just use primary range + return null; + } else { + // Need to add residual range + OffsetRange res = new OffsetRange(range.getTo(), Long.MAX_VALUE); + this.range = new OffsetRange(range.getFrom(), range.getTo()); + return SplitResult.of(range, res); + } + } + // Basic split logic when lastAttemptedOffset is null + + // Convert to BigDecimal in computation to prevent overflow, which may result in loss of + // precision. 
+ BigDecimal cur = + BigDecimal.valueOf(range.getFrom()).subtract(BigDecimal.ONE, MathContext.DECIMAL128); + // split = cur + max(1, (range.getTo() - cur) * fractionOfRemainder) + BigDecimal splitPos = + cur.add( + BigDecimal.valueOf(range.getTo()) + .subtract(cur, MathContext.DECIMAL128) + .multiply(BigDecimal.valueOf(fractionOfRemainder), MathContext.DECIMAL128) + .max(BigDecimal.ONE), + MathContext.DECIMAL128); + + long split = splitPos.longValue(); + if (split >= range.getTo()) { + return null; + } + OffsetRange res = new OffsetRange(split, range.getTo()); + this.range = new OffsetRange(range.getFrom(), split); + return SplitResult.of(range, res); + } + + @Override + public void checkDone() throws IllegalStateException { + if (lastAttemptedOffset != null && range.getTo() == Long.MAX_VALUE) { + // Perform basic split + super.trySplit(0); + } + } + } + @NewTracker public OffsetRangeTracker restrictionTracker( @Element byte[] element, @Restriction OffsetRange restriction) { - return new OffsetRangeTracker(restriction); + return new CustomOffsetRangeTracker(restriction); } @GetRestrictionCoder @@ -141,15 +206,46 @@ public boolean hasRecords() { @Override public void start(Receiver sparkReceiver) { this.sparkReceiver = sparkReceiver; - try { - new WrappedSupervisor( - sparkReceiver, - new SparkConf(), - objects -> { - V record = (V) objects[0]; - recordsQueue.offer(record); + + final SerializableFunction storeFn = + (input) -> { + if (input == null) { return null; - }); + } + /* + Use only [0] element - data. + The other elements are not needed because they are related to Spark environment options. 
+ */ + Object data = input[0]; + + if (data instanceof ByteBuffer) { + final ByteBuffer byteBuffer = ((ByteBuffer) data).asReadOnlyBuffer(); + final byte[] bytes = new byte[byteBuffer.limit()]; + byteBuffer.get(bytes); + final V record = SerializationUtils.deserialize(bytes); + recordsQueue.offer(record); + } else if (data instanceof Iterator) { + final Iterator iterator = (Iterator) data; + while (iterator.hasNext()) { + V record = iterator.next(); + recordsQueue.offer(record); + } + } else if (data instanceof ArrayBuffer) { + final ArrayBuffer arrayBuffer = (ArrayBuffer) data; + final Iterator iterator = arrayBuffer.iterator(); + while (iterator.hasNext()) { + V record = iterator.next(); + recordsQueue.offer(record); + } + } else { + V record = (V) data; + recordsQueue.offer(record); + } + return null; + }; + + try { + new WrappedSupervisor(sparkReceiver, new SparkConf(), storeFn); } catch (Exception e) { LOG.error("Can not init Spark Receiver!", e); throw new IllegalStateException("Spark Receiver was not initialized"); @@ -188,26 +284,38 @@ public ProcessContinuation processElement( LOG.error("Can not build Spark Receiver", e); throw new IllegalStateException("Spark Receiver was not built!"); } + LOG.debug("Restriction {}", tracker.currentRestriction().toString()); sparkConsumer = new SparkConsumerWithOffset<>(tracker.currentRestriction().getFrom()); sparkConsumer.start(sparkReceiver); - while (sparkConsumer.hasRecords()) { - V record = sparkConsumer.poll(); - if (record != null) { - Long offset = getOffsetFn.apply(record); - if (!tracker.tryClaim(offset)) { - sparkConsumer.stop(); - LOG.debug("Stop for restriction: {}", tracker.currentRestriction().toString()); - return ProcessContinuation.stop(); + while (true) { + try { + TimeUnit.MILLISECONDS.sleep(START_POLL_TIMEOUT_MS); + } catch (InterruptedException e) { + LOG.error("SparkReceiver was interrupted before polling started", e); + throw new IllegalStateException("Spark Receiver was interrupted before polling 
started"); + } + if (!sparkConsumer.hasRecords()) { + sparkConsumer.stop(); + tracker.checkDone(); + LOG.debug("Resume for restriction: {}", tracker.currentRestriction().toString()); + return ProcessContinuation.resume(); + } + while (sparkConsumer.hasRecords()) { + V record = sparkConsumer.poll(); + if (record != null) { + Long offset = getOffsetFn.apply(record); + if (!tracker.tryClaim(offset)) { + sparkConsumer.stop(); + LOG.debug("Stop for restriction: {}", tracker.currentRestriction().toString()); + return ProcessContinuation.stop(); + } + Instant currentTimeStamp = getTimestampFn.apply(record); + ((ManualWatermarkEstimator) watermarkEstimator).setWatermark(currentTimeStamp); + receiver.outputWithTimestamp(record, currentTimeStamp); } - Instant currentTimeStamp = getTimestampFn.apply(record); - ((ManualWatermarkEstimator) watermarkEstimator).setWatermark(currentTimeStamp); - receiver.outputWithTimestamp(record, currentTimeStamp); } } - sparkConsumer.stop(); - LOG.debug("Resume for restriction: {}", tracker.currentRestriction().toString()); - return ProcessContinuation.resume(); } private static Instant ensureTimestampWithinBounds(Instant timestamp) { diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ArrayBufferDataReceiver.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ArrayBufferDataReceiver.java new file mode 100644 index 000000000000..849ea0a1373e --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ArrayBufferDataReceiver.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.collection.mutable.ArrayBuffer; + +/** + * Imitation of Spark {@link Receiver} that implements {@link HasOffset} interface and pushes data + * passing the {@link ArrayBuffer}. Used to test {@link SparkReceiverIO#read()}. 
+ */ +public class ArrayBufferDataReceiver extends Receiver implements HasOffset { + + private static final Logger LOG = LoggerFactory.getLogger(ArrayBufferDataReceiver.class); + private static final int TIMEOUT_MS = 500; + public static final int RECORDS_COUNT = 20; + + private Long startOffset; + + ArrayBufferDataReceiver() { + super(StorageLevel.MEMORY_AND_DISK_2()); + } + + @Override + public void setStartOffset(Long startOffset) { + if (startOffset != null) { + this.startOffset = startOffset; + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public void onStart() { + Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().build()).submit(this::receive); + } + + @Override + public void onStop() {} + + @Override + public Long getEndOffset() { + return Long.MAX_VALUE; + } + + private void receive() { + Long currentOffset = startOffset; + while (!isStopped()) { + if (currentOffset < RECORDS_COUNT) { + ArrayBuffer dataArray = new ArrayBuffer<>(); + for (int i = 0; i < Math.max(2, RECORDS_COUNT / 10); i++) { + dataArray.$plus$eq(String.valueOf(currentOffset++)); + } + store(dataArray); + } else { + break; + } + try { + TimeUnit.MILLISECONDS.sleep(TIMEOUT_MS); + } catch (InterruptedException e) { + LOG.error("Interrupted", e); + } + } + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ByteBufferDataReceiver.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ByteBufferDataReceiver.java new file mode 100644 index 000000000000..dcef495aa67a --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ByteBufferDataReceiver.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import java.nio.ByteBuffer; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.commons.lang3.SerializationUtils; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Imitation of Spark {@link Receiver} that implements {@link HasOffset} interface and pushes data + * passing the {@link ByteBuffer}. Used to test {@link SparkReceiverIO#read()}. 
+ */ +public class ByteBufferDataReceiver extends Receiver implements HasOffset { + + private static final Logger LOG = LoggerFactory.getLogger(ByteBufferDataReceiver.class); + private static final int TIMEOUT_MS = 500; + public static final int RECORDS_COUNT = 20; + + private Long startOffset; + + ByteBufferDataReceiver() { + super(StorageLevel.MEMORY_AND_DISK_2()); + } + + @Override + public void setStartOffset(Long startOffset) { + if (startOffset != null) { + this.startOffset = startOffset; + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public void onStart() { + Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().build()).submit(this::receive); + } + + @Override + public void onStop() {} + + @Override + public Long getEndOffset() { + return Long.MAX_VALUE; + } + + private void receive() { + Long currentOffset = startOffset; + while (!isStopped()) { + if (currentOffset < RECORDS_COUNT) { + ByteBuffer dataBuffer = + ByteBuffer.wrap(SerializationUtils.serialize(String.valueOf(currentOffset++))); + store(dataBuffer); + } else { + break; + } + try { + TimeUnit.MILLISECONDS.sleep(TIMEOUT_MS); + } catch (InterruptedException e) { + LOG.error("Interrupted", e); + } + } + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/IteratorDataReceiver.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/IteratorDataReceiver.java new file mode 100644 index 000000000000..8999802542c2 --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/IteratorDataReceiver.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Imitation of Spark {@link Receiver} that implements {@link HasOffset} interface and pushes data + * passing the {@link Iterator}. Used to test {@link SparkReceiverIO#read()}. 
+ */ +public class IteratorDataReceiver extends Receiver implements HasOffset { + + private static final Logger LOG = LoggerFactory.getLogger(IteratorDataReceiver.class); + private static final int TIMEOUT_MS = 500; + public static final int RECORDS_COUNT = 20; + + private Long startOffset; + + IteratorDataReceiver() { + super(StorageLevel.MEMORY_AND_DISK_2()); + } + + @Override + public void setStartOffset(Long startOffset) { + if (startOffset != null) { + this.startOffset = startOffset; + } + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public void onStart() { + Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().build()).submit(this::receive); + } + + @Override + public void onStop() {} + + @Override + public Long getEndOffset() { + return Long.MAX_VALUE; + } + + private void receive() { + Long currentOffset = startOffset; + while (!isStopped()) { + if (currentOffset < RECORDS_COUNT) { + List dataArray = new ArrayList<>(); + for (int i = 0; i < Math.max(2, RECORDS_COUNT / 10); i++) { + dataArray.add(String.valueOf(currentOffset++)); + } + store(dataArray.iterator()); + } else { + break; + } + try { + TimeUnit.MILLISECONDS.sleep(TIMEOUT_MS); + } catch (InterruptedException e) { + LOG.error("Interrupted", e); + } + } + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/RabbitMqReceiverWithOffset.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/RabbitMqReceiverWithOffset.java new file mode 100644 index 000000000000..362e6280eb29 --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/RabbitMqReceiverWithOffset.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import com.rabbitmq.client.AMQP; +import com.rabbitmq.client.Channel; +import com.rabbitmq.client.Connection; +import com.rabbitmq.client.ConnectionFactory; +import com.rabbitmq.client.DefaultConsumer; +import com.rabbitmq.client.Envelope; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.spark.storage.StorageLevel; +import org.apache.spark.streaming.receiver.Receiver; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Imitation of Spark {@link Receiver} for RabbitMQ that implements {@link HasOffset} interface. + * Used to test {@link SparkReceiverIO#read()}. 
+ */ +class RabbitMqReceiverWithOffset extends Receiver implements HasOffset { + + private static final Logger LOG = LoggerFactory.getLogger(RabbitMqReceiverWithOffset.class); + private static final int MAX_PREFETCH_COUNT = 65535; + + private final String rabbitmqUrl; + private final String streamName; + private final long totalMessagesNumber; + private long startOffset; + private static final int READ_TIMEOUT_IN_MS = 100; + + RabbitMqReceiverWithOffset( + final String uri, final String streamName, final long totalMessagesNumber) { + super(StorageLevel.MEMORY_AND_DISK_2()); + rabbitmqUrl = uri; + this.streamName = streamName; + this.totalMessagesNumber = totalMessagesNumber; + } + + @Override + public void setStartOffset(Long startOffset) { + this.startOffset = startOffset != null ? startOffset : 0; + } + + @Override + public Long getEndOffset() { + return Long.MAX_VALUE; + } + + @Override + @SuppressWarnings("FutureReturnValueIgnored") + public void onStart() { + Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().build()).submit(this::receive); + } + + @Override + public void onStop() {} + + private void receive() { + long currentOffset = startOffset; + + final TestConsumer testConsumer; + final Connection connection; + final Channel channel; + + try { + LOG.info("Starting receiver with offset {}", currentOffset); + final ConnectionFactory connectionFactory = new ConnectionFactory(); + connectionFactory.setUri(rabbitmqUrl); + connectionFactory.setAutomaticRecoveryEnabled(true); + connectionFactory.setConnectionTimeout(600000); + connectionFactory.setNetworkRecoveryInterval(5000); + connectionFactory.setRequestedHeartbeat(60); + connectionFactory.setTopologyRecoveryEnabled(true); + connectionFactory.setRequestedChannelMax(0); + connectionFactory.setRequestedFrameMax(0); + connection = connectionFactory.newConnection(); + + channel = connection.createChannel(); + channel.queueDeclare( + streamName, true, false, false, 
Collections.singletonMap("x-queue-type", "stream")); + channel.basicQos(Math.min(MAX_PREFETCH_COUNT, (int) totalMessagesNumber)); + testConsumer = new TestConsumer(this, channel, this::store); + + channel.basicConsume( + streamName, + false, + Collections.singletonMap("x-stream-offset", currentOffset), + testConsumer); + } catch (Exception e) { + LOG.error("Can not basic consume", e); + throw new RuntimeException(e); + } + + while (!isStopped()) { + try { + TimeUnit.MILLISECONDS.sleep(READ_TIMEOUT_IN_MS); + } catch (InterruptedException e) { + LOG.error("Interrupted", e); + } + } + + try { + LOG.info("Stopping receiver"); + channel.close(); + connection.close(); + } catch (TimeoutException | IOException e) { + throw new RuntimeException(e); + } + } + + /** A simple RabbitMQ {@code Consumer}. */ + static class TestConsumer extends DefaultConsumer { + + private final java.util.function.Consumer messageConsumer; + private final Receiver receiver; + + public TestConsumer( + Receiver receiver, + Channel channel, + java.util.function.Consumer messageConsumer) { + super(channel); + this.receiver = receiver; + this.messageConsumer = messageConsumer; + } + + @Override + public void handleDelivery( + String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) { + try { + final String sMessage = new String(body, StandardCharsets.UTF_8); + LOG.trace("Adding message to consumer: {}", sMessage); + messageConsumer.accept(sMessage); + if (getChannel().isOpen() && !receiver.isStopped()) { + getChannel().basicAck(envelope.getDeliveryTag(), false); + } + } catch (Exception e) { + LOG.error("Can't read from RabbitMQ: {}", e.getMessage()); + } + } + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFnTest.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFnTest.java new file mode 100644 index 
000000000000..67b4e2cabba1 --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/ReadFromSparkReceiverWithOffsetDoFnTest.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.ArrayList; +import java.util.List; +import org.apache.beam.sdk.io.range.OffsetRange; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.splittabledofn.ManualWatermarkEstimator; +import org.apache.beam.sdk.transforms.splittabledofn.OffsetRangeTracker; +import org.apache.beam.sdk.transforms.splittabledofn.SplitResult; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; +import org.joda.time.Instant; +import org.junit.Test; + +/** Test class for {@link ReadFromSparkReceiverWithOffsetDoFn}. 
*/ +public class ReadFromSparkReceiverWithOffsetDoFnTest { + + private static final byte[] TEST_ELEMENT = new byte[] {}; + + private final ReadFromSparkReceiverWithOffsetDoFn dofnInstance = + new ReadFromSparkReceiverWithOffsetDoFn<>(makeReadTransform()); + + private SparkReceiverIO.Read makeReadTransform() { + ReceiverBuilder receiverBuilder = + new ReceiverBuilder<>(CustomReceiverWithOffset.class).withConstructorArgs(); + return SparkReceiverIO.read() + .withSparkReceiverBuilder(receiverBuilder) + .withGetOffsetFn(Long::valueOf) + .withTimestampFn(Instant::parse); + } + + private static class MockOutputReceiver implements DoFn.OutputReceiver { + + private final List records = new ArrayList<>(); + + @Override + public void output(String output) {} + + @Override + public void outputWithTimestamp( + String output, @UnknownKeyFor @NonNull @Initialized Instant timestamp) { + records.add(output); + } + + public List getOutputs() { + return this.records; + } + } + + private final ManualWatermarkEstimator mockWatermarkEstimator = + new ManualWatermarkEstimator() { + + @Override + public void setWatermark(Instant watermark) { + // do nothing + } + + @Override + public Instant currentWatermark() { + return null; + } + + @Override + public Instant getState() { + return null; + } + }; + + private List createExpectedRecords(int numRecords) { + List records = new ArrayList<>(); + for (int i = 0; i < numRecords; i++) { + records.add(String.valueOf(i)); + } + return records; + } + + @Test + public void testInitialRestriction() { + long expectedStartOffset = 0L; + OffsetRange result = dofnInstance.initialRestriction(TEST_ELEMENT); + assertEquals(new OffsetRange(expectedStartOffset, Long.MAX_VALUE), result); + } + + @Test + public void testRestrictionTrackerSplit() { + OffsetRangeTracker offsetRangeTracker = + dofnInstance.restrictionTracker( + TEST_ELEMENT, dofnInstance.initialRestriction(TEST_ELEMENT)); + assertEquals(0L, offsetRangeTracker.currentRestriction().getFrom()); + 
assertEquals(Long.MAX_VALUE, offsetRangeTracker.currentRestriction().getTo()); + + assertEquals( + SplitResult.of(new OffsetRange(0, 0), new OffsetRange(0, Long.MAX_VALUE)), + offsetRangeTracker.trySplit(0d)); + + offsetRangeTracker = + dofnInstance.restrictionTracker( + TEST_ELEMENT, dofnInstance.initialRestriction(TEST_ELEMENT)); + + assertTrue(offsetRangeTracker.tryClaim(0L)); + assertNull(offsetRangeTracker.trySplit(0d)); + + offsetRangeTracker.checkDone(); + assertEquals( + SplitResult.of(new OffsetRange(0, 1), new OffsetRange(1, Long.MAX_VALUE)), + offsetRangeTracker.trySplit(0d)); + } + + @Test + public void testProcessElement() { + MockOutputReceiver receiver = new MockOutputReceiver(); + DoFn.ProcessContinuation result = + dofnInstance.processElement( + TEST_ELEMENT, + dofnInstance.restrictionTracker( + TEST_ELEMENT, dofnInstance.initialRestriction(TEST_ELEMENT)), + mockWatermarkEstimator, + receiver); + assertEquals(DoFn.ProcessContinuation.resume(), result); + assertEquals( + createExpectedRecords(CustomReceiverWithOffset.RECORDS_COUNT), receiver.getOutputs()); + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java new file mode 100644 index 000000000000..b335aab2ed53 --- /dev/null +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java @@ -0,0 +1,354 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.beam.sdk.io.sparkreceiver; + +import static org.apache.beam.sdk.io.synthetic.SyntheticOptions.fromJsonString; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import com.google.cloud.Timestamp; +import com.rabbitmq.client.Channel; +import com.rabbitmq.client.Connection; +import com.rabbitmq.client.ConnectionFactory; +import com.rabbitmq.client.MessageProperties; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.TimeoutException; +import java.util.function.BiFunction; +import java.util.stream.Collectors; +import java.util.stream.LongStream; +import org.apache.beam.sdk.PipelineResult; +import org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.io.common.IOITHelper; +import org.apache.beam.sdk.io.common.IOTestPipelineOptions; +import org.apache.beam.sdk.io.synthetic.SyntheticSourceOptions; +import org.apache.beam.sdk.metrics.Counter; +import org.apache.beam.sdk.metrics.Metrics; +import org.apache.beam.sdk.options.Default; +import org.apache.beam.sdk.options.Description; +import org.apache.beam.sdk.options.ExperimentalOptions; +import org.apache.beam.sdk.options.PipelineOptionsFactory; +import org.apache.beam.sdk.options.StreamingOptions; +import 
org.apache.beam.sdk.options.Validation; +import org.apache.beam.sdk.testing.TestPipeline; +import org.apache.beam.sdk.testing.TestPipelineOptions; +import org.apache.beam.sdk.testutils.NamedTestResult; +import org.apache.beam.sdk.testutils.metrics.IOITMetrics; +import org.apache.beam.sdk.testutils.metrics.MetricsReader; +import org.apache.beam.sdk.testutils.metrics.TimeMonitor; +import org.apache.beam.sdk.testutils.publishing.InfluxDBSettings; +import org.apache.beam.sdk.transforms.DoFn; +import org.apache.beam.sdk.transforms.ParDo; +import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.joda.time.Duration; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.RabbitMQContainer; +import org.testcontainers.utility.DockerImageName; + +/** + * IO Integration test for {@link org.apache.beam.sdk.io.sparkreceiver.SparkReceiverIO}. + * + *

    {@see https://beam.apache.org/documentation/io/testing/#i-o-transform-integration-tests} for + * more details. + * + *

    NOTE: This test sets retention policy of the messages so that all messages are retained in the + * topic so that we could read them back after writing. + */ +@RunWith(JUnit4.class) +public class SparkReceiverIOIT { + + private static final Logger LOG = LoggerFactory.getLogger(SparkReceiverIOIT.class); + + private static final String READ_TIME_METRIC_NAME = "read_time"; + + private static final String RUN_TIME_METRIC_NAME = "run_time"; + + private static final String READ_ELEMENT_METRIC_NAME = "spark_read_element_count"; + + private static final String NAMESPACE = SparkReceiverIOIT.class.getName(); + + private static final String TEST_ID = UUID.randomUUID().toString(); + + private static final String TIMESTAMP = Timestamp.now().toString(); + + private static final String TEST_MESSAGE_PREFIX = "Test "; + + private static Options options; + + private static SyntheticSourceOptions sourceOptions; + + private static GenericContainer rabbitMqContainer; + + private static InfluxDBSettings settings; + + private static final ExperimentalOptions sdfPipelineOptions; + + static { + sdfPipelineOptions = PipelineOptionsFactory.create().as(ExperimentalOptions.class); + sdfPipelineOptions.as(TestPipelineOptions.class).setBlockOnRun(false); + } + + @Rule public TestPipeline readPipeline = TestPipeline.fromOptions(sdfPipelineOptions); + + @BeforeClass + public static void setup() throws IOException { + options = IOITHelper.readIOTestPipelineOptions(Options.class); + sourceOptions = fromJsonString(options.getSourceOptions(), SyntheticSourceOptions.class); + if (options.isWithTestcontainers()) { + setupRabbitMqContainer(); + } else { + settings = + InfluxDBSettings.builder() + .withHost(options.getInfluxHost()) + .withDatabase(options.getInfluxDatabase()) + .withMeasurement(options.getInfluxMeasurement()) + .get(); + } + clearRabbitMQ(); + } + + @AfterClass + public static void afterClass() { + if (rabbitMqContainer != null) { + rabbitMqContainer.stop(); + } + + clearRabbitMQ(); + 
} + + private static void setupRabbitMqContainer() { + rabbitMqContainer = + new RabbitMQContainer( + DockerImageName.parse("rabbitmq").withTag(options.getRabbitMqContainerVersion())) + .withExposedPorts(5672, 15672); + rabbitMqContainer.start(); + options.setRabbitMqBootstrapServerAddress( + getBootstrapServers( + rabbitMqContainer.getHost(), rabbitMqContainer.getMappedPort(5672).toString())); + } + + private static String getBootstrapServers(String host, String port) { + return String.format("amqp://guest:guest@%s:%s", host, port); + } + + /** Pipeline options specific for this test. */ + public interface Options extends IOTestPipelineOptions, StreamingOptions { + + @Description("Options for synthetic source.") + @Validation.Required + @Default.String("{\"numRecords\": \"500\",\"keySizeBytes\": \"1\",\"valueSizeBytes\": \"90\"}") + String getSourceOptions(); + + void setSourceOptions(String sourceOptions); + + @Description("RabbitMQ bootstrap server address") + @Default.String("amqp://guest:guest@localhost:5672") + String getRabbitMqBootstrapServerAddress(); + + void setRabbitMqBootstrapServerAddress(String address); + + @Description("RabbitMQ stream") + @Default.String("rabbitMqTestStream") + String getStreamName(); + + void setStreamName(String streamName); + + @Description("Whether to use testcontainers") + @Default.Boolean(false) + Boolean isWithTestcontainers(); + + void setWithTestcontainers(Boolean withTestcontainers); + + @Description("RabbitMQ container version. 
Use when useTestcontainers is true") + @Nullable + @Default.String("3.9-alpine") + String getRabbitMqContainerVersion(); + + void setRabbitMqContainerVersion(String rabbitMqContainerVersion); + + @Description("Time to wait for the events to be processed by the read pipeline (in seconds)") + @Default.Integer(50) + @Validation.Required + Integer getReadTimeout(); + + void setReadTimeout(Integer readTimeout); + } + + private void writeToRabbitMq(final List messages) + throws URISyntaxException, NoSuchAlgorithmException, KeyManagementException, IOException, + TimeoutException { + + final ConnectionFactory connectionFactory = new ConnectionFactory(); + connectionFactory.setUri(options.getRabbitMqBootstrapServerAddress()); + Map arguments = new HashMap<>(); + arguments.put("x-queue-type", "stream"); + + try (Connection connection = connectionFactory.newConnection(); + Channel channel = connection.createChannel()) { + channel.queueDeclare(options.getStreamName(), true, false, false, arguments); + + messages.forEach( + message -> { + try { + channel.basicPublish( + "", + options.getStreamName(), + MessageProperties.PERSISTENT_TEXT_PLAIN, + message.getBytes(StandardCharsets.UTF_8)); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + + private SparkReceiverIO.Read readFromRabbitMqWithOffset() { + final ReceiverBuilder receiverBuilder = + new ReceiverBuilder<>(RabbitMqReceiverWithOffset.class) + .withConstructorArgs( + options.getRabbitMqBootstrapServerAddress(), + options.getStreamName(), + sourceOptions.numRecords); + + return SparkReceiverIO.read() + .withGetOffsetFn( + rabbitMqMessage -> + Long.valueOf(rabbitMqMessage.substring(TEST_MESSAGE_PREFIX.length()))) + .withSparkReceiverBuilder(receiverBuilder); + } + + /** + * Since streams in RabbitMQ are durable by definition, we have to clean them up after test + * execution. The simplest way is to delete the whole stream after test execution. 
+ */ + private static void clearRabbitMQ() { + final ConnectionFactory connectionFactory = new ConnectionFactory(); + + try { + connectionFactory.setUri(options.getRabbitMqBootstrapServerAddress()); + try (Connection connection = connectionFactory.newConnection(); + Channel channel = connection.createChannel()) { + channel.queueDelete(options.getStreamName()); + } + } catch (URISyntaxException + | NoSuchAlgorithmException + | KeyManagementException + | IOException + | TimeoutException e) { + LOG.error("Error during RabbitMQ clean up", e); + } + } + + /** Function for counting processed pipeline elements. */ + private static class CountingFn extends DoFn { + + private final Counter elementCounter; + + CountingFn(String namespace, String name) { + elementCounter = Metrics.counter(namespace, name); + } + + @ProcessElement + public void processElement() { + elementCounter.inc(1L); + } + } + + private void cancelIfTimeout(PipelineResult readResult, PipelineResult.State readState) + throws IOException { + if (readState == null) { + readResult.cancel(); + } + } + + private long readElementMetric(PipelineResult result) { + MetricsReader metricsReader = new MetricsReader(result, SparkReceiverIOIT.NAMESPACE); + return metricsReader.getCounterMetric(SparkReceiverIOIT.READ_ELEMENT_METRIC_NAME); + } + + private Set readMetrics(PipelineResult readResult) { + BiFunction supplier = + (reader, metricName) -> { + long start = reader.getStartTimeMetric(metricName); + long end = reader.getEndTimeMetric(metricName); + return NamedTestResult.create(TEST_ID, TIMESTAMP, metricName, (end - start) / 1e3); + }; + + NamedTestResult readTime = + supplier.apply(new MetricsReader(readResult, NAMESPACE), READ_TIME_METRIC_NAME); + NamedTestResult runTime = + NamedTestResult.create(TEST_ID, TIMESTAMP, RUN_TIME_METRIC_NAME, readTime.getValue()); + + return ImmutableSet.of(readTime, runTime); + } + + @Test + public void testSparkReceiverIOReadsInStreamingWithOffset() throws IOException { + + final 
List messages = + LongStream.range(0, sourceOptions.numRecords) + .mapToObj(number -> TEST_MESSAGE_PREFIX + number) + .collect(Collectors.toList()); + + try { + writeToRabbitMq(messages); + } catch (Exception e) { + LOG.error("Can not write to rabbit {}", e.getMessage()); + fail(); + } + LOG.info(sourceOptions.numRecords + " records were successfully written to RabbitMQ"); + + // Use streaming pipeline to read RabbitMQ records. + readPipeline.getOptions().as(Options.class).setStreaming(true); + readPipeline + .apply("Read from unbounded RabbitMq", readFromRabbitMqWithOffset()) + .setCoder(StringUtf8Coder.of()) + .apply("Measure read time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME))) + .apply("Counting element", ParDo.of(new CountingFn(NAMESPACE, READ_ELEMENT_METRIC_NAME))); + + final PipelineResult readResult = readPipeline.run(); + final PipelineResult.State readState = + readResult.waitUntilFinish(Duration.standardSeconds(options.getReadTimeout())); + + cancelIfTimeout(readResult, readState); + + assertEquals(sourceOptions.numRecords, readElementMetric(readResult)); + + if (!options.isWithTestcontainers()) { + Set metrics = readMetrics(readResult); + IOITMetrics.publishToInflux(TEST_ID, TIMESTAMP, metrics, settings); + } + } +} diff --git a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOTest.java b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOTest.java index e81dca5150e5..6931e7199926 100644 --- a/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOTest.java +++ b/sdks/java/io/sparkreceiver/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOTest.java @@ -20,14 +20,14 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertThrows; -import java.util.HashSet; -import java.util.Set; +import java.util.ArrayList; +import java.util.List; import 
org.apache.beam.sdk.coders.StringUtf8Coder; +import org.apache.beam.sdk.testing.PAssert; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.testing.TestPipelineOptions; -import org.apache.beam.sdk.transforms.DoFn; -import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.beam.sdk.values.PCollection; import org.joda.time.Duration; import org.joda.time.Instant; import org.junit.Rule; @@ -110,11 +110,13 @@ public void testReadFromCustomReceiverWithOffset() { .withTimestampFn(Instant::parse) .withSparkReceiverBuilder(receiverBuilder); + List expected = new ArrayList<>(); for (int i = 0; i < CustomReceiverWithOffset.RECORDS_COUNT; i++) { - TestOutputDoFn.EXPECTED_RECORDS.add(String.valueOf(i)); + expected.add(String.valueOf(i)); } - pipeline.apply(reader).setCoder(StringUtf8Coder.of()).apply(ParDo.of(new TestOutputDoFn())); + PCollection actual = pipeline.apply(reader).setCoder(StringUtf8Coder.of()); + PAssert.that(actual).containsInAnyOrder(expected); pipeline.run().waitUntilFinish(Duration.standardSeconds(15)); } @@ -129,28 +131,73 @@ public void testReadFromCustomReceiverWithOffsetFailsAndReread() { .withTimestampFn(Instant::parse) .withSparkReceiverBuilder(receiverBuilder); + List expected = new ArrayList<>(); for (int i = 0; i < CustomReceiverWithOffset.RECORDS_COUNT; i++) { - TestOutputDoFn.EXPECTED_RECORDS.add(String.valueOf(i)); + expected.add(String.valueOf(i)); } - pipeline.apply(reader).setCoder(StringUtf8Coder.of()).apply(ParDo.of(new TestOutputDoFn())); + PCollection actual = pipeline.apply(reader).setCoder(StringUtf8Coder.of()); + PAssert.that(actual).containsInAnyOrder(expected); pipeline.run().waitUntilFinish(Duration.standardSeconds(15)); + } + + @Test + public void testReadFromReceiverArrayBufferData() { + ReceiverBuilder receiverBuilder = + new ReceiverBuilder<>(ArrayBufferDataReceiver.class).withConstructorArgs(); + SparkReceiverIO.Read reader = + 
SparkReceiverIO.read() + .withGetOffsetFn(Long::valueOf) + .withTimestampFn(Instant::parse) + .withSparkReceiverBuilder(receiverBuilder); + + List expected = new ArrayList<>(); + for (int i = 0; i < ArrayBufferDataReceiver.RECORDS_COUNT; i++) { + expected.add(String.valueOf(i)); + } + PCollection actual = pipeline.apply(reader).setCoder(StringUtf8Coder.of()); + + PAssert.that(actual).containsInAnyOrder(expected); + pipeline.run().waitUntilFinish(Duration.standardSeconds(15)); + } + + @Test + public void testReadFromReceiverByteBufferData() { + ReceiverBuilder receiverBuilder = + new ReceiverBuilder<>(ByteBufferDataReceiver.class).withConstructorArgs(); + SparkReceiverIO.Read reader = + SparkReceiverIO.read() + .withGetOffsetFn(Long::valueOf) + .withTimestampFn(Instant::parse) + .withSparkReceiverBuilder(receiverBuilder); - assertEquals(0, TestOutputDoFn.EXPECTED_RECORDS.size()); + List expected = new ArrayList<>(); + for (int i = 0; i < ByteBufferDataReceiver.RECORDS_COUNT; i++) { + expected.add(String.valueOf(i)); + } + PCollection actual = pipeline.apply(reader).setCoder(StringUtf8Coder.of()); + + PAssert.that(actual).containsInAnyOrder(expected); + pipeline.run().waitUntilFinish(Duration.standardSeconds(15)); } - /** {@link DoFn} that throws {@code RuntimeException} if receives unexpected element. 
*/ - private static class TestOutputDoFn extends DoFn { - private static final Set EXPECTED_RECORDS = new HashSet<>(); - - @ProcessElement - public void processElement(@Element String element, OutputReceiver outputReceiver) { - if (!EXPECTED_RECORDS.contains(element)) { - throw new RuntimeException("Received unexpected element: " + element); - } else { - EXPECTED_RECORDS.remove(element); - outputReceiver.output(element); - } + @Test + public void testReadFromReceiverIteratorData() { + ReceiverBuilder receiverBuilder = + new ReceiverBuilder<>(IteratorDataReceiver.class).withConstructorArgs(); + SparkReceiverIO.Read reader = + SparkReceiverIO.read() + .withGetOffsetFn(Long::valueOf) + .withTimestampFn(Instant::parse) + .withSparkReceiverBuilder(receiverBuilder); + + List expected = new ArrayList<>(); + for (int i = 0; i < IteratorDataReceiver.RECORDS_COUNT; i++) { + expected.add(String.valueOf(i)); } + PCollection actual = pipeline.apply(reader).setCoder(StringUtf8Coder.of()); + + PAssert.that(actual).containsInAnyOrder(expected); + pipeline.run().waitUntilFinish(Duration.standardSeconds(15)); } } diff --git a/sdks/java/maven-archetypes/examples/build.gradle b/sdks/java/maven-archetypes/examples/build.gradle index 148015f43898..6a034029f10e 100644 --- a/sdks/java/maven-archetypes/examples/build.gradle +++ b/sdks/java/maven-archetypes/examples/build.gradle @@ -36,7 +36,7 @@ processResources { 'libraries-bom.version': dependencies.create(project.library.java.google_cloud_platform_libraries_bom).getVersion(), 'pubsub.version': dependencies.create(project.library.java.google_api_services_pubsub).getVersion(), 'slf4j.version': dependencies.create(project.library.java.slf4j_api).getVersion(), - 'spark.version': dependencies.create(project.library.java.spark_core).getVersion(), + 'spark.version': dependencies.create(project.library.java.spark3_core).getVersion(), 'nemo.version': dependencies.create(project.library.java.nemo_compiler_frontend_beam).getVersion(), 
'hadoop.version': dependencies.create(project.library.java.hadoop_client).getVersion(), 'mockito.version': dependencies.create(project.library.java.mockito_core).getVersion(), diff --git a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml index 50515b812078..5560ca93257e 100644 --- a/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml +++ b/sdks/java/maven-archetypes/examples/src/main/resources/archetype-resources/pom.xml @@ -220,15 +220,11 @@ spark-runner - - - 4.1.17.Final - + org.apache.beam - beam-runners-spark + beam-runners-spark-3 ${beam.version} runtime @@ -246,7 +242,7 @@ org.apache.spark - spark-streaming_2.11 + spark-streaming_2.12 ${spark.version} runtime @@ -258,26 +254,10 @@ com.fasterxml.jackson.module - jackson-module-scala_2.11 + jackson-module-scala_2.12 ${jackson.version} runtime - - - org.apache.beam - beam-sdks-java-io-google-cloud-platform - ${beam.version} - - - io.grpc - grpc-netty - - - io.netty - netty-handler - - - diff --git a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle index 0e4f394170e5..af06bfc41d8e 100644 --- a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle +++ b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle @@ -35,7 +35,7 @@ processResources { 'junit.version': dependencies.create(project.library.java.junit).getVersion(), 'pubsub.version': dependencies.create(project.library.java.google_api_services_pubsub).getVersion(), 'slf4j.version': dependencies.create(project.library.java.slf4j_api).getVersion(), - 'spark.version': dependencies.create(project.library.java.spark_core).getVersion(), + 'spark.version': dependencies.create(project.library.java.spark3_core).getVersion(), 'nemo.version': dependencies.create(project.library.java.nemo_compiler_frontend_beam).getVersion(), 'hadoop.version': 
dependencies.create(project.library.java.hadoop_client).getVersion(), 'mockito.version': dependencies.create(project.library.java.mockito_core).getVersion(), diff --git a/sdks/java/maven-archetypes/gcp-bom-examples/src/main/resources/archetype-resources/pom.xml b/sdks/java/maven-archetypes/gcp-bom-examples/src/main/resources/archetype-resources/pom.xml index 863a465f0fdc..a87a506a3a8b 100644 --- a/sdks/java/maven-archetypes/gcp-bom-examples/src/main/resources/archetype-resources/pom.xml +++ b/sdks/java/maven-archetypes/gcp-bom-examples/src/main/resources/archetype-resources/pom.xml @@ -216,13 +216,10 @@ spark-runner - - 4.1.17.Final - org.apache.beam - beam-runners-spark + beam-runners-spark-3 runtime @@ -238,7 +235,7 @@ org.apache.spark - spark-streaming_2.11 + spark-streaming_2.12 runtime @@ -249,25 +246,10 @@ com.fasterxml.jackson.module - jackson-module-scala_2.11 + jackson-module-scala_2.12 ${jackson.version} runtime - - - org.apache.beam - beam-sdks-java-io-google-cloud-platform - - - io.grpc - grpc-netty - - - io.netty - netty-handler - - - @@ -342,6 +324,13 @@ beam-sdks-java-io-google-cloud-platform + + + org.apache.beam + beam-sdks-java-extensions-python + ${beam.version} + + com.google.api-client diff --git a/sdks/java/testing/jpms-tests/build.gradle b/sdks/java/testing/jpms-tests/build.gradle index 9aa3f41b73a0..f781c29b8480 100644 --- a/sdks/java/testing/jpms-tests/build.gradle +++ b/sdks/java/testing/jpms-tests/build.gradle @@ -78,9 +78,6 @@ configurations { sparkRunnerIntegrationTest.extendsFrom(baseIntegrationTest) } -def spark_version = '3.1.1' -def spark_scala_version = '2.12' - dependencies { implementation project(path: ":sdks:java:core", configuration: "shadow") implementation project(path: ":sdks:java:extensions:google-cloud-platform-core") @@ -93,8 +90,6 @@ dependencies { flinkRunnerIntegrationTest project(":runners:flink:${project.ext.latestFlinkVersion}") dataflowRunnerIntegrationTest project(":runners:google-cloud-dataflow-java") 
sparkRunnerIntegrationTest project(":runners:spark:3") - sparkRunnerIntegrationTest "org.apache.spark:spark-sql_$spark_scala_version:$spark_version" - sparkRunnerIntegrationTest "org.apache.spark:spark-streaming_$spark_scala_version:$spark_version" } /* diff --git a/sdks/java/testing/load-tests/build.gradle b/sdks/java/testing/load-tests/build.gradle index 2d93993a5657..e157f2fabf32 100644 --- a/sdks/java/testing/load-tests/build.gradle +++ b/sdks/java/testing/load-tests/build.gradle @@ -39,7 +39,7 @@ def runnerDependency = (project.hasProperty(runnerProperty) : ":runners:direct-java") def loadTestRunnerVersionProperty = "runner.version" def loadTestRunnerVersion = project.findProperty(loadTestRunnerVersionProperty) -def shouldProvideSpark = ":runners:spark:2".equals(runnerDependency) +def isSparkRunner = runnerDependency.startsWith(":runners:spark:") def isDataflowRunner = ":runners:google-cloud-dataflow-java".equals(runnerDependency) def isDataflowRunnerV2 = isDataflowRunner && "V2".equals(loadTestRunnerVersion) def runnerConfiguration = ":runners:direct-java".equals(runnerDependency) ? "shadow" : null @@ -82,20 +82,9 @@ dependencies { gradleRun project(project.path) gradleRun project(path: runnerDependency, configuration: runnerConfiguration) - - // The Spark runner requires the user to provide a Spark dependency. For self-contained - // runs with the Spark runner, we can provide such a dependency. 
This is deliberately phrased - // to not hardcode any runner other than :runners:direct-java - if (shouldProvideSpark) { - gradleRun library.java.spark_streaming - gradleRun library.java.spark_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - gradleRun library.java.spark_sql - } } -if (shouldProvideSpark) { +if (isSparkRunner) { configurations.gradleRun { // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath exclude group: "org.slf4j", module: "slf4j-jdk14" diff --git a/sdks/java/testing/nexmark/build.gradle b/sdks/java/testing/nexmark/build.gradle index 3a8d3440c80b..a7fbf2e08ad4 100644 --- a/sdks/java/testing/nexmark/build.gradle +++ b/sdks/java/testing/nexmark/build.gradle @@ -38,8 +38,7 @@ def nexmarkRunnerDependency = project.findProperty(nexmarkRunnerProperty) ?: ":runners:direct-java" def nexmarkRunnerVersionProperty = "nexmark.runner.version" def nexmarkRunnerVersion = project.findProperty(nexmarkRunnerVersionProperty) -def shouldProvideSpark2 = ":runners:spark:2".equals(nexmarkRunnerDependency) -def shouldProvideSpark3 = ":runners:spark:3".equals(nexmarkRunnerDependency) +def isSparkRunner = nexmarkRunnerDependency.startsWith(":runners:spark:") def isDataflowRunner = ":runners:google-cloud-dataflow-java".equals(nexmarkRunnerDependency) def isDataflowRunnerV2 = isDataflowRunner && "V2".equals(nexmarkRunnerVersion) def runnerConfiguration = ":runners:direct-java".equals(nexmarkRunnerDependency) ? "shadow" : null @@ -91,39 +90,15 @@ dependencies { testImplementation project(path: ":sdks:java:testing:test-utils", configuration: "testRuntimeMigration") gradleRun project(project.path) gradleRun project(path: nexmarkRunnerDependency, configuration: runnerConfiguration) - - // The Spark runner requires the user to provide a Spark dependency. For self-contained - // runs with the Spark runner, we can provide such a dependency. 
This is deliberately phrased - // to not hardcode any runner other than :runners:direct-java - if (shouldProvideSpark2) { - gradleRun library.java.spark_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - gradleRun library.java.spark_sql - gradleRun library.java.spark_streaming - } - if (shouldProvideSpark3) { - gradleRun library.java.spark3_core, { - exclude group:"org.slf4j", module:"jul-to-slf4j" - } - - gradleRun library.java.spark3_sql - gradleRun library.java.spark3_streaming - } } -if (shouldProvideSpark2) { - configurations.gradleRun { - // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath - exclude group: "org.slf4j", module: "slf4j-jdk14" - } -} -if (shouldProvideSpark3) { +if (isSparkRunner) { configurations.gradleRun { // Using Spark runner causes a StackOverflowError if slf4j-jdk14 is on the classpath exclude group: "org.slf4j", module: "slf4j-jdk14" } } + def getNexmarkArgs = { def nexmarkArgsStr = project.findProperty(nexmarkArgsProperty) ?: "" def nexmarkArgsList = new ArrayList() @@ -155,6 +130,12 @@ def getNexmarkArgs = { } } } + + if(isSparkRunner) { + // For transparency, be explicit about configuration of local Spark + nexmarkArgsList.add("--sparkMaster=local[4]") + } + return nexmarkArgsList } @@ -162,7 +143,7 @@ def getNexmarkArgs = { // // Parameters: // -Pnexmark.runner -// Specify a runner subproject, such as ":runners:spark:2" or ":runners:flink:1.13" +// Specify a runner subproject, such as ":runners:spark:3" or ":runners:flink:1.13" // Defaults to ":runners:direct-java" // // -Pnexmark.args @@ -177,6 +158,14 @@ task run(type: JavaExec) { dependsOn ":runners:google-cloud-dataflow-java:worker:legacy-worker:shadowJar" } } + if(isSparkRunner) { + // Disable UI + systemProperty "spark.ui.enabled", "false" + systemProperty "spark.ui.showConsoleProgress", "false" + // Dataset runner only + systemProperty "spark.sql.shuffle.partitions", "4" + } + mainClass = "org.apache.beam.sdk.nexmark.Main" 
classpath = configurations.gradleRun args nexmarkArgsList.toArray() diff --git a/sdks/java/testing/tpcds/README.md b/sdks/java/testing/tpcds/README.md index 247b5cbe9300..85826e341ffb 100644 --- a/sdks/java/testing/tpcds/README.md +++ b/sdks/java/testing/tpcds/README.md @@ -55,10 +55,10 @@ To run a query using ZetaSQL planner (currently Query96 can be run using ZetaSQL ## Spark Runner -To execute TPC-DS benchmark with Query3 for 1Gb dataset on Apache Spark 2.x, run the following example command from the command line: +To execute TPC-DS benchmark with Query3 for 1Gb dataset on Apache Spark 3.x, run the following example command from the command line: ```bash -./gradlew :sdks:java:testing:tpcds:run -Ptpcds.runner=":runners:spark:2" -Ptpcds.args=" \ +./gradlew :sdks:java:testing:tpcds:run -Ptpcds.runner=":runners:spark:3" -Ptpcds.args=" \ --runner=SparkRunner \ --queries=3 \ --tpcParallel=1 \ diff --git a/sdks/java/testing/tpcds/build.gradle b/sdks/java/testing/tpcds/build.gradle index e9537cfe50ca..325222e8e8f1 100644 --- a/sdks/java/testing/tpcds/build.gradle +++ b/sdks/java/testing/tpcds/build.gradle @@ -94,7 +94,7 @@ if (isSpark) { // // Parameters: // -Ptpcds.runner -// Specify a runner subproject, such as ":runners:spark:2" or ":runners:flink:1.13" +// Specify a runner subproject, such as ":runners:spark:3" or ":runners:flink:1.13" // Defaults to ":runners:direct-java" // // -Ptpcds.args diff --git a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java index cf3c7433f08e..cd337e87d876 100644 --- a/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java +++ b/sdks/java/testing/tpcds/src/main/java/org/apache/beam/sdk/tpcds/SqlTransformRunner.java @@ -177,7 +177,6 @@ private static PCollection getTableParquet( "Read " + tableName + " (parquet)", ParquetIO.read(schema) .from(filepattern) - .withSplit() 
.withProjection(schemaProjected, schemaProjected) .withBeamSchemas(true)); } diff --git a/sdks/python/apache_beam/coders/coder_impl.py b/sdks/python/apache_beam/coders/coder_impl.py index 9e48a3a8f8e6..094687ce68d8 100644 --- a/sdks/python/apache_beam/coders/coder_impl.py +++ b/sdks/python/apache_beam/coders/coder_impl.py @@ -112,6 +112,7 @@ globals()['create_OutputStream'] = create_OutputStream globals()['ByteCountingOutputStream'] = ByteCountingOutputStream # pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports + is_compiled = True _LOGGER = logging.getLogger(__name__) @@ -611,6 +612,12 @@ class BytesCoderImpl(CoderImpl): A coder for bytes/str objects.""" def encode_to_stream(self, value, out, nested): # type: (bytes, create_OutputStream, bool) -> None + + # value might be of type np.bytes if passed from encode_batch, and cython + # does not recognize it as bytes. + if is_compiled and isinstance(value, np.bytes_): + value = bytes(value) + out.write(value, nested) def decode_from_stream(self, in_stream, nested): diff --git a/sdks/python/apache_beam/examples/complete/autocomplete_it_test.py b/sdks/python/apache_beam/examples/complete/autocomplete_it_test.py index 28312b7303b2..a19af5873186 100644 --- a/sdks/python/apache_beam/examples/complete/autocomplete_it_test.py +++ b/sdks/python/apache_beam/examples/complete/autocomplete_it_test.py @@ -27,28 +27,8 @@ from apache_beam.examples.complete import autocomplete from apache_beam.testing.test_pipeline import TestPipeline - -# Protect against environments where gcsio library is not available. 
-try: - from apache_beam.io.gcp import gcsio -except ImportError: - gcsio = None - - -def read_gcs_output_file(file_pattern): - gcs = gcsio.GcsIO() - file_names = gcs.list_prefix(file_pattern).keys() - output = [] - for file_name in file_names: - output.append(gcs.open(file_name).read().decode('utf-8').strip()) - return '\n'.join(output) - - -def create_content_input_file(path, contents): - logging.info('Creating file: %s', path) - gcs = gcsio.GcsIO() - with gcs.open(path, 'w') as f: - f.write(str.encode(contents, 'utf-8')) +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern def format_output_file(output_string): @@ -99,13 +79,13 @@ def test_autocomplete_output_files_on_small_input(self): INPUT_FILE_DIR = \ 'gs://temp-storage-for-end-to-end-tests/py-it-cloud/input' input = '/'.join([INPUT_FILE_DIR, str(uuid.uuid4()), 'input.txt']) - create_content_input_file(input, ' '.join(self.WORDS)) + create_file(input, ' '.join(self.WORDS)) extra_opts = {'input': input, 'output': output} autocomplete.run(test_pipeline.get_full_options_as_args(**extra_opts)) # Load result file and compare. 
- result = read_gcs_output_file(output).strip() + result = read_files_from_pattern('%s*' % output).strip() self.assertEqual( sorted(self.EXPECTED_PREFIXES), sorted(format_output_file(result))) diff --git a/sdks/python/apache_beam/examples/complete/distribopt_test.py b/sdks/python/apache_beam/examples/complete/distribopt_test.py index 50d20d3d62cd..b9d507410267 100644 --- a/sdks/python/apache_beam/examples/complete/distribopt_test.py +++ b/sdks/python/apache_beam/examples/complete/distribopt_test.py @@ -20,9 +20,8 @@ # pytype: skip-file import logging -import os -import tempfile import unittest +import uuid from ast import literal_eval as make_tuple import numpy as np @@ -30,7 +29,9 @@ from mock import MagicMock from mock import patch -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern FILE_CONTENTS = 'OP01,8,12,0,12\n' \ 'OP02,30,14,3,12\n' \ @@ -44,16 +45,18 @@ class DistribOptimizationTest(unittest.TestCase): - def create_file(self, path, contents): - logging.info('Creating temp file: %s', path) - with open(path, 'w') as f: - f.write(contents) - + #TODO(https://github.com/apache/beam/issues/23606) Fix and enable + @pytest.mark.sickbay_dataflow @pytest.mark.examples_postcommit def test_basics(self): + test_pipeline = TestPipeline(is_integration_test=True) + # Setup the files with expected content. 
- temp_folder = tempfile.mkdtemp() - self.create_file(os.path.join(temp_folder, 'input.txt'), FILE_CONTENTS) + temp_location = test_pipeline.get_option('temp_location') + input = '/'.join([temp_location, str(uuid.uuid4()), 'input.txt']) + output = '/'.join([temp_location, str(uuid.uuid4()), 'result']) + create_file(input, FILE_CONTENTS) + extra_opts = {'input': input, 'output': output} # Run pipeline # Avoid dependency on SciPy @@ -64,16 +67,12 @@ def test_basics(self): with patch.dict('sys.modules', modules): from apache_beam.examples.complete import distribopt - distribopt.run([ - '--input=%s/input.txt' % temp_folder, - '--output', - os.path.join(temp_folder, 'result') - ], - save_main_session=False) + distribopt.run( + test_pipeline.get_full_options_as_args(**extra_opts), + save_main_session=False) # Load result file and compare. - with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file: - lines = result_file.readlines() + lines = read_files_from_pattern('%s*' % output).splitlines() # Only 1 result self.assertEqual(len(lines), 1) diff --git a/sdks/python/apache_beam/examples/complete/estimate_pi_it_test.py b/sdks/python/apache_beam/examples/complete/estimate_pi_it_test.py index cda92e5da4cb..bf6f8fc76c11 100644 --- a/sdks/python/apache_beam/examples/complete/estimate_pi_it_test.py +++ b/sdks/python/apache_beam/examples/complete/estimate_pi_it_test.py @@ -27,21 +27,7 @@ from apache_beam.examples.complete import estimate_pi from apache_beam.testing.test_pipeline import TestPipeline - -# Protect against environments where gcsio library is not available. 
-try: - from apache_beam.io.gcp import gcsio -except ImportError: - gcsio = None - - -def read_gcs_output_file(file_pattern): - gcs = gcsio.GcsIO() - file_names = gcs.list_prefix(file_pattern).keys() - output = [] - for file_name in file_names: - output.append(gcs.open(file_name).read().decode('utf-8')) - return '\n'.join(output) +from apache_beam.testing.test_utils import read_files_from_pattern class EstimatePiIT(unittest.TestCase): @@ -55,7 +41,7 @@ def test_estimate_pi_output_file(self): extra_opts = {'output': output} estimate_pi.run(test_pipeline.get_full_options_as_args(**extra_opts)) # Load result file and compare. - result = read_gcs_output_file(output) + result = read_files_from_pattern('%s*' % output) [_, _, estimated_pi] = json.loads(result.strip()) # Note: Probabilistically speaking this test can fail with a probability # that is very small (VERY) given that we run at least 100 thousand diff --git a/sdks/python/apache_beam/examples/complete/tfidf.py b/sdks/python/apache_beam/examples/complete/tfidf.py index 16ce2b8471a7..d7829f9d1c7d 100644 --- a/sdks/python/apache_beam/examples/complete/tfidf.py +++ b/sdks/python/apache_beam/examples/complete/tfidf.py @@ -24,13 +24,13 @@ # pytype: skip-file import argparse -import glob import math import re import apache_beam as beam from apache_beam.io import ReadFromText from apache_beam.io import WriteToText +from apache_beam.io.filesystems import FileSystems from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import SetupOptions from apache_beam.pvalue import AsSingleton @@ -200,7 +200,9 @@ def run(argv=None, save_main_session=True): with beam.Pipeline(options=pipeline_options) as p: # Read documents specified by the uris command line option. 
- pcoll = read_documents(p, glob.glob(known_args.uris)) + metadata_list = FileSystems.match([known_args.uris])[0].metadata_list + uris = [metadata.path for metadata in metadata_list] + pcoll = read_documents(p, uris) # Compute TF-IDF information for each word. output = pcoll | TfIdf() # Write the output using a "Write" transform that has side effects. diff --git a/sdks/python/apache_beam/examples/complete/tfidf_it_test.py b/sdks/python/apache_beam/examples/complete/tfidf_it_test.py new file mode 100644 index 000000000000..fe1649bbfa35 --- /dev/null +++ b/sdks/python/apache_beam/examples/complete/tfidf_it_test.py @@ -0,0 +1,75 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""End-to-end test for TF-IDF example.""" + +# pytype: skip-file + +import logging +import re +import unittest +import uuid + +import pytest + +from apache_beam.examples.complete import tfidf +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern + +EXPECTED_RESULTS = set([ + ('ghi', '1.txt', 0.3662040962227032), ('abc', '1.txt', 0.0), + ('abc', '3.txt', 0.0), ('abc', '2.txt', 0.0), + ('def', '1.txt', 0.13515503603605478), ('def', '2.txt', 0.2027325540540822) +]) + +EXPECTED_LINE_RE = r'\(u?\'([a-z]*)\', \(\'.*([0-9]\.txt)\', (.*)\)\)' + + +class TfIdfIT(unittest.TestCase): + @pytest.mark.examples_postcommit + def test_basics(self): + test_pipeline = TestPipeline(is_integration_test=True) + + # Setup the files with expected content. + temp_location = test_pipeline.get_option('temp_location') + input_folder = '/'.join([temp_location, str(uuid.uuid4())]) + create_file('/'.join([input_folder, '1.txt']), 'abc def ghi') + create_file('/'.join([input_folder, '2.txt']), 'abc def') + create_file('/'.join([input_folder, '3.txt']), 'abc') + output = '/'.join([temp_location, str(uuid.uuid4()), 'result']) + + extra_opts = {'uris': '%s/**' % input_folder, 'output': output} + tfidf.run( + test_pipeline.get_full_options_as_args(**extra_opts), + save_main_session=False) + + # Parse result file and compare. 
+ results = [] + lines = read_files_from_pattern('%s*' % output).splitlines() + for line in lines: + match = re.search(EXPECTED_LINE_RE, line) + logging.info('Result line: %s', line) + if match is not None: + results.append((match.group(1), match.group(2), float(match.group(3)))) + logging.info('Computed results: %s', set(results)) + self.assertEqual(set(results), EXPECTED_RESULTS) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + unittest.main() diff --git a/sdks/python/apache_beam/examples/complete/tfidf_test.py b/sdks/python/apache_beam/examples/complete/tfidf_test.py index 07138cd11e4f..085b9e2dd186 100644 --- a/sdks/python/apache_beam/examples/complete/tfidf_test.py +++ b/sdks/python/apache_beam/examples/complete/tfidf_test.py @@ -20,19 +20,13 @@ # pytype: skip-file import logging -import os -import re -import tempfile import unittest -import pytest - import apache_beam as beam from apache_beam.examples.complete import tfidf from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to -from apache_beam.testing.util import open_shards EXPECTED_RESULTS = set([ ('ghi', '1.txt', 0.3662040962227032), ('abc', '1.txt', 0.0), @@ -40,15 +34,8 @@ ('def', '1.txt', 0.13515503603605478), ('def', '2.txt', 0.2027325540540822) ]) -EXPECTED_LINE_RE = r'\(u?\'([a-z]*)\', \(\'.*([0-9]\.txt)\', (.*)\)\)' - class TfIdfTest(unittest.TestCase): - def create_file(self, path, contents): - logging.info('Creating temp file: %s', path) - with open(path, 'wb') as f: - f.write(contents.encode('utf-8')) - def test_tfidf_transform(self): with TestPipeline() as p: @@ -65,31 +52,6 @@ def re_key(word_uri_tfidf): # To actually trigger the check the pipeline must be run (e.g. by # exiting the with context). - @pytest.mark.examples_postcommit - def test_basics(self): - # Setup the files with expected content. 
- temp_folder = tempfile.mkdtemp() - self.create_file(os.path.join(temp_folder, '1.txt'), 'abc def ghi') - self.create_file(os.path.join(temp_folder, '2.txt'), 'abc def') - self.create_file(os.path.join(temp_folder, '3.txt'), 'abc') - tfidf.run([ - '--uris=%s/*' % temp_folder, - '--output', - os.path.join(temp_folder, 'result') - ], - save_main_session=False) - # Parse result file and compare. - results = [] - with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file: - for line in result_file: - match = re.search(EXPECTED_LINE_RE, line) - logging.info('Result line: %s', line) - if match is not None: - results.append( - (match.group(1), match.group(2), float(match.group(3)))) - logging.info('Computed results: %s', set(results)) - self.assertEqual(set(results), EXPECTED_RESULTS) - if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_it_test.py b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_it_test.py index 64dcd06c8ecb..caae4a32d8e7 100644 --- a/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_it_test.py +++ b/sdks/python/apache_beam/examples/complete/top_wikipedia_sessions_it_test.py @@ -27,28 +27,8 @@ from apache_beam.examples.complete import top_wikipedia_sessions from apache_beam.testing.test_pipeline import TestPipeline - -# Protect against environments where gcsio library is not available. 
-try: - from apache_beam.io.gcp import gcsio -except ImportError: - gcsio = None - - -def read_gcs_output_file(file_pattern): - gcs = gcsio.GcsIO() - file_names = gcs.list_prefix(file_pattern).keys() - output = [] - for file_name in file_names: - output.append(gcs.open(file_name).read().decode('utf-8')) - return '\n'.join(output) - - -def create_content_input_file(path, contents): - logging.info('Creating file: %s', path) - gcs = gcsio.GcsIO() - with gcs.open(path, 'w') as f: - f.write(str.encode(contents, 'utf-8')) +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern class ComputeTopSessionsIT(unittest.TestCase): @@ -102,13 +82,13 @@ def test_top_wikipedia_sessions_output_files_on_small_input(self): INPUT_FILE_DIR = \ 'gs://temp-storage-for-end-to-end-tests/py-it-cloud/input' input = '/'.join([INPUT_FILE_DIR, str(uuid.uuid4()), 'input.txt']) - create_content_input_file(input, '\n'.join(self.EDITS)) + create_file(input, '\n'.join(self.EDITS)) extra_opts = {'input': input, 'output': output, 'sampling_threshold': '1.0'} top_wikipedia_sessions.run( test_pipeline.get_full_options_as_args(**extra_opts)) # Load result file and compare. - result = read_gcs_output_file(output).strip().splitlines() + result = read_files_from_pattern('%s*' % output).strip().splitlines() self.assertEqual(self.EXPECTED, sorted(result, key=lambda x: x.split()[0])) diff --git a/sdks/python/apache_beam/examples/cookbook/coders_it_test.py b/sdks/python/apache_beam/examples/cookbook/coders_it_test.py index c40200348507..941311ce5dc3 100644 --- a/sdks/python/apache_beam/examples/cookbook/coders_it_test.py +++ b/sdks/python/apache_beam/examples/cookbook/coders_it_test.py @@ -27,28 +27,8 @@ from apache_beam.examples.cookbook import coders from apache_beam.testing.test_pipeline import TestPipeline - -# Protect against environments where gcsio library is not available. 
-try: - from apache_beam.io.gcp import gcsio -except ImportError: - gcsio = None - - -def read_gcs_output_file(file_pattern): - gcs = gcsio.GcsIO() - file_names = gcs.list_prefix(file_pattern).keys() - output = [] - for file_name in file_names: - output.append(gcs.open(file_name).read().decode('utf-8').strip()) - return '\n'.join(output) - - -def create_content_input_file(path, contents): - logging.info('Creating file: %s', path) - gcs = gcsio.GcsIO() - with gcs.open(path, 'w') as f: - f.write(str.encode(contents, 'utf-8')) +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern def format_result(result_string): @@ -87,13 +67,12 @@ def test_coders_output_files_on_small_input(self): INPUT_FILE_DIR = \ 'gs://temp-storage-for-end-to-end-tests/py-it-cloud/input' input = '/'.join([INPUT_FILE_DIR, str(uuid.uuid4()), 'input.txt']) - create_content_input_file( - input, '\n'.join(map(json.dumps, self.SAMPLE_RECORDS))) + create_file(input, '\n'.join(map(json.dumps, self.SAMPLE_RECORDS))) extra_opts = {'input': input, 'output': output} coders.run(test_pipeline.get_full_options_as_args(**extra_opts)) # Load result file and compare. - result = read_gcs_output_file(output).strip() + result = read_files_from_pattern('%s*' % output).strip() self.assertEqual( sorted(self.EXPECTED_RESULT), sorted(format_result(result))) diff --git a/sdks/python/apache_beam/examples/cookbook/custom_ptransform_it_test.py b/sdks/python/apache_beam/examples/cookbook/custom_ptransform_it_test.py new file mode 100644 index 000000000000..9ad0c52bf23c --- /dev/null +++ b/sdks/python/apache_beam/examples/cookbook/custom_ptransform_it_test.py @@ -0,0 +1,70 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""End-to-end test for Custom PTransform example.""" +# pytype: skip-file + +import logging +import unittest +import uuid + +import pytest + +from apache_beam.examples.cookbook import custom_ptransform +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern + + +def format_result(result_string): + def format_tuple(result_elem_list): + [country, counter] = result_elem_list + return country, int(counter.strip()) + + result_list = list( + map( + lambda result_elem: format_tuple(result_elem.split(',')), + result_string.replace('\'', '').replace('[', + '').replace(']', '').replace( + '\"', '').split('\n'))) + return result_list + + +class CustomPTransformIT(unittest.TestCase): + WORDS = ['CAT', 'DOG', 'CAT', 'CAT', 'DOG'] + EXPECTED_RESULT = "('CAT DOG CAT CAT DOG', 2)" + + @pytest.mark.examples_postcommit + def test_custom_ptransform_output_files_on_small_input(self): + test_pipeline = TestPipeline(is_integration_test=True) + + # Setup the files with expected content. 
+ temp_location = test_pipeline.get_option('temp_location') + input = '/'.join([temp_location, str(uuid.uuid4()), 'input.txt']) + output = '/'.join([temp_location, str(uuid.uuid4()), 'result']) + create_file(input, ' '.join(self.WORDS)) + extra_opts = {'input': input, 'output': output} + custom_ptransform.run(test_pipeline.get_full_options_as_args(**extra_opts)) + + # Load result file and compare. + result = read_files_from_pattern('%s*' % output).strip() + self.assertEqual(result, self.EXPECTED_RESULT) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + unittest.main() diff --git a/sdks/python/apache_beam/examples/cookbook/custom_ptransform_test.py b/sdks/python/apache_beam/examples/cookbook/custom_ptransform_test.py index 4bcd09566796..cac4eae15b93 100644 --- a/sdks/python/apache_beam/examples/cookbook/custom_ptransform_test.py +++ b/sdks/python/apache_beam/examples/cookbook/custom_ptransform_test.py @@ -20,18 +20,13 @@ # pytype: skip-file import logging -import os -import tempfile import unittest -import pytest - import apache_beam as beam from apache_beam.examples.cookbook import custom_ptransform from apache_beam.testing.test_pipeline import TestPipeline from apache_beam.testing.util import assert_that from apache_beam.testing.util import equal_to -from apache_beam.testing.util import open_shards class CustomCountTest(unittest.TestCase): @@ -59,26 +54,6 @@ def run_pipeline(self, count_implementation, factor=1): assert_that( result, equal_to([('CAT', (3 * factor)), ('DOG', (2 * factor))])) - @pytest.mark.examples_postcommit - def test_custom_ptransform_output_files_on_small_input(self): - EXPECTED_RESULT = "('CAT DOG CAT CAT DOG', 2)" - - # Setup the files with expected content. 
- temp_folder = tempfile.mkdtemp() - self.create_content_input_file( - os.path.join(temp_folder, 'input.txt'), ' '.join(self.WORDS)) - custom_ptransform.run([ - '--input=%s/input.txt' % temp_folder, - '--output', - os.path.join(temp_folder, 'result') - ]) - - # Load result file and compare. - with open_shards(os.path.join(temp_folder, 'result-*-of-*')) as result_file: - result = result_file.read().strip() - - self.assertEqual(result, EXPECTED_RESULT) - if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/examples/cookbook/group_with_coder_test.py b/sdks/python/apache_beam/examples/cookbook/group_with_coder_test.py index f6bdf3e146ad..9cf36e70e45a 100644 --- a/sdks/python/apache_beam/examples/cookbook/group_with_coder_test.py +++ b/sdks/python/apache_beam/examples/cookbook/group_with_coder_test.py @@ -20,13 +20,20 @@ # pytype: skip-file import logging -import tempfile import unittest +import uuid import pytest from apache_beam.examples.cookbook import group_with_coder -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import read_files_from_pattern + +# Protect against environments where gcsio library is not available. +try: + from apache_beam.io.gcp import gcsio +except ImportError: + gcsio = None # Patch group_with_coder.PlayerCoder.decode(). To test that the PlayerCoder was # used, we do not strip the prepended 'x:' string when decoding a Player object. 
@@ -34,6 +41,16 @@ s.decode('utf-8')) +def create_content_input_file(path, records): + logging.info('Creating file: %s', path) + gcs = gcsio.GcsIO() + with gcs.open(path, 'w') as f: + for record in records: + f.write(b'%s\n' % record.encode('utf-8')) + return path + + +@unittest.skipIf(gcsio is None, 'GCP dependencies are not installed') @pytest.mark.examples_postcommit class GroupWithCoderTest(unittest.TestCase): @@ -49,27 +66,32 @@ class GroupWithCoderTest(unittest.TestCase): 'mary,1' ] - def create_temp_file(self, records): - with tempfile.NamedTemporaryFile(delete=False) as f: - for record in records: - f.write(b'%s\n' % record.encode('utf-8')) - return f.name + def setUp(self): + self.test_pipeline = TestPipeline(is_integration_test=True) + # Setup the file with expected content. + self.temp_location = self.test_pipeline.get_option('temp_location') + self.input_file = create_content_input_file( + '/'.join([self.temp_location, str(uuid.uuid4()), 'input.txt']), + self.SAMPLE_RECORDS) + #TODO(https://github.com/apache/beam/issues/23608) Fix and enable + @pytest.mark.sickbay_dataflow def test_basics_with_type_check(self): # Run the workflow with pipeline_type_check option. This will make sure # the typehints associated with all transforms will have non-default values # and therefore any custom coders will be used. In our case we want to make # sure the coder for the Player class will be used. - temp_path = self.create_temp_file(self.SAMPLE_RECORDS) + output = '/'.join([self.temp_location, str(uuid.uuid4()), 'result']) + extra_opts = {'input': self.input_file, 'output': output} group_with_coder.run( - ['--input=%s*' % temp_path, '--output=%s.result' % temp_path], + self.test_pipeline.get_full_options_as_args(**extra_opts), save_main_session=False) # Parse result file and compare. 
results = [] - with open_shards(temp_path + '.result-*-of-*') as result_file: - for line in result_file: - name, points = line.split(',') - results.append((name, int(points))) + lines = read_files_from_pattern('%s*' % output).splitlines() + for line in lines: + name, points = line.split(',') + results.append((name, int(points))) logging.info('result: %s', results) self.assertEqual( sorted(results), @@ -80,15 +102,13 @@ def test_basics_without_type_check(self): # the typehints associated with all transforms will have default values and # therefore any custom coders will not be used. The default coder (pickler) # will be used instead. - temp_path = self.create_temp_file(self.SAMPLE_RECORDS) + output = '/'.join([self.temp_location, str(uuid.uuid4()), 'result']) + extra_opts = {'input': self.input_file, 'output': output} with self.assertRaises(Exception) as context: # yapf: disable group_with_coder.run( - [ - '--no_pipeline_type_check', - '--input=%s*' % temp_path, - '--output=%s.result' % temp_path - ], + self.test_pipeline.get_full_options_as_args(**extra_opts) + + ['--no_pipeline_type_check'], save_main_session=False) self.assertIn('Unable to deterministically encode', str(context.exception)) self.assertIn('CombinePerKey(sum)/GroupByKey', str(context.exception)) diff --git a/sdks/python/apache_beam/examples/cookbook/mergecontacts_test.py b/sdks/python/apache_beam/examples/cookbook/mergecontacts_test.py index e6f790e71c23..e996084b81cd 100644 --- a/sdks/python/apache_beam/examples/cookbook/mergecontacts_test.py +++ b/sdks/python/apache_beam/examples/cookbook/mergecontacts_test.py @@ -20,13 +20,15 @@ # pytype: skip-file import logging -import tempfile import unittest +import uuid import pytest from apache_beam.examples.cookbook import mergecontacts -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import 
read_files_from_pattern class MergeContactsTest(unittest.TestCase): @@ -118,11 +120,6 @@ class MergeContactsTest(unittest.TestCase): EXPECTED_STATS = '\n'.join(['2 luddites', '1 writers', '3 nomads', '']) - def create_temp_file(self, contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - def normalize_tsv_results(self, tsv_data): """Sort .tsv file data so we can compare it with expected output.""" lines_in = tsv_data.strip().split('\n') @@ -140,25 +137,38 @@ def normalize_tsv_results(self, tsv_data): @pytest.mark.examples_postcommit def test_mergecontacts(self): - path_email = self.create_temp_file(self.CONTACTS_EMAIL) - path_phone = self.create_temp_file(self.CONTACTS_PHONE) - path_snailmail = self.create_temp_file(self.CONTACTS_SNAILMAIL) - - result_prefix = self.create_temp_file('') - - mergecontacts.run([ - '--input_email=%s' % path_email, - '--input_phone=%s' % path_phone, - '--input_snailmail=%s' % path_snailmail, - '--output_tsv=%s.tsv' % result_prefix, - '--output_stats=%s.stats' % result_prefix - ], - assert_results=(2, 1, 3), - save_main_session=False) - - with open_shards('%s.tsv-*-of-*' % result_prefix) as f: - contents = f.read() - self.assertEqual(self.EXPECTED_TSV, self.normalize_tsv_results(contents)) + test_pipeline = TestPipeline(is_integration_test=True) + + # Setup the files with expected content. 
+ temp_location = test_pipeline.get_option('temp_location') + input_folder = '/'.join([temp_location, str(uuid.uuid4())]) + path_email = create_file( + '/'.join([input_folder, 'path_email.txt']), self.CONTACTS_EMAIL) + path_phone = create_file( + '/'.join([input_folder, 'path_phone.txt']), self.CONTACTS_PHONE) + path_snailmail = create_file( + '/'.join([input_folder, 'path_snailmail.txt']), self.CONTACTS_SNAILMAIL) + + result_prefix = '/'.join([temp_location, str(uuid.uuid4()), 'result']) + extra_opts = { + 'input_email': path_email, + 'input_phone': path_phone, + 'input_snailmail': path_snailmail, + 'output_tsv': '%s.tsv' % result_prefix, + 'output_stats': '%s.stats' % result_prefix + } + + pipeline_opts = test_pipeline.get_full_options_as_args(**extra_opts) + # Prevent ambiguous option error between output in + # args and expected output_tsv and output_stats + output_arg = [i for i in pipeline_opts if i.startswith('--output=')] + if output_arg: + pipeline_opts.remove(output_arg[0]) + mergecontacts.run( + pipeline_opts, assert_results=(2, 1, 3), save_main_session=False) + + contents = read_files_from_pattern('%s*' % result_prefix) + self.assertEqual(self.EXPECTED_TSV, self.normalize_tsv_results(contents)) if __name__ == '__main__': diff --git a/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo_test.py b/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo_test.py index 62bf505fcbff..706cff70ba70 100644 --- a/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo_test.py +++ b/sdks/python/apache_beam/examples/cookbook/multiple_output_pardo_test.py @@ -21,13 +21,15 @@ import logging import re -import tempfile import unittest +import uuid import pytest from apache_beam.examples.cookbook import multiple_output_pardo -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import 
read_files_from_pattern class MultipleOutputParDo(unittest.TestCase): @@ -37,39 +39,38 @@ class MultipleOutputParDo(unittest.TestCase): EXPECTED_WORDS = [('whole', 1), ('world', 1), ('fantastic', 1), ('point', 1), ('view', 1)] - def create_temp_file(self, contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - def get_wordcount_results(self, result_path): results = [] - with open_shards(result_path) as result_file: - for line in result_file: - match = re.search(r'([A-Za-z]+): ([0-9]+)', line) - if match is not None: - results.append((match.group(1), int(match.group(2)))) + lines = read_files_from_pattern(result_path).splitlines() + for line in lines: + match = re.search(r'([A-Za-z]+): ([0-9]+)', line) + if match is not None: + results.append((match.group(1), int(match.group(2)))) return results @pytest.mark.examples_postcommit def test_multiple_output_pardo(self): - temp_path = self.create_temp_file(self.SAMPLE_TEXT) - result_prefix = temp_path + '.result' + test_pipeline = TestPipeline(is_integration_test=True) + + # Setup the files with expected content. 
+ temp_location = test_pipeline.get_option('temp_location') + input_folder = '/'.join([temp_location, str(uuid.uuid4())]) + input = create_file('/'.join([input_folder, 'input.txt']), self.SAMPLE_TEXT) + result_prefix = '/'.join([temp_location, str(uuid.uuid4()), 'result']) + extra_opts = {'input': input, 'output': result_prefix} multiple_output_pardo.run( - ['--input=%s*' % temp_path, '--output=%s' % result_prefix], + test_pipeline.get_full_options_as_args(**extra_opts), save_main_session=False) expected_char_count = len(''.join(self.SAMPLE_TEXT.split('\n'))) - with open_shards(result_prefix + '-chars-*-of-*') as f: - contents = f.read() - self.assertEqual(expected_char_count, int(contents)) + contents = read_files_from_pattern(result_prefix + '-chars*') + self.assertEqual(expected_char_count, int(contents)) - short_words = self.get_wordcount_results( - result_prefix + '-short-words-*-of-*') + short_words = self.get_wordcount_results(result_prefix + '-short-words*') self.assertEqual(sorted(short_words), sorted(self.EXPECTED_SHORT_WORDS)) - words = self.get_wordcount_results(result_prefix + '-words-*-of-*') + words = self.get_wordcount_results(result_prefix + '-words*') self.assertEqual(sorted(words), sorted(self.EXPECTED_WORDS)) diff --git a/sdks/python/apache_beam/examples/dataframe/wordcount_test.py b/sdks/python/apache_beam/examples/dataframe/wordcount_test.py index 12180b9506e4..25cd401cb9a9 100644 --- a/sdks/python/apache_beam/examples/dataframe/wordcount_test.py +++ b/sdks/python/apache_beam/examples/dataframe/wordcount_test.py @@ -23,13 +23,15 @@ import collections import logging import re -import tempfile import unittest +import uuid import pytest from apache_beam.examples.dataframe import wordcount -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern class 
WordCountTest(unittest.TestCase): @@ -41,27 +43,27 @@ class WordCountTest(unittest.TestCase): loooooonger words """ - def create_temp_file(self, contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - @pytest.mark.examples_postcommit def test_basics(self): - temp_path = self.create_temp_file(self.SAMPLE_TEXT) + test_pipeline = TestPipeline(is_integration_test=True) + # Setup the files with expected content. + temp_location = test_pipeline.get_option('temp_location') + temp_path = '/'.join([temp_location, str(uuid.uuid4())]) + input = create_file('/'.join([temp_path, 'input.txt']), self.SAMPLE_TEXT) expected_words = collections.defaultdict(int) for word in re.findall(r'[\w]+', self.SAMPLE_TEXT): expected_words[word] += 1 - wordcount.run(['--input=%s*' % temp_path, '--output=%s.result' % temp_path]) + extra_opts = {'input': input, 'output': '%s.result' % temp_path} + wordcount.run(test_pipeline.get_full_options_as_args(**extra_opts)) # Parse result file and compare. 
results = [] - with open_shards(temp_path + '.result-*') as result_file: - for line in result_file: - match = re.search(r'(\S+),([0-9]+)', line) - if match is not None: - results.append((match.group(1), int(match.group(2)))) - elif line.strip(): - self.assertEqual(line.strip(), 'word,count') + lines = read_files_from_pattern(temp_path + '.result*').splitlines() + for line in lines: + match = re.search(r'(\S+),([0-9]+)', line) + if match is not None: + results.append((match.group(1), int(match.group(2)))) + elif line.strip(): + self.assertEqual(line.strip(), 'word,count') self.assertEqual(sorted(results), sorted(expected_words.items())) diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/__init__.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/__init__.py new file mode 100644 index 000000000000..cce3acad34a4 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/config.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/config.py new file mode 100644 index 000000000000..61b9a21bea4a --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/config.py @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""The file defines global variables.""" + +PROJECT_ID = "" +REGION = "us-central1" +JOB_NAME = "benchmarking-runinference" +NUM_WORKERS = 1 +TOKENIZER_NAME = "distilbert-base-uncased-finetuned-sst-2-english" +MODEL_STATE_DICT_PATH = ( + f"gs://{PROJECT_ID}-ml-examples/{TOKENIZER_NAME}/pytorch_model.bin") +MODEL_CONFIG_PATH = TOKENIZER_NAME +IMG_NAME = "kfp-components-preprocessing/pytorch-gpu" +TAG = "latest" +DOCKER_IMG = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{IMG_NAME}:{TAG}" diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/main.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/main.py new file mode 100644 index 000000000000..7feeda4ea8e0 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/main.py @@ -0,0 +1,127 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""This file contains the pipeline for loading a ML model, and exploring +the different RunInference metrics.""" +import argparse +import logging +import sys + +import apache_beam as beam +import config as cfg +from apache_beam.ml.inference import RunInference +from apache_beam.ml.inference.base import KeyedModelHandler +from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerKeyedTensor +from pipeline.options import get_pipeline_options +from pipeline.transformations import CustomPytorchModelHandlerKeyedTensor +from pipeline.transformations import HuggingFaceStripBatchingWrapper +from pipeline.transformations import PostProcessor +from pipeline.transformations import Tokenize +from transformers import DistilBertConfig + + +def parse_arguments(argv): + """ + Parses the arguments passed to the command line and + returns them as an object + Args: + argv: The arguments passed to the command line. + Returns: + The arguments that are being passed in. + """ + parser = argparse.ArgumentParser(description="benchmark-runinference") + + parser.add_argument( + "-m", + "--mode", + help="Mode to run pipeline in.", + choices=["local", "cloud"], + default="local", + ) + parser.add_argument( + "-p", + "--project", + help="GCP project to run pipeline on.", + default=cfg.PROJECT_ID, + ) + parser.add_argument( + "-d", + "--device", + help="Device to run the dataflow job on", + choices=["CPU", "GPU"], + default="CPU", + ) + + args, _ = parser.parse_known_args(args=argv) + return args + + +def run(): + """ + Runs the pipeline that loads a transformer based text classification model + and does inference on a list of sentences. + At the end of pipeline, different metrics like latency, + throughput and others are printed. 
+ """ + args = parse_arguments(sys.argv) + + inputs = [ + "This is the worst food I have ever eaten", + "In my soul and in my heart, I’m convinced I’m wrong!", + "Be with me always—take any form—drive me mad!"\ + "only do not leave me in this abyss, where I cannot find you!", + "Do I want to live? Would you like to live with your soul in the grave?", + "Honest people don’t hide their deeds.", + "Nelly, I am Heathcliff! He’s always,"\ + "always in my mind: not as a pleasure,"\ + "any more than I am always a pleasure to myself, but as my own being.", + ] * 1000 + + pipeline_options = get_pipeline_options( + job_name=cfg.JOB_NAME, + num_workers=cfg.NUM_WORKERS, + project=args.project, + mode=args.mode, + device=args.device, + ) + model_handler_class = ( + PytorchModelHandlerKeyedTensor + if args.device == "GPU" else CustomPytorchModelHandlerKeyedTensor) + device = "cuda:0" if args.device == "GPU" else args.device + model_handler = model_handler_class( + state_dict_path=cfg.MODEL_STATE_DICT_PATH, + model_class=HuggingFaceStripBatchingWrapper, + model_params={ + "config": DistilBertConfig.from_pretrained(cfg.MODEL_CONFIG_PATH) + }, + device=device, + ) + + with beam.Pipeline(options=pipeline_options) as pipeline: + _ = ( + pipeline + | "Create inputs" >> beam.Create(inputs) + | "Tokenize" >> beam.ParDo(Tokenize(cfg.TOKENIZER_NAME)) + | "Inference" >> + RunInference(model_handler=KeyedModelHandler(model_handler)) + | "Decode Predictions" >> beam.ParDo(PostProcessor())) + metrics = pipeline.result.metrics().query(beam.metrics.MetricsFilter()) + logging.info(metrics) + + +if __name__ == "__main__": + run() diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/__init__.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/__init__.py new file mode 100644 index 000000000000..cce3acad34a4 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/__init__.py @@ -0,0 +1,16 @@ +# +# 
Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/options.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/options.py new file mode 100644 index 000000000000..b32200ed7331 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/options.py @@ -0,0 +1,74 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""This file contains the pipeline options to configure +the Dataflow pipeline.""" + +from datetime import datetime +from typing import Any + +import config as cfg +from apache_beam.options.pipeline_options import PipelineOptions + + +def get_pipeline_options( + project: str, + job_name: str, + mode: str, + device: str, + num_workers: int = cfg.NUM_WORKERS, + **kwargs: Any, +) -> PipelineOptions: + """Function to retrieve the pipeline options. + Args: + project: GCP project to run on + job_name: Base name of the job; a timestamp suffix is appended + mode: "local" selects the DirectRunner, anything else the DataflowRunner + device: "GPU" adds the accelerator flags, container image and machine type + num_workers: Number of workers for running the job in parallel + Returns: Dataflow pipeline options""" + job_name = f'{job_name}-{datetime.now().strftime("%Y%m%d%H%M%S")}' + + staging_bucket = f"gs://{cfg.PROJECT_ID}-ml-examples" + + # For a list of available options, check: + # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params#setting-other-cloud-dataflow-pipeline-options + dataflow_options = { + "runner": "DirectRunner" if mode == "local" else "DataflowRunner", + "job_name": job_name, + "project": project, + "region": cfg.REGION, + "staging_location": f"{staging_bucket}/dflow-staging", + "temp_location": f"{staging_bucket}/dflow-temp", + "setup_file": "./setup.py", + } + flags = [] + if device == "GPU": + flags = [ + "--experiment=worker_accelerator=type:nvidia-tesla-p4;count:1;"\ + "install-nvidia-driver", + "--experiment=use_runner_v2", + ] + dataflow_options.update({ + "sdk_container_image": cfg.DOCKER_IMG, + "machine_type": "n1-standard-4", + }) + + # Optional parameters + if num_workers: + dataflow_options.update({"num_workers": num_workers}) + return PipelineOptions(flags=flags, **dataflow_options) diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/transformations.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/transformations.py new file mode 100644 index 000000000000..e7f6f9d44689 --- /dev/null +++ 
b/sdks/python/apache_beam/examples/inference/runinference_metrics/pipeline/transformations.py @@ -0,0 +1,94 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""This file contains the transformations and utility functions for +the pipeline.""" +import apache_beam as beam +import torch +from apache_beam.io.filesystems import FileSystems +from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerKeyedTensor +from transformers import DistilBertForSequenceClassification +from transformers import DistilBertTokenizer + + +class CustomPytorchModelHandlerKeyedTensor(PytorchModelHandlerKeyedTensor): + """Wrapper around PytorchModelHandlerKeyedTensor to load a model on CPU.""" + def load_model(self) -> torch.nn.Module: + """Builds the model on self._device and loads its state dict from self._state_dict_path.""" + model = self._model_class(**self._model_params) + model.to(self._device) + with FileSystems.open(self._state_dict_path, "rb") as file: # close the handle once the weights are read + model.load_state_dict(torch.load(file, map_location=self._device)) + model.eval() + return model + + +# Can be removed once https://github.com/apache/beam/issues/21863 is fixed +class HuggingFaceStripBatchingWrapper(DistilBertForSequenceClassification): + """Wrapper around HuggingFace model because RunInference 
requires a batch + as a list of dicts instead of a dict of lists. Another workaround + can be found here where they disable batching instead. + https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/inference/pytorch_language_modeling.py""" + def forward(self, **kwargs): + output = super().forward(**kwargs) + return [dict(zip(output, v)) for v in zip(*output.values())] + + +class Tokenize(beam.DoFn): + """A DoFn for tokenizing texts""" + def __init__(self, model_name: str): + """Stores the model_name; the tokenizer itself is loaded in setup()""" + self._model_name = model_name + + def setup(self): + """Loads the tokenizer""" + self._tokenizer = DistilBertTokenizer.from_pretrained(self._model_name) + + def process(self, text_input: str): + """Preprocesses the text using the tokenizer""" + # We need to pad (and truncate) the token tensors to max length to make + # sure that all the tensors are of the same length and hence + # stack-able by the RunInference API, normally you would batch first + # and tokenize the batch after and pad each tensor + # to the max length in the batch. + tokens = self._tokenizer( + text_input, return_tensors="pt", padding="max_length", truncation=True, max_length=512) + # squeeze because tokenization adds an extra dimension, which is empty + # in this case because we're tokenizing one element at a time. + tokens = {key: torch.squeeze(val) for key, val in tokens.items()} + return [(text_input, tokens)] + + +class PostProcessor(beam.DoFn): + """Postprocess the RunInference output""" + def process(self, element): + """ + Takes the input text and the prediction result, and returns a dictionary + with the input text and the softmax probabilities + + Args: + element: The tuple of input text and the prediction result + + Returns: + A list of dictionaries, each containing the input text + and the softmax output. 
+ """ + text_input, prediction_result = element + softmax = ( + torch.nn.Softmax(dim=-1)( + prediction_result.inference["logits"]).detach().cpu().numpy()) + return [{"input": text_input, "softmax": softmax}] diff --git a/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py b/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py new file mode 100644 index 000000000000..d6fb9742ac4c --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/runinference_metrics/setup.py @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Setup.py module for the workflow's worker utilities. + +All the workflow related code is gathered in a package that will be built as a +source distribution, staged in the staging area for the workflow being run and +then installed in the workers when they start running. + +This behavior is triggered by specifying the --setup_file command line option +when running the workflow for remote execution. 
+""" + +import setuptools +from setuptools import find_packages + +REQUIREMENTS = [ + "apache-beam[gcp]==2.41.0", "transformers==4.21.0", "torch==1.12.0" +] + +setuptools.setup( + name="write-to-pubsub-pipeline", + version="1.1.1", + install_requires=REQUIREMENTS, + packages=find_packages(), + author="Apache Software Foundation", + author_email="dev@beam.apache.org", + py_modules=["config"], +) diff --git a/sdks/python/apache_beam/examples/ml-orchestration/README.md b/sdks/python/apache_beam/examples/ml-orchestration/README.md new file mode 100644 index 000000000000..2f886f09e582 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/README.md @@ -0,0 +1,22 @@ + + +# Example ML workflow orchestration with Kubeflow Pipelines and Tensorflow Extended + +This module contains two examples of simple, orchestrated machine learning workflows that rely on Apache Beam for data preprocessing. A detailed explanation can be found on the Beam website [here](https://beam.apache.org/documentation/ml/orchestration/) \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/Dockerfile b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/Dockerfile new file mode 100644 index 000000000000..98f9262c7f3e --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/Dockerfile @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.9-slim + +# optional: install extra dependencies + +# install python packages +# (the requirements file is currently empty +# because this is a stub ingestion example) +COPY requirements.txt / +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +# copy src files and set working directory +COPY src /src +WORKDIR /src diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/component.yaml b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/component.yaml new file mode 100644 index 000000000000..e25d240c5f77 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/component.yaml @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: Ingestion +description: Component that mimicks scraping data from the web and outputs it to a jsonlines format file +inputs: + - name: base_artifact_path + description: base path to store data + type: String +outputs: + - name: ingested_dataset_path + description: target uri for the ingested dataset + type: String +implementation: + container: + image: + command: [ + python3, + ingest.py, + --base-artifact-path, + {inputValue: base_artifact_path}, + --ingested-dataset-path, + {outputPath: ingested_dataset_path} + ] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/requirements.txt new file mode 100644 index 000000000000..91eacc92e8be --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/requirements.txt @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
\ No newline at end of file diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/src/ingest.py b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/src/ingest.py new file mode 100644 index 000000000000..5369e95bf927 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/src/ingest.py @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Ingestion function that fetches data from one file and simply copies it to another.""" + +import argparse +import time +from pathlib import Path + + +def parse_args(): + """Parse ingestion arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--ingested-dataset-path", + type=str, + help="Path to save the ingested dataset to.", + required=True) + parser.add_argument( + "--base-artifact-path", + type=str, + help="Base path to store pipeline artifacts.", + required=True) + return parser.parse_args() + + +def ingest_data(ingested_dataset_path: str, base_artifact_path: str): + """Data ingestion step that returns an uri + to the data it has 'ingested' as jsonlines. 
+ + Args: + ingested_dataset_path (str): file to which the uri of the ingested data is written + base_artifact_path (str): base path under which the ingested data is stored""" + # timestamp as unique id for the component execution + timestamp = int(time.time()) + + # create directory to store the actual data + target_path = f"{base_artifact_path}/ingestion/ingested_dataset_{timestamp}.jsonl" + # if the target path is a google cloud storage path convert the path to the gcsfuse path + target_path_gcsfuse = target_path.replace("gs://", "/gcs/") + Path(target_path_gcsfuse).parent.mkdir(parents=True, exist_ok=True) + + with open(target_path_gcsfuse, 'w') as f: + f.writelines([ + """{"image_id": 318556, "id": 255, "caption": "An angled view of a beautifully decorated bathroom.", "image_url": "http://farm4.staticflickr.com/3133/3378902101_3c9fa16b84_z.jpg", "image_name": "COCO_train2014_000000318556.jpg", "image_license": "Attribution-NonCommercial-ShareAlike License"}\n""", + """{"image_id": 476220, "id": 314, "caption": "An empty kitchen with white and black appliances.", "image_url": "http://farm7.staticflickr.com/6173/6207941582_b69380c020_z.jpg", "image_name": "COCO_train2014_000000476220.jpg", "image_license": "Attribution-NonCommercial License"}\n""", + """{"image_id": 134754, "id": 425, "caption": "Two people carrying surf boards on a beach.", "image_url": "http://farm9.staticflickr.com/8500/8398513396_b6a1f11a4b_z.jpg", "image_name": "COCO_train2014_000000134754.jpg", "image_license": "Attribution-NonCommercial-NoDerivs License"}""" + ]) + + # the directory where the output file is created may or may not exist + # so we have to create it. + # KFP v1 components can only write output to files. 
The output of this + # component is written to ingested_dataset_path and contains the path + # of the actual ingested data + Path(ingested_dataset_path).parent.mkdir(parents=True, exist_ok=True) + with open(ingested_dataset_path, 'w') as f: + f.write(target_path) + + +if __name__ == "__main__": + args = parse_args() + ingest_data(**vars(args)) diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/Dockerfile b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/Dockerfile new file mode 100644 index 000000000000..f46feded1da4 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/Dockerfile @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# [START component_dockerfile] +FROM python:3.9-slim + +# (Optional) install extra dependencies + +# install pypi dependencies +COPY requirements.txt / +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +# copy src files and set working directory +COPY src /src +WORKDIR /src +# [END component_dockerfile] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/component.yaml b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/component.yaml new file mode 100644 index 000000000000..f64c3c11fb69 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/component.yaml @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START preprocessing_component_definition] +name: preprocessing +description: Component that preprocesses the ingested dataset with an Apache Beam pipeline and writes the result to Avro files +inputs: + - name: ingested_dataset_path + description: uri of the ingested dataset to preprocess + type: String + - name: base_artifact_path + description: base path to store data + type: String + - name: gcp_project_id + description: ID for the google cloud project to deploy the pipeline to. 
+ type: String + - name: region + description: Region in which to deploy the Dataflow pipeline. + type: String + - name: dataflow_staging_root + description: Path to staging directory for the dataflow runner. + type: String + - name: beam_runner + description: Beam runner, DataflowRunner or DirectRunner. + type: String +outputs: + - name: preprocessed_dataset_path + description: target uri for the preprocessed dataset + type: String +implementation: + container: + image: + command: [ + python3, + preprocess.py, + --ingested-dataset-path, + {inputValue: ingested_dataset_path}, + --base-artifact-path, + {inputValue: base_artifact_path}, + --preprocessed-dataset-path, + {outputPath: preprocessed_dataset_path}, + --gcp-project-id, + {inputValue: gcp_project_id}, + --region, + {inputValue: region}, + --dataflow-staging-root, + {inputValue: dataflow_staging_root}, + --beam-runner, + {inputValue: beam_runner}, + ] +# [END preprocessing_component_definition] + diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt new file mode 100644 index 000000000000..2ebbd8cf2149 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/requirements.txt @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apache_beam[gcp]==2.40.0 +requests==2.28.1 +torch==1.12.0 +torchvision==0.13.0 +numpy==1.22.4 +Pillow==9.2.0 diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py new file mode 100644 index 000000000000..7cf6d6ead4a3 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py @@ -0,0 +1,208 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Functionality for the data preprocessing step.""" + +import re +import json +import io +import argparse +import time +from pathlib import Path +import logging +from collections.abc import Iterable + +import requests +from PIL import Image, UnidentifiedImageError +import numpy as np +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as TF +import apache_beam as beam +from apache_beam.options.pipeline_options import PipelineOptions + +IMAGE_SIZE = (224, 224) + + +# [START preprocess_component_argparse] +def parse_args(): + """Parse preprocessing arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--ingested-dataset-path", + type=str, + help="Path to the ingested dataset", + required=True) + parser.add_argument( + "--preprocessed-dataset-path", + type=str, + help="File to write the path of the preprocessed dataset to.", + required=True) + parser.add_argument( + "--base-artifact-path", + type=str, + help="Base path to store pipeline artifacts.", + required=True) + parser.add_argument( + "--gcp-project-id", + type=str, + help="ID for the google cloud project to deploy the pipeline to.", + required=True) + parser.add_argument( + "--region", + type=str, + help="Region in which to deploy the pipeline.", + required=True) + parser.add_argument( + "--dataflow-staging-root", + type=str, + help="Path to staging directory for dataflow.", + required=True) + parser.add_argument( + "--beam-runner", + type=str, + help="Beam runner: DataflowRunner or DirectRunner.", + default="DirectRunner") + + return parser.parse_args() + + +# [END preprocess_component_argparse] + + +def preprocess_dataset( + ingested_dataset_path: str, + preprocessed_dataset_path: str, + base_artifact_path: str, + gcp_project_id: str, + region: str, + dataflow_staging_root: str, + beam_runner: str): + """Preprocess the ingested raw dataset and write the result to avro format. 
+ + Args: + ingested_dataset_path (str): Path to the ingested dataset + preprocessed_dataset_path (str): Path to where the preprocessed dataset will be saved + base_artifact_path (str): path to the base directory of where artifacts can be stored for + this component. + gcp_project_id (str): ID for the google cloud project to deploy the pipeline to. + region (str): Region in which to deploy the pipeline. + dataflow_staging_root (str): Path to staging directory for the dataflow runner. + beam_runner (str): Beam runner: DataflowRunner or DirectRunner. + """ + # [START kfp_component_input_output] + timestamp = time.time() + target_path = f"{base_artifact_path}/preprocessing/preprocessed_dataset_{timestamp}" + + # the directory where the output file is created may or may not exists + # so we have to create it. + Path(preprocessed_dataset_path).parent.mkdir(parents=True, exist_ok=True) + with open(preprocessed_dataset_path, 'w') as f: + f.write(target_path) + # [END kfp_component_input_output] + + # [START deploy_preprocessing_beam_pipeline] + # We use the save_main_session option because one or more DoFn's in this + # workflow rely on global context (e.g., a module imported at module level). 
+ pipeline_options = PipelineOptions( + runner=beam_runner, + project=gcp_project_id, + job_name=f'preprocessing-{int(time.time())}', + temp_location=dataflow_staging_root, + region=region, + requirements_file="/requirements.txt", + save_main_session=True, + ) + + with beam.Pipeline(options=pipeline_options) as pipeline: + ( + pipeline + | "Read input jsonlines file" >> + beam.io.ReadFromText(ingested_dataset_path) + | "Load json" >> beam.Map(json.loads) + | "Filter licenses" >> beam.Filter(valid_license) + | "Download image from URL" >> beam.FlatMap(download_image_from_url) + | "Resize image" >> beam.Map(resize_image, size=IMAGE_SIZE) + | "Clean Text" >> beam.Map(clean_text) + | "Serialize Example" >> beam.Map(serialize_example) + | "Write to Avro files" >> beam.io.WriteToAvro( + file_path_prefix=target_path, + schema={ + "namespace": "preprocessing.example", + "type": "record", + "name": "Sample", + "fields": [{ + "name": "id", "type": "int" + }, { + "name": "caption", "type": "string" + }, { + "name": "image", "type": "bytes" + }] + }, + file_name_suffix=".avro")) + # [END deploy_preprocessing_beam_pipeline] + + +def download_image_from_url(element: dict) -> Iterable[dict]: + """Download the image at element['image_url'] and yield the element with it as a tensor; failed downloads and undecodable payloads are logged and dropped.""" + try: + response = requests.get(element['image_url'], timeout=60) # never wait forever on a stalled host + image = Image.open(io.BytesIO(response.content)) + image = T.ToTensor()(image) + yield {**element, 'image': image} + except (requests.RequestException, UnidentifiedImageError) as e: + logging.exception(e) + + +def resize_image(element: dict, size=(256, 256)): + "Resize the element's image tensor to the target resolution." 
+ image = TF.resize(element['image'], size) + return {**element, 'image': image} + + +def clean_text(element: dict): + """Perform a series of string cleaning operations.""" + text = element['caption'] + text = text.lower() # lower case + text = re.sub(r"http\S+", "", text) # remove urls + text = re.sub(r"\s+", " ", text) # remove extra spaces (including \n and \t) + text = re.sub( + r"[()[\].,|:;?!=+~\-\/{}]", ",", + text) # all punctuation is replaced with commas + text = f" {text}" # always start with a space + text = text.strip(',') # remove commas at the start or end of the caption + text = text[:-1] if text and text[-1] == "," else text + text = text[1:] if text and text[0] == "," else text + return {**element, "preprocessed_caption": text} + + +def valid_license(element): + """Checks whether an element's image has the correct license for our use case.""" + image_license = element['image_license'] + return image_license in ["Attribution License", "No known copyright restrictions"] + + +def serialize_example(element): + """Serialize an element's image.""" + buffer = io.BytesIO() + torch.save(element['image'], buffer) + buffer.seek(0) + image = buffer.read() + return {**element, 'image': image} + + +if __name__ == "__main__": + args = parse_args() + preprocess_dataset(**vars(args)) diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/Dockerfile b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/Dockerfile new file mode 100644 index 000000000000..8e2bf86d8113 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/Dockerfile @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM python:3.9-slim + +# optional install extra dependencies + +# install pypi dependencies +COPY requirements.txt / +RUN python3 -m pip install --no-cache-dir -r requirements.txt + +# copy src files and set working directory +COPY src /src +WORKDIR /src diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/component.yaml b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/component.yaml new file mode 100644 index 000000000000..240ed13acf9a --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/component.yaml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: model training +description: Train a pytorch model +inputs: + - name: base_artifact_path + description: base path to store data + type: String + - name: preprocessed_dataset_path + description: path to the preprocessed dataset + type: String +outputs: + - name: trained_model_path + description: trained model file + type: String +implementation: + container: + image: + command: [ + python3, + train.py, + --preprocessed-dataset-path, + {inputValue: preprocessed_dataset_path}, + --base-artifact-path, + {inputValue: base_artifact_path}, + --trained-model-path, + {outputPath: trained_model_path} + ] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt new file mode 100644 index 000000000000..72eb22959697 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/requirements.txt @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +torch==1.12.0 +numpy==1.22.4 +Pillow==9.2.0 \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/src/train.py b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/src/train.py new file mode 100644 index 000000000000..b15473f22483 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/src/train.py @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Simple training function that loads a pretrained model from the torch hub and saves it.""" + +import argparse +import time +from pathlib import Path + +import torch + + +def parse_args(): + """Parse ingestion arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--preprocessed-dataset-path", + type=str, + help="Path to the preprocessed dataset.", + required=True) + parser.add_argument( + "--trained-model-path", + type=str, + help="Output path to the trained model.", + required=True) + parser.add_argument( + "--base-artifact-path", + type=str, + help="Base path to store pipeline artifacts.", + required=True) + return parser.parse_args() + + +def train_model( + preprocessed_dataset_path: str, + trained_model_path: str, + base_artifact_path: str): + """Placeholder method to load a model from the torch hub and save it. + + Args: + preprocessed_dataset_path (str): Path to the preprocessed dataset + trained_model_path (str): Output path for the trained model + base_artifact_path (str): path to the base directory of where artifacts can be stored for + this component + """ + # timestamp for the component execution + timestamp = time.time() + + # create model or load a pretrained one + model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16', pretrained=True) + + # to implement: train on preprocessed dataset + # + + # create directory to export the model to + target_path = f"{base_artifact_path}/training/trained_model_{timestamp}.pt" + target_path_gcsfuse = target_path.replace("gs://", "/gcs/") + Path(target_path_gcsfuse).parent.mkdir(parents=True, exist_ok=True) + + # save and export the model + torch.save(model.state_dict(), target_path_gcsfuse) + + # Write the model path to the component output file + Path(trained_model_path).parent.mkdir(parents=True, exist_ok=True) + with open(trained_model_path, 'w') as f: + f.write(target_path) + + +if __name__ == "__main__": + args = parse_args() + train_model(**vars(args)) diff --git 
a/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.json b/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.json new file mode 100644 index 000000000000..a39bb8d253ad --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.json @@ -0,0 +1,247 @@ +{ + "pipelineSpec": { + "components": { + "comp-ingestion": { + "executorLabel": "exec-ingestion", + "inputDefinitions": { + "parameters": { + "base_artifact_path": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "ingested_dataset_path": { + "type": "STRING" + } + } + } + }, + "comp-model-training": { + "executorLabel": "exec-model-training", + "inputDefinitions": { + "parameters": { + "base_artifact_path": { + "type": "STRING" + }, + "preprocessed_dataset_path": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "trained_model_path": { + "type": "STRING" + } + } + } + }, + "comp-preprocessing": { + "executorLabel": "exec-preprocessing", + "inputDefinitions": { + "parameters": { + "base_artifact_path": { + "type": "STRING" + }, + "beam_runner": { + "type": "STRING" + }, + "dataflow_staging_root": { + "type": "STRING" + }, + "gcp_project_id": { + "type": "STRING" + }, + "ingested_dataset_path": { + "type": "STRING" + }, + "region": { + "type": "STRING" + } + } + }, + "outputDefinitions": { + "parameters": { + "preprocessed_dataset_path": { + "type": "STRING" + } + } + } + } + }, + "deploymentSpec": { + "executors": { + "exec-ingestion": { + "container": { + "command": [ + "python3", + "ingest.py", + "--base-artifact-path", + "{{$.inputs.parameters['base_artifact_path']}}", + "--ingested-dataset-path", + "{{$.outputs.parameters['ingested_dataset_path'].output_file}}" + ], + "image": "" + } + }, + "exec-model-training": { + "container": { + "command": [ + "python3", + "train.py", + "--preprocessed-dataset-path", + "{{$.inputs.parameters['preprocessed_dataset_path']}}", + "--base-artifact-path", + 
"{{$.inputs.parameters['base_artifact_path']}}", + "--trained-model-path", + "{{$.outputs.parameters['trained_model_path'].output_file}}" + ], + "image": "" + } + }, + "exec-preprocessing": { + "container": { + "command": [ + "python3", + "preprocess.py", + "--ingested-dataset-path", + "{{$.inputs.parameters['ingested_dataset_path']}}", + "--base-artifact-path", + "{{$.inputs.parameters['base_artifact_path']}}", + "--preprocessed-dataset-path", + "{{$.outputs.parameters['preprocessed_dataset_path'].output_file}}", + "--gcp-project-id", + "{{$.inputs.parameters['gcp_project_id']}}", + "--region", + "{{$.inputs.parameters['region']}}", + "--dataflow-staging-root", + "{{$.inputs.parameters['dataflow_staging_root']}}", + "--beam-runner", + "{{$.inputs.parameters['beam_runner']}}" + ], + "image": "" + } + } + } + }, + "pipelineInfo": { + "name": "beam-preprocessing-kfp-example" + }, + "root": { + "dag": { + "tasks": { + "ingestion": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-ingestion" + }, + "inputs": { + "parameters": { + "base_artifact_path": { + "componentInputParameter": "component_artifact_root" + } + } + }, + "taskInfo": { + "name": "ingestion" + } + }, + "model-training": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-model-training" + }, + "dependentTasks": [ + "preprocessing" + ], + "inputs": { + "parameters": { + "base_artifact_path": { + "componentInputParameter": "component_artifact_root" + }, + "preprocessed_dataset_path": { + "taskOutputParameter": { + "outputParameterKey": "preprocessed_dataset_path", + "producerTask": "preprocessing" + } + } + } + }, + "taskInfo": { + "name": "model-training" + } + }, + "preprocessing": { + "cachingOptions": { + "enableCache": true + }, + "componentRef": { + "name": "comp-preprocessing" + }, + "dependentTasks": [ + "ingestion" + ], + "inputs": { + "parameters": { + "base_artifact_path": { + "componentInputParameter": 
"component_artifact_root" + }, + "beam_runner": { + "componentInputParameter": "beam_runner" + }, + "dataflow_staging_root": { + "componentInputParameter": "dataflow_staging_root" + }, + "gcp_project_id": { + "componentInputParameter": "gcp_project_id" + }, + "ingested_dataset_path": { + "taskOutputParameter": { + "outputParameterKey": "ingested_dataset_path", + "producerTask": "ingestion" + } + }, + "region": { + "componentInputParameter": "region" + } + } + }, + "taskInfo": { + "name": "preprocessing" + } + } + } + }, + "inputDefinitions": { + "parameters": { + "beam_runner": { + "type": "STRING" + }, + "component_artifact_root": { + "type": "STRING" + }, + "dataflow_staging_root": { + "type": "STRING" + }, + "gcp_project_id": { + "type": "STRING" + }, + "region": { + "type": "STRING" + } + } + } + }, + "schemaVersion": "2.0.0", + "sdkVersion": "kfp-1.8.14" + }, + "runtimeConfig": { + "gcsOutputDirectory": "gs://test/test" + } +} \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py b/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py new file mode 100644 index 000000000000..f687f3dd6477 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py @@ -0,0 +1,132 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +import kfp +from kfp import components as comp +from kfp.v2 import dsl +from kfp.v2.compiler import Compiler + + +def parse_args(): + """Parse arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--gcp-project-id", + type=str, + help="ID for the google cloud project to deploy the pipeline to.", + required=True) + parser.add_argument( + "--region", + type=str, + help="Region in which to deploy the pipeline.", + required=True) + parser.add_argument( + "--pipeline-root", + type=str, + help= + "Path to artifact repository where Kubeflow Pipelines stores a pipeline’s artifacts.", + required=True) + parser.add_argument( + "--component-artifact-root", + type=str, + help= + "Path to artifact repository where Kubeflow Pipelines components can store artifacts.", + required=True) + parser.add_argument( + "--dataflow-staging-root", + type=str, + help="Path to staging directory for dataflow.", + required=True) + parser.add_argument( + "--beam-runner", + type=str, + help="Beam runner: DataflowRunner or DirectRunner.", + default="DirectRunner") + return parser.parse_args() + + +# arguments are parsed as a global variable so +# they can be used in the pipeline decorator below +ARGS = parse_args() +PIPELINE_ROOT = vars(ARGS)['pipeline_root'] + +# [START load_kfp_components] +# load the kfp components from their yaml files +DataIngestOp = comp.load_component('components/ingestion/component.yaml') +DataPreprocessingOp = comp.load_component( + 'components/preprocessing/component.yaml') +TrainModelOp = comp.load_component('components/train/component.yaml') +# [END load_kfp_components] + + +# [START define_kfp_pipeline] +@dsl.pipeline( + pipeline_root=PIPELINE_ROOT, + name="beam-preprocessing-kfp-example", + description="Pipeline to show an apache beam preprocessing example in KFP") +def pipeline( + gcp_project_id: str, + region: str, + 
component_artifact_root: str, + dataflow_staging_root: str, + beam_runner: str): + """KFP pipeline definition. + + Args: + gcp_project_id (str): ID for the google cloud project to deploy the pipeline to. + region (str): Region in which to deploy the pipeline. + component_artifact_root (str): Path to artifact repository where Kubeflow Pipelines + components can store artifacts. + dataflow_staging_root (str): Path to staging directory for the dataflow runner. + beam_runner (str): Beam runner: DataflowRunner or DirectRunner. + """ + + ingest_data_task = DataIngestOp(base_artifact_path=component_artifact_root) + + data_preprocessing_task = DataPreprocessingOp( + ingested_dataset_path=ingest_data_task.outputs["ingested_dataset_path"], + base_artifact_path=component_artifact_root, + gcp_project_id=gcp_project_id, + region=region, + dataflow_staging_root=dataflow_staging_root, + beam_runner=beam_runner) + + train_model_task = TrainModelOp( + preprocessed_dataset_path=data_preprocessing_task. + outputs["preprocessed_dataset_path"], + base_artifact_path=component_artifact_root) + + +# [END define_kfp_pipeline] + +if __name__ == "__main__": + # [START compile_kfp_pipeline] + Compiler().compile(pipeline_func=pipeline, package_path="pipeline.json") + # [END compile_kfp_pipeline] + + run_arguments = vars(ARGS) + del run_arguments['pipeline_root'] + + # [START execute_kfp_pipeline] + client = kfp.Client() + experiment = client.create_experiment("KFP orchestration example") + run_result = client.run_pipeline( + experiment_id=experiment.id, + job_name="KFP orchestration job", + pipeline_package_path="pipeline.json", + params=run_arguments) + # [END execute_kfp_pipeline] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/kfp/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/kfp/requirements.txt new file mode 100644 index 000000000000..7b2ec602a0a2 --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/kfp/requirements.txt @@ -0,0 +1,18 
@@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# requirements to compile the pipeline and execute it on Dataflow +kfp==1.8.13 +google-cloud-aiplatform==1.15 \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py b/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py new file mode 100644 index 000000000000..2204285a14bf --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Preprocessing example with TFX with the LocalDagRunner and +either the beam DirectRunner or DataflowRunner""" +import argparse +import os + +from tfx import v1 as tfx + + +def parse_args(): + """Parse arguments.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--gcp-project-id", + type=str, + help="ID for the google cloud project to deploy the pipeline to.", + required=True) + parser.add_argument( + "--region", + type=str, + help="Region in which to deploy the pipeline.", + required=True) + parser.add_argument( + "--pipeline-name", + type=str, + help="Name for the Beam pipeline.", + required=True) + parser.add_argument( + "--pipeline-root", + type=str, + help= + "Path to artifact repository where TFX stores a pipeline’s artifacts.", + required=True) + parser.add_argument( + "--csv-file", type=str, help="Path to the csv input file.", required=True) + parser.add_argument( + "--csv-file", type=str, help="Path to the csv input file.", required=True) + parser.add_argument( + "--module-file", + type=str, + help="Path to module file containing the preprocessing_fn and run_fn.", + default="coco_captions_utils.py") + parser.add_argument( + "--beam-runner", + type=str, + help="Beam runner: DataflowRunner or DirectRunner.", + default="DirectRunner") + parser.add_argument( + "--metadata-file", + type=str, + help="Path to store a metadata file as a mock metadata database", + default="metadata.db") + return parser.parse_args() + + +# [START tfx_pipeline] +def create_pipeline( + gcp_project_id, + region, + pipeline_name, + pipeline_root, + csv_file, + module_file, + beam_runner, + metadata_file): + """Create the TFX pipeline. + + Args: + gcp_project_id (str): ID for the google cloud project to deploy the pipeline to. + region (str): Region in which to deploy the pipeline. 
+ pipeline_name (str): Name for the Beam pipeline + pipeline_root (str): Path to artifact repository where TFX + stores a pipeline’s artifacts. + csv_file (str): Path to the csv input file. + module_file (str): Path to module file containing the preprocessing_fn and run_fn. + beam_runner (str): Beam runner: DataflowRunner or DirectRunner. + metadata_file (str): Path to store a metadata file as a mock metadata database. + """ + example_gen = tfx.components.CsvExampleGen(input_base=csv_file) + + # Computes statistics over data for visualization and example validation. + statistics_gen = tfx.components.StatisticsGen( + examples=example_gen.outputs['examples']) + + schema_gen = tfx.components.SchemaGen( + statistics=statistics_gen.outputs['statistics'], infer_feature_shape=True) + + transform = tfx.components.Transform( + examples=example_gen.outputs['examples'], + schema=schema_gen.outputs['schema'], + module_file=module_file) + + trainer = tfx.components.Trainer( + module_file=module_file, + examples=transform.outputs['transformed_examples'], + transform_graph=transform.outputs['transform_graph']) + + components = [example_gen, statistics_gen, schema_gen, transform, trainer] + + beam_pipeline_args_by_runner = { + 'DirectRunner': [], + 'DataflowRunner': [ + '--runner=DataflowRunner', + '--project=' + gcp_project_id, + '--temp_location=' + os.path.join(pipeline_root, 'tmp'), + '--region=' + region, + ] + } + + return tfx.dsl.Pipeline( + pipeline_name=pipeline_name, + pipeline_root=pipeline_root, + components=components, + enable_cache=True, + metadata_connection_config=tfx.orchestration.metadata. 
+ sqlite_metadata_connection_config(metadata_file), + beam_pipeline_args=beam_pipeline_args_by_runner[beam_runner]) + + +# [END tfx_pipeline] + +if __name__ == "__main__": + + # [START tfx_execute_pipeline] + args = parse_args() + tfx.orchestration.LocalDagRunner().run(create_pipeline(**vars(args))) + # [END tfx_execute_pipeline] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py b/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py new file mode 100644 index 000000000000..c28f54cae19e --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Implementation of the tfx component functions for the coco captions example.""" + +import tempfile + +import tensorflow as tf +import tensorflow_transform as tft +import tensorflow_transform.beam as tft_beam +from tfx import v1 as tfx + + +# [START tfx_run_fn] +def run_fn(fn_args: tfx.components.FnArgs) -> None: + """Build the TF model, train it and export it.""" + # create a model + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(1, input_dim=10)) + model.compile() + + # train the model on the preprocessed data + # model.fit(...) + + # Save model to fn_args.serving_model_dir. + model.save(fn_args.serving_model_dir) + + +# [END tfx_run_fn] + + +# [START tfx_preprocessing_fn] +def preprocessing_fn(inputs): + """Transform raw data.""" + # convert the captions to lowercase + # split the captions into separate words + lower = tf.strings.lower(inputs['caption']) + + # compute the vocabulary of the captions during a full pass + # over the dataset and use this to tokenize. + mean_length = tft.mean(tf.strings.length(lower)) + # + + return { + 'caption_lower': lower, + } + + +# [END tfx_preprocessing_fn] + +# [START tfx_analyze_and_transform] +if __name__ == "__main__": + # Test processing_fn directly without the tfx pipeline + raw_data = [ + { + "caption": "A bicycle replica with a clock as the front wheel." + }, { + "caption": "A black Honda motorcycle parked in front of a garage." + }, { + "caption": "A room with blue walls and a white sink and door." 
+ } + ] + + # define the feature_spec (in a tfx pipeline this would be generated by a SchemaGen component) + feature_spec = dict(caption=tf.io.FixedLenFeature([], tf.string)) + raw_data_metadata = tft.DatasetMetadata.from_feature_spec(feature_spec) + + # test out the beam implementation of the + # processing_fn with AnalyzeAndTransformDataset + with tft_beam.Context(temp_dir=tempfile.mkdtemp()): + transformed_dataset, transform_fn = ( + (raw_data, raw_data_metadata) + | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn)) + transformed_data, transformed_metadata = transformed_dataset +# [END tfx_analyze_and_transform] diff --git a/sdks/python/apache_beam/examples/ml-orchestration/tfx/requirements.txt b/sdks/python/apache_beam/examples/ml-orchestration/tfx/requirements.txt new file mode 100644 index 000000000000..3e43eb6dc3cb --- /dev/null +++ b/sdks/python/apache_beam/examples/ml-orchestration/tfx/requirements.txt @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +tfx==1.9.0 +tensorflow==2.9.1 \ No newline at end of file diff --git a/sdks/python/apache_beam/examples/wordcount_debugging_test.py b/sdks/python/apache_beam/examples/wordcount_debugging_test.py index a36354b389f6..8a19bce777ad 100644 --- a/sdks/python/apache_beam/examples/wordcount_debugging_test.py +++ b/sdks/python/apache_beam/examples/wordcount_debugging_test.py @@ -21,13 +21,15 @@ import logging import re -import tempfile import unittest +import uuid import pytest from apache_beam.examples import wordcount_debugging -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern @pytest.mark.examples_postcommit @@ -35,25 +37,25 @@ class WordCountDebuggingTest(unittest.TestCase): SAMPLE_TEXT = 'xx yy Flourish\n zz Flourish Flourish stomach\n aa\n bb cc dd' - def create_temp_file(self, contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - def get_results(self, temp_path): results = [] - with open_shards(temp_path + '.result-*-of-*') as result_file: - for line in result_file: - match = re.search(r'([A-Za-z]+): ([0-9]+)', line) - if match is not None: - results.append((match.group(1), int(match.group(2)))) + lines = read_files_from_pattern(temp_path + '.result*').splitlines() + for line in lines: + match = re.search(r'([A-Za-z]+): ([0-9]+)', line) + if match is not None: + results.append((match.group(1), int(match.group(2)))) return results def test_basics(self): - temp_path = self.create_temp_file(self.SAMPLE_TEXT) + test_pipeline = TestPipeline(is_integration_test=True) + # Setup the files with expected content. 
+ temp_location = test_pipeline.get_option('temp_location') + temp_path = '/'.join([temp_location, str(uuid.uuid4())]) + input = create_file('/'.join([temp_path, 'input.txt']), self.SAMPLE_TEXT) + extra_opts = {'input': input, 'output': '%s.result' % temp_path} expected_words = [('Flourish', 3), ('stomach', 1)] wordcount_debugging.run( - ['--input=%s*' % temp_path, '--output=%s.result' % temp_path], + test_pipeline.get_full_options_as_args(**extra_opts), save_main_session=False) # Parse result file and compare. diff --git a/sdks/python/apache_beam/examples/wordcount_minimal_test.py b/sdks/python/apache_beam/examples/wordcount_minimal_test.py index 732b808cc33a..b4882d2dd32c 100644 --- a/sdks/python/apache_beam/examples/wordcount_minimal_test.py +++ b/sdks/python/apache_beam/examples/wordcount_minimal_test.py @@ -35,13 +35,15 @@ import collections import logging import re -import tempfile import unittest +import uuid import pytest from apache_beam.examples import wordcount_minimal -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern @pytest.mark.examples_postcommit @@ -50,26 +52,29 @@ class WordCountMinimalTest(unittest.TestCase): SAMPLE_TEXT = 'a b c a b a\n aa bb cc aa bb aa' - def create_temp_file(self, contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - def test_basics(self): - temp_path = self.create_temp_file(self.SAMPLE_TEXT) + test_pipeline = TestPipeline(is_integration_test=True) + + # Setup the files with expected content. 
+ temp_location = test_pipeline.get_option('temp_location') + temp_path = '/'.join([temp_location, str(uuid.uuid4())]) + input = create_file('/'.join([temp_path, 'input.txt']), self.SAMPLE_TEXT) + + extra_opts = {'input': input, 'output': '%s.result' % temp_path} expected_words = collections.defaultdict(int) for word in re.findall(r'\w+', self.SAMPLE_TEXT): expected_words[word] += 1 wordcount_minimal.main( - ['--input=%s*' % temp_path, '--output=%s.result' % temp_path], + test_pipeline.get_full_options_as_args(**extra_opts), save_main_session=False) + # Parse result file and compare. results = [] - with open_shards(temp_path + '.result-*-of-*') as result_file: - for line in result_file: - match = re.search(r'([a-z]+): ([0-9]+)', line) - if match is not None: - results.append((match.group(1), int(match.group(2)))) + lines = read_files_from_pattern(temp_path + '.result*').splitlines() + for line in lines: + match = re.search(r'([a-z]+): ([0-9]+)', line) + if match is not None: + results.append((match.group(1), int(match.group(2)))) self.assertEqual(sorted(results), sorted(expected_words.items())) diff --git a/sdks/python/apache_beam/examples/wordcount_test.py b/sdks/python/apache_beam/examples/wordcount_test.py index 96428c2ee075..7a0f1093c244 100644 --- a/sdks/python/apache_beam/examples/wordcount_test.py +++ b/sdks/python/apache_beam/examples/wordcount_test.py @@ -36,13 +36,15 @@ import collections import logging import re -import tempfile import unittest +import uuid import pytest from apache_beam.examples import wordcount -from apache_beam.testing.util import open_shards +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.test_utils import create_file +from apache_beam.testing.test_utils import read_files_from_pattern @pytest.mark.examples_postcommit @@ -51,25 +53,26 @@ class WordCountTest(unittest.TestCase): SAMPLE_TEXT = ( u'a b c a b a\nacento gráfico\nJuly 30, 2018\n\n aa bb cc aa bb aa') - def create_temp_file(self, 
contents): - with tempfile.NamedTemporaryFile(delete=False) as f: - f.write(contents.encode('utf-8')) - return f.name - def test_basics(self): - temp_path = self.create_temp_file(self.SAMPLE_TEXT) + test_pipeline = TestPipeline(is_integration_test=True) + # Setup the files with expected content. + temp_location = test_pipeline.get_option('temp_location') + temp_path = '/'.join([temp_location, str(uuid.uuid4())]) + input = create_file('/'.join([temp_path, 'input.txt']), self.SAMPLE_TEXT) + extra_opts = {'input': input, 'output': '%s.result' % temp_path} expected_words = collections.defaultdict(int) for word in re.findall(r'[\w\']+', self.SAMPLE_TEXT, re.UNICODE): expected_words[word] += 1 - wordcount.run(['--input=%s*' % temp_path, '--output=%s.result' % temp_path], - save_main_session=False) + wordcount.run( + test_pipeline.get_full_options_as_args(**extra_opts), + save_main_session=False) # Parse result file and compare. results = [] - with open_shards(temp_path + '.result-*-of-*') as result_file: - for line in result_file: - match = re.search(r'(\S+): ([0-9]+)', line) - if match is not None: - results.append((match.group(1), int(match.group(2)))) + lines = read_files_from_pattern(temp_path + '.result*').splitlines() + for line in lines: + match = re.search(r'(\S+): ([0-9]+)', line) + if match is not None: + results.append((match.group(1), int(match.group(2)))) self.assertEqual(sorted(results), sorted(expected_words.items())) diff --git a/sdks/python/apache_beam/internal/gcp/auth.py b/sdks/python/apache_beam/internal/gcp/auth.py index 699ec79d4b8a..47c3416babd4 100644 --- a/sdks/python/apache_beam/internal/gcp/auth.py +++ b/sdks/python/apache_beam/internal/gcp/auth.py @@ -22,8 +22,10 @@ import logging import socket import threading +from typing import Optional from apache_beam.options.pipeline_options import GoogleCloudOptions +from apache_beam.options.pipeline_options import PipelineOptions # google.auth is only available when Beam is installed with the gcp 
extra. try: @@ -63,6 +65,8 @@ def set_running_in_gce(worker_executing_project): def get_service_credentials(pipeline_options): + # type: (PipelineOptions) -> Optional[google.auth.credentials.Credentials] + """For internal use only; no backwards-compatibility guarantees. Get credentials to access Google services. @@ -115,6 +119,7 @@ class _Credentials(object): @classmethod def get_service_credentials(cls, pipeline_options): + # type: (PipelineOptions) -> Optional[google.auth.credentials.Credentials] with cls._credentials_lock: if cls._credentials_init: return cls._credentials @@ -134,6 +139,7 @@ def get_service_credentials(cls, pipeline_options): @staticmethod def _get_service_credentials(pipeline_options): + # type: (PipelineOptions) -> Optional[google.auth.credentials.Credentials] if not _GOOGLE_AUTH_AVAILABLE: _LOGGER.warning( 'Unable to find default credentials because the google-auth library ' diff --git a/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py b/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py index 75dad46124f4..1dcb56c51eca 100644 --- a/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py +++ b/sdks/python/apache_beam/io/external/xlang_jdbcio_it_test.py @@ -17,7 +17,6 @@ # pytype: skip-file -import datetime import logging import time import typing @@ -26,7 +25,6 @@ from typing import Callable from typing import Union -import pytz from parameterized import parameterized import apache_beam as beam @@ -58,18 +56,13 @@ ROW_COUNT = 10 -JdbcReadTestRow = typing.NamedTuple( - "JdbcReadTestRow", - [("f_int", int), ("f_timestamp", Timestamp), ("f_decimal", Decimal)], +JdbcTestRow = typing.NamedTuple( + "JdbcTestRow", + [("f_id", int), ("f_float", float), ("f_char", str), ("f_varchar", str), + ("f_bytes", bytes), ("f_varbytes", bytes), ("f_timestamp", Timestamp), + ("f_decimal", Decimal)], ) -coders.registry.register_coder(JdbcReadTestRow, coders.RowCoder) - -JdbcWriteTestRow = typing.NamedTuple( - "JdbcWriteTestRow", - [("f_id", 
int), ("f_real", float), ("f_string", str), - ("f_timestamp", Timestamp), ("f_decimal", Decimal)], -) -coders.registry.register_coder(JdbcWriteTestRow, coders.RowCoder) +coders.registry.register_coder(JdbcTestRow, coders.RowCoder) @unittest.skipIf(sqlalchemy is None, 'sql alchemy package is not installed.') @@ -123,29 +116,60 @@ def tearDown(self): logging.error('Could not stop the postgreSQL container.') @parameterized.expand(['postgres', 'mysql']) - def test_xlang_jdbc_write(self, database): + def test_xlang_jdbc_write_read(self, database): container_init, classpath, db_string, driver = ( CrossLanguageJdbcIOTest.DB_CONTAINER_CLASSPATH_STRING[database]) self._setUpTestCase(container_init, db_string, driver) - table_name = 'jdbc_external_test_write' + table_name = 'jdbc_external_test' + if database == 'postgres': + # postgres does not have BINARY and VARBINARY type, use equivalent. + binary_type = ('BYTEA', 'BYTEA') + else: + binary_type = ('BINARY(10)', 'VARBINARY(10)') + self.engine.execute( - "CREATE TABLE {}(f_id INTEGER, f_real FLOAT, f_string VARCHAR(100), f_timestamp TIMESTAMP(3), f_decimal DECIMAL(10, 2))" # pylint: disable=line-too-long - .format(table_name)) + "CREATE TABLE IF NOT EXISTS {}".format(table_name) + "(f_id INTEGER, " + + "f_float DOUBLE PRECISION, " + "f_char CHAR(10), " + + "f_varchar VARCHAR(10), " + f"f_bytes {binary_type[0]}, " + + f"f_varbytes {binary_type[1]}, " + "f_timestamp TIMESTAMP(3), " + + "f_decimal DECIMAL(10, 2))") inserted_rows = [ - JdbcWriteTestRow( + JdbcTestRow( i, i + 0.1, - 'Test{}'.format(i), + f'Test{i}', + f'Test{i}', + f'Test{i}'.encode(), + f'Test{i}'.encode(), # In alignment with Java Instant which supports milli precision. Timestamp.of(seconds=round(time.time(), 3)), + # Test both positive and negative numbers. 
Decimal(f'{i-1}.23')) for i in range(ROW_COUNT) ] + expected_row = [] + for row in inserted_rows: + f_char = row.f_char + ' ' * (10 - len(row.f_char)) + if database != 'postgres': + # padding expected results + f_bytes = row.f_bytes + b'\0' * (10 - len(row.f_bytes)) + else: + f_bytes = row.f_bytes + expected_row.append( + JdbcTestRow( + row.f_id, + row.f_float, + f_char, + row.f_varchar, + f_bytes, + row.f_bytes, + row.f_timestamp, + row.f_decimal)) with TestPipeline() as p: p.not_use_test_runner_api = True _ = ( p - | beam.Create(inserted_rows).with_output_types(JdbcWriteTestRow) + | beam.Create(inserted_rows).with_output_types(JdbcTestRow) # TODO(https://github.com/apache/beam/issues/20446) Add test with # overridden write_statement | 'Write to jdbc' >> WriteToJdbc( @@ -157,46 +181,6 @@ def test_xlang_jdbc_write(self, database): classpath=classpath, )) - fetched_data = self.engine.execute("SELECT * FROM {}".format(table_name)) - fetched_rows = [ - JdbcWriteTestRow( - int(row[0]), - float(row[1]), - str(row[2]), - Timestamp.from_utc_datetime(row[3].replace(tzinfo=pytz.UTC)), - Decimal(row[4])) for row in fetched_data - ] - - self.assertEqual( - set(fetched_rows), - set(inserted_rows), - 'Inserted data does not fit data fetched from table', - ) - - @parameterized.expand(['postgres', 'mysql']) - def test_xlang_jdbc_read(self, database): - container_init, classpath, db_string, driver = ( - CrossLanguageJdbcIOTest.DB_CONTAINER_CLASSPATH_STRING[database]) - self._setUpTestCase(container_init, db_string, driver) - table_name = 'jdbc_external_test_read' - self.engine.execute( - "CREATE TABLE {}(f_int INTEGER, f_timestamp TIMESTAMP, f_decimal DECIMAL(10,2))" # pylint: disable=line-too-long - .format(table_name)) - - all_timestamps = [] - for i in range(ROW_COUNT): - # prepare timestamp - strtime = Timestamp.now().to_utc_datetime().strftime('%Y-%m-%dT%H:%M:%S') - dttime = datetime.datetime.strptime( - strtime, '%Y-%m-%dT%H:%M:%S').replace(tzinfo=pytz.UTC) - 
all_timestamps.append(Timestamp.from_utc_datetime(dttime)) - decimal_value = Decimal(f'{i-1}.23') - - # write records using sqlalchemy engine - self.engine.execute( - "INSERT INTO {} VALUES({},'{}','{}')".format( - table_name, i, strtime, decimal_value)) - # Register MillisInstant logical type to override the mapping from Timestamp # originally handled by MicrosInstant. LogicalType.register_logical_type(MillisInstant) @@ -215,12 +199,7 @@ def test_xlang_jdbc_read(self, database): password=self.password, classpath=classpath)) - assert_that( - result, - equal_to([ - JdbcReadTestRow(i, all_timestamps[i], Decimal(f'{i-1}.23')) - for i in range(ROW_COUNT) - ])) + assert_that(result, equal_to(expected_row)) # Creating a container with testcontainers sometimes raises ReadTimeout # error. In java there are 2 retries set by default. diff --git a/sdks/python/apache_beam/io/gcp/bigquery.py b/sdks/python/apache_beam/io/gcp/bigquery.py index bad20f69243f..7233326ce0c2 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery.py +++ b/sdks/python/apache_beam/io/gcp/bigquery.py @@ -369,6 +369,7 @@ def chain_after(result): from apache_beam.io.avroio import _create_avro_source as create_avro_source from apache_beam.io.filesystems import CompressionTypes from apache_beam.io.filesystems import FileSystems +from apache_beam.io.gcp import bigquery_schema_tools from apache_beam.io.gcp import bigquery_tools from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata from apache_beam.io.gcp.bigquery_read_internal import _BigQueryReadSplit @@ -2471,9 +2472,9 @@ def _expand_output_type(self, output_pcollection): raise TypeError( '%s: table must be of type string' '; got a callable instead' % self.__class__.__name__) - return output_pcollection | beam.io.gcp.bigquery_schema_tools.\ + return output_pcollection | bigquery_schema_tools.\ convert_to_usertype( - beam.io.gcp.bigquery.bigquery_tools.BigQueryWrapper().get_table( + bigquery_tools.BigQueryWrapper().get_table( 
project_id=table_details.projectId, dataset_id=table_details.datasetId, table_id=table_details.tableId).schema) diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py index 8b899a343d35..f438949d428b 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads.py @@ -437,18 +437,18 @@ def process(self, element, schema_mod_job_name_prefix): # Trigger potential schema modification by loading zero rows into the # destination table with the temporary table schema. schema_update_job_reference = self.bq_wrapper.perform_load_job( - destination=table_reference, - source_stream=io.BytesIO(), # file with zero rows - job_id=job_name, - schema=temp_table_schema, - write_disposition='WRITE_APPEND', - create_disposition='CREATE_NEVER', - additional_load_parameters=additional_parameters, - job_labels=self._bq_io_metadata.add_additional_bq_job_labels(), - # JSON format is hardcoded because zero rows load(unlike AVRO) and - # a nested schema(unlike CSV, which a default one) is permitted. - source_format="NEWLINE_DELIMITED_JSON", - load_job_project_id=self._load_job_project_id) + destination=table_reference, + source_stream=io.BytesIO(), # file with zero rows + job_id=job_name, + schema=temp_table_schema, + write_disposition='WRITE_APPEND', + create_disposition='CREATE_NEVER', + additional_load_parameters=additional_parameters, + job_labels=self._bq_io_metadata.add_additional_bq_job_labels(), + # JSON format is hardcoded because zero rows load(unlike AVRO) and + # a nested schema(unlike CSV, which a default one) is permitted. 
+ source_format="NEWLINE_DELIMITED_JSON", + load_job_project_id=self._load_job_project_id) self.pending_jobs.append( GlobalWindows.windowed_value( (destination, schema_update_job_reference))) @@ -597,6 +597,7 @@ class TriggerLoadJobs(beam.DoFn): """ TEMP_TABLES = 'TemporaryTables' + ONGOING_JOBS = 'OngoingJobs' def __init__( self, @@ -718,6 +719,8 @@ def process(self, element, load_job_name_prefix, *schema_side_inputs): source_format=self.source_format, job_labels=self.bq_io_metadata.add_additional_bq_job_labels(), load_job_project_id=self.load_job_project_id) + yield pvalue.TaggedOutput( + TriggerLoadJobs.ONGOING_JOBS, (destination, job_reference)) self.pending_jobs.append( GlobalWindows.windowed_value((destination, job_reference))) @@ -761,6 +764,13 @@ def process(self, element): files = element[1] partitions = [] + if not files: + _LOGGER.warning( + 'Ignoring a BigQuery batch load partition to %s ' + 'that contains no source URIs.', + destination) + return + latest_partition = PartitionFiles.Partition( self.max_partition_size, self.max_files_per_partition) @@ -1054,13 +1064,17 @@ def _load_data( load_job_project_id=self.load_job_project_id), load_job_name_pcv, *self.schema_side_inputs).with_outputs( - TriggerLoadJobs.TEMP_TABLES, main='main')) + TriggerLoadJobs.TEMP_TABLES, + TriggerLoadJobs.ONGOING_JOBS, + main='main')) - temp_tables_load_job_ids_pc = trigger_loads_outputs['main'] + finished_temp_tables_load_job_ids_pc = trigger_loads_outputs['main'] + temp_tables_load_job_ids_pc = trigger_loads_outputs[ + TriggerLoadJobs.ONGOING_JOBS] temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES] schema_mod_job_ids_pc = ( - temp_tables_load_job_ids_pc + finished_temp_tables_load_job_ids_pc | beam.ParDo( UpdateDestinationSchema( project=self.project, @@ -1072,7 +1086,7 @@ def _load_data( schema_mod_job_name_pcv)) copy_job_outputs = ( - temp_tables_load_job_ids_pc + finished_temp_tables_load_job_ids_pc | beam.ParDo( TriggerCopyJobs( project=self.project, @@ 
-1113,7 +1127,9 @@ def _load_data( step_name=step_name, load_job_project_id=self.load_job_project_id), load_job_name_pcv, - *self.schema_side_inputs)) + *self.schema_side_inputs).with_outputs( + TriggerLoadJobs.ONGOING_JOBS, main='main') + )[TriggerLoadJobs.ONGOING_JOBS] destination_load_job_ids_pc = ( (temp_tables_load_job_ids_pc, destination_load_job_ids_pc) diff --git a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py index 724032abfa7e..0c0e136eae4b 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py @@ -400,6 +400,23 @@ def test_partition_files_dofn_size_split(self): class TestBigQueryFileLoads(_TestCaseWithTempDirCleanUp): + def test_trigger_load_jobs_with_empty_files(self): + destination = "project:dataset.table" + empty_files = [] + load_job_prefix = "test_prefix" + + with beam.Pipeline() as p: + partitions = ( + p + | beam.Create([(destination, empty_files)]) + | beam.ParDo(bqfl.PartitionFiles(1000, 10)).with_outputs( + bqfl.PartitionFiles.MULTIPLE_PARTITIONS_TAG, + bqfl.PartitionFiles.SINGLE_PARTITION_TAG)) + + _ = ( + partitions[bqfl.PartitionFiles.SINGLE_PARTITION_TAG] + | beam.ParDo(bqfl.TriggerLoadJobs(), load_job_prefix)) + def test_records_traverse_transform_with_mocks(self): destination = 'project1:dataset1.table1' diff --git a/sdks/python/apache_beam/io/gcp/bigquery_schema_tools.py b/sdks/python/apache_beam/io/gcp/bigquery_schema_tools.py index e78f7bd5a7f7..4c25aa62e0bd 100644 --- a/sdks/python/apache_beam/io/gcp/bigquery_schema_tools.py +++ b/sdks/python/apache_beam/io/gcp/bigquery_schema_tools.py @@ -28,9 +28,13 @@ import numpy as np import apache_beam as beam +import apache_beam.io.gcp.bigquery_tools +import apache_beam.typehints.schemas +import apache_beam.utils.proto_utils import apache_beam.utils.timestamp from apache_beam.io.gcp.internal.clients import bigquery from 
apache_beam.portability.api import schema_pb2 +from apache_beam.transforms import DoFn # BigQuery types as listed in # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types @@ -91,13 +95,11 @@ def bq_field_to_type(field, mode): def convert_to_usertype(table_schema): - usertype = beam.io.gcp.bigquery_schema_tools. \ - generate_user_type_from_bq_schema(table_schema) - return beam.ParDo( - beam.io.gcp.bigquery_schema_tools.BeamSchemaConversionDoFn(usertype)) + usertype = generate_user_type_from_bq_schema(table_schema) + return beam.ParDo(BeamSchemaConversionDoFn(usertype)) -class BeamSchemaConversionDoFn(beam.DoFn): +class BeamSchemaConversionDoFn(DoFn): def __init__(self, pcoll_val_ctor): self._pcoll_val_ctor = pcoll_val_ctor @@ -113,8 +115,9 @@ def infer_output_type(self, input_type): @classmethod def _from_serialized_schema(cls, schema_str): return cls( - beam.typehints.schemas.named_tuple_from_schema( - beam.utils.proto_utils.parse_Bytes(schema_str, schema_pb2.Schema))) + apache_beam.typehints.schemas.named_tuple_from_schema( + apache_beam.utils.proto_utils.parse_Bytes( + schema_str, schema_pb2.Schema))) def __reduce__(self): # when pickling, use bytes representation of the schema. 
diff --git a/sdks/python/apache_beam/io/gcp/gcsio.py b/sdks/python/apache_beam/io/gcp/gcsio.py index d4ceeda4bd9d..e34a0b774535 100644 --- a/sdks/python/apache_beam/io/gcp/gcsio.py +++ b/sdks/python/apache_beam/io/gcp/gcsio.py @@ -38,6 +38,8 @@ import time import traceback from itertools import islice +from typing import Optional +from typing import Union import apache_beam from apache_beam.internal.http_client import get_new_http @@ -49,6 +51,7 @@ from apache_beam.io.filesystemio import UploaderStream from apache_beam.io.gcp import resource_identifiers from apache_beam.metrics import monitoring_infos +from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.utils import retry __all__ = ['GcsIO'] @@ -158,7 +161,12 @@ class GcsIOError(IOError, retry.PermanentException): class GcsIO(object): """Google Cloud Storage I/O client.""" def __init__(self, storage_client=None, pipeline_options=None): + # type: (Optional[storage.StorageV1], Optional[Union[dict, PipelineOptions]]) -> None if storage_client is None: + if not pipeline_options: + pipeline_options = PipelineOptions() + elif isinstance(pipeline_options, dict): + pipeline_options = PipelineOptions.from_dictionary(pipeline_options) storage_client = storage.StorageV1( credentials=auth.get_service_credentials(pipeline_options), get_credentials=False, diff --git a/sdks/python/apache_beam/io/kinesis.py b/sdks/python/apache_beam/io/kinesis.py index c87dc0122a2e..bc5e1fa787b4 100644 --- a/sdks/python/apache_beam/io/kinesis.py +++ b/sdks/python/apache_beam/io/kinesis.py @@ -199,7 +199,7 @@ class ReadDataFromKinesis(ExternalTransform): Experimental; no backwards compatibility guarantees. 
""" - URN = 'beam:transform:org.apache.beam:kinesis_read:v1' + URN = 'beam:transform:org.apache.beam:kinesis_read_data:v1' def __init__( self, diff --git a/sdks/python/apache_beam/io/parquetio.py b/sdks/python/apache_beam/io/parquetio.py index acbf1e23f203..dfcc1abec29a 100644 --- a/sdks/python/apache_beam/io/parquetio.py +++ b/sdks/python/apache_beam/io/parquetio.py @@ -43,6 +43,7 @@ from apache_beam.transforms import DoFn from apache_beam.transforms import ParDo from apache_beam.transforms import PTransform +from apache_beam.transforms import window try: import pyarrow as pa @@ -60,7 +61,8 @@ 'ReadAllFromParquet', 'ReadFromParquetBatched', 'ReadAllFromParquetBatched', - 'WriteToParquet' + 'WriteToParquet', + 'WriteToParquetBatched' ] @@ -83,6 +85,67 @@ def process(self, table, with_filename=False): yield row +class _RowDictionariesToArrowTable(DoFn): + """ A DoFn that consumes python dictionaries and yields a pyarrow table.""" + def __init__( + self, + schema, + row_group_buffer_size=64 * 1024 * 1024, + record_batch_size=1000): + self._schema = schema + self._row_group_buffer_size = row_group_buffer_size + self._buffer = [[] for _ in range(len(schema.names))] + self._buffer_size = record_batch_size + self._record_batches = [] + self._record_batches_byte_size = 0 + + def process(self, row): + if len(self._buffer[0]) >= self._buffer_size: + self._flush_buffer() + + if self._record_batches_byte_size >= self._row_group_buffer_size: + table = self._create_table() + yield table + + # reorder the data in columnar format. 
+ for i, n in enumerate(self._schema.names): + self._buffer[i].append(row[n]) + + def finish_bundle(self): + if len(self._buffer[0]) > 0: + self._flush_buffer() + if self._record_batches_byte_size > 0: + table = self._create_table() + yield window.GlobalWindows.windowed_value_at_end_of_window(table) + + def display_data(self): + res = super().display_data() + res['row_group_buffer_size'] = str(self._row_group_buffer_size) + res['buffer_size'] = str(self._buffer_size) + + return res + + def _create_table(self): + table = pa.Table.from_batches(self._record_batches, schema=self._schema) + self._record_batches = [] + self._record_batches_byte_size = 0 + return table + + def _flush_buffer(self): + arrays = [[] for _ in range(len(self._schema.names))] + for x, y in enumerate(self._buffer): + arrays[x] = pa.array(y, type=self._schema.types[x]) + self._buffer[x] = [] + rb = pa.RecordBatch.from_arrays(arrays, schema=self._schema) + self._record_batches.append(rb) + size = 0 + for x in arrays: + for b in x.buffers(): + if b is not None: + size = size + b.size + self._record_batches_byte_size = self._record_batches_byte_size + size + + class ReadFromParquetBatched(PTransform): """A :class:`~apache_beam.transforms.ptransform.PTransform` for reading Parquet files as a `PCollection` of `pyarrow.Table`. This `PTransform` is @@ -453,13 +516,127 @@ def __init__( A WriteToParquet transform usable for writing. 
""" super().__init__() + self._schema = schema + self._row_group_buffer_size = row_group_buffer_size + self._record_batch_size = record_batch_size + + self._sink = \ + _create_parquet_sink( + file_path_prefix, + schema, + codec, + use_deprecated_int96_timestamps, + use_compliant_nested_type, + file_name_suffix, + num_shards, + shard_name_template, + mime_type + ) + + def expand(self, pcoll): + return pcoll | ParDo( + _RowDictionariesToArrowTable( + self._schema, self._row_group_buffer_size, + self._record_batch_size)) | Write(self._sink) + + def display_data(self): + return { + 'sink_dd': self._sink, + 'row_group_buffer_size': str(self._row_group_buffer_size) + } + + +class WriteToParquetBatched(PTransform): + """A ``PTransform`` for writing parquet files from a `PCollection` of + `pyarrow.Table`. + + This ``PTransform`` is currently experimental. No backward-compatibility + guarantees. + """ + def __init__( + self, + file_path_prefix, + schema=None, + codec='none', + use_deprecated_int96_timestamps=False, + use_compliant_nested_type=False, + file_name_suffix='', + num_shards=0, + shard_name_template=None, + mime_type='application/x-parquet', + ): + """Initialize a WriteToParquetBatched transform. + + Writes parquet files from a :class:`~apache_beam.pvalue.PCollection` of + records. Each record is a pa.Table. Schema must be specified like the + example below. + + .. testsetup:: batched + + from tempfile import NamedTemporaryFile + import glob + import os + import pyarrow + + filename = NamedTemporaryFile(delete=False).name + + .. testcode:: batched + + table = pyarrow.Table.from_pylist([{'name': 'foo', 'age': 10}, + {'name': 'bar', 'age': 20}]) + with beam.Pipeline() as p: + records = p | 'Read' >> beam.Create([table]) + _ = records | 'Write' >> beam.io.WriteToParquetBatched(filename, + pyarrow.schema( + [('name', pyarrow.string()), ('age', pyarrow.int64())] + ) + ) + + .. 
testcleanup:: batched + + for output in glob.glob('{}*'.format(filename)): + os.remove(output) + + For more information on supported types and schema, please see the pyarrow + document. + + Args: + file_path_prefix: The file path to write to. The files written will begin + with this prefix, followed by a shard identifier (see num_shards), and + end in a common extension, if given by file_name_suffix. In most cases, + only this argument is specified and num_shards, shard_name_template, and + file_name_suffix use default values. + schema: The schema to use, as type of ``pyarrow.Schema``. + codec: The codec to use for block-level compression. Any string supported + by the pyarrow specification is accepted. + use_deprecated_int96_timestamps: Write nanosecond resolution timestamps to + INT96 Parquet format. Defaults to False. + use_compliant_nested_type: Write compliant Parquet nested type (lists). + file_name_suffix: Suffix for the files written. + num_shards: The number of files (shards) used for output. If not set, the + service will decide on the optimal number of shards. + Constraining the number of shards is likely to reduce + the performance of a pipeline. Setting this value is not recommended + unless you require a specific number of output files. + shard_name_template: A template string containing placeholders for + the shard number and shard count. When constructing a filename for a + particular shard number, the upper-case letters 'S' and 'N' are + replaced with the 0-padded shard number and shard count respectively. + This argument can be '' in which case it behaves as if num_shards was + set to 1 and only one file will be generated. The default pattern used + is '-SSSSS-of-NNNNN' if None is passed as the shard_name_template. + mime_type: The MIME type to use for the produced files, if the filesystem + supports specifying MIME types. + + Returns: + A WriteToParquetBatched transform usable for writing. 
+ """ + super().__init__() self._sink = \ _create_parquet_sink( file_path_prefix, schema, codec, - row_group_buffer_size, - record_batch_size, use_deprecated_int96_timestamps, use_compliant_nested_type, file_name_suffix, @@ -479,8 +656,6 @@ def _create_parquet_sink( file_path_prefix, schema, codec, - row_group_buffer_size, - record_batch_size, use_deprecated_int96_timestamps, use_compliant_nested_type, file_name_suffix, @@ -492,8 +667,6 @@ def _create_parquet_sink( file_path_prefix, schema, codec, - row_group_buffer_size, - record_batch_size, use_deprecated_int96_timestamps, use_compliant_nested_type, file_name_suffix, @@ -504,14 +677,12 @@ def _create_parquet_sink( class _ParquetSink(filebasedsink.FileBasedSink): - """A sink for parquet files.""" + """A sink for parquet files from batches.""" def __init__( self, file_path_prefix, schema, codec, - row_group_buffer_size, - record_batch_size, use_deprecated_int96_timestamps, use_compliant_nested_type, file_name_suffix, @@ -535,7 +706,6 @@ def __init__( "Due to ARROW-9424, writing with LZ4 compression is not supported in " "pyarrow 1.x, please use a different pyarrow version or a different " f"codec. Your pyarrow version: {pa.__version__}") - self._row_group_buffer_size = row_group_buffer_size self._use_deprecated_int96_timestamps = use_deprecated_int96_timestamps if use_compliant_nested_type and ARROW_MAJOR_VERSION < 4: raise ValueError( @@ -543,10 +713,6 @@ def __init__( "pyarrow version >= 4.x, please use a different pyarrow version. 
" f"Your pyarrow version: {pa.__version__}") self._use_compliant_nested_type = use_compliant_nested_type - self._buffer = [[] for _ in range(len(schema.names))] - self._buffer_size = record_batch_size - self._record_batches = [] - self._record_batches_byte_size = 0 self._file_handle = None def open(self, temp_path): @@ -564,23 +730,10 @@ def open(self, temp_path): use_deprecated_int96_timestamps=self._use_deprecated_int96_timestamps, use_compliant_nested_type=self._use_compliant_nested_type) - def write_record(self, writer, value): - if len(self._buffer[0]) >= self._buffer_size: - self._flush_buffer() - - if self._record_batches_byte_size >= self._row_group_buffer_size: - self._write_batches(writer) - - # reorder the data in columnar format. - for i, n in enumerate(self._schema.names): - self._buffer[i].append(value[n]) + def write_record(self, writer, table: pa.Table): + writer.write_table(table) def close(self, writer): - if len(self._buffer[0]) > 0: - self._flush_buffer() - if self._record_batches_byte_size > 0: - self._write_batches(writer) - writer.close() if self._file_handle: self._file_handle.close() @@ -590,25 +743,4 @@ def display_data(self): res = super().display_data() res['codec'] = str(self._codec) res['schema'] = str(self._schema) - res['row_group_buffer_size'] = str(self._row_group_buffer_size) return res - - def _write_batches(self, writer): - table = pa.Table.from_batches(self._record_batches, schema=self._schema) - self._record_batches = [] - self._record_batches_byte_size = 0 - writer.write_table(table) - - def _flush_buffer(self): - arrays = [[] for _ in range(len(self._schema.names))] - for x, y in enumerate(self._buffer): - arrays[x] = pa.array(y, type=self._schema.types[x]) - self._buffer[x] = [] - rb = pa.RecordBatch.from_arrays(arrays, schema=self._schema) - self._record_batches.append(rb) - size = 0 - for x in arrays: - for b in x.buffers(): - if b is not None: - size = size + b.size - self._record_batches_byte_size = 
self._record_batches_byte_size + size diff --git a/sdks/python/apache_beam/io/parquetio_test.py b/sdks/python/apache_beam/io/parquetio_test.py index 454a45493c4a..df018a3a776f 100644 --- a/sdks/python/apache_beam/io/parquetio_test.py +++ b/sdks/python/apache_beam/io/parquetio_test.py @@ -40,6 +40,7 @@ from apache_beam.io.parquetio import ReadFromParquet from apache_beam.io.parquetio import ReadFromParquetBatched from apache_beam.io.parquetio import WriteToParquet +from apache_beam.io.parquetio import WriteToParquetBatched from apache_beam.io.parquetio import _create_parquet_sink from apache_beam.io.parquetio import _create_parquet_source from apache_beam.testing.test_pipeline import TestPipeline @@ -284,8 +285,6 @@ def test_sink_display_data(self): file_name, self.SCHEMA, 'none', - 1024 * 1024, - 1000, False, False, '.end', @@ -299,7 +298,6 @@ def test_sink_display_data(self): 'file_pattern', 'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d.end'), DisplayDataItemMatcher('codec', 'none'), - DisplayDataItemMatcher('row_group_buffer_size', str(1024 * 1024)), DisplayDataItemMatcher('compression', 'uncompressed') ] hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items)) @@ -308,6 +306,7 @@ def test_write_display_data(self): file_name = 'some_parquet_sink' write = WriteToParquet(file_name, self.SCHEMA) dd = DisplayData.create_from(write) + expected_items = [ DisplayDataItemMatcher('codec', 'none'), DisplayDataItemMatcher('schema', str(self.SCHEMA)), @@ -319,6 +318,21 @@ def test_write_display_data(self): ] hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items)) + def test_write_batched_display_data(self): + file_name = 'some_parquet_sink' + write = WriteToParquetBatched(file_name, self.SCHEMA) + dd = DisplayData.create_from(write) + + expected_items = [ + DisplayDataItemMatcher('codec', 'none'), + DisplayDataItemMatcher('schema', str(self.SCHEMA)), + DisplayDataItemMatcher( + 'file_pattern', + 
'some_parquet_sink-%(shard_num)05d-of-%(num_shards)05d'), + DisplayDataItemMatcher('compression', 'uncompressed') + ] + hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items)) + def test_sink_transform_int96(self): with tempfile.NamedTemporaryFile() as dst: path = dst.name @@ -348,6 +362,22 @@ def test_sink_transform(self): | Map(json.dumps) assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS])) + def test_sink_transform_batched(self): + with TemporaryDirectory() as tmp_dirname: + path = os.path.join(tmp_dirname + "tmp_filename") + with TestPipeline() as p: + _ = p \ + | Create([self._records_as_arrow()]) \ + | WriteToParquetBatched( + path, self.SCHEMA, num_shards=1, shard_name_template='') + with TestPipeline() as p: + # json used for stable sortability + readback = \ + p \ + | ReadFromParquet(path) \ + | Map(json.dumps) + assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS])) + def test_sink_transform_compliant_nested_type(self): if ARROW_MAJOR_VERSION < 4: return unittest.skip( diff --git a/sdks/python/apache_beam/ml/gcp/visionml.py b/sdks/python/apache_beam/ml/gcp/visionml.py index 3e556b903c44..dd29dd377388 100644 --- a/sdks/python/apache_beam/ml/gcp/visionml.py +++ b/sdks/python/apache_beam/ml/gcp/visionml.py @@ -80,7 +80,7 @@ def __init__( metadata=None): """ Args: - features: (List[``vision.types.Feature.enums.Feature``]) Required. + features: (List[``vision.Feature``]) Required. The Vision API features to detect retry: (google.api_core.retry.Retry) Optional. A retry object used to retry requests. 
@@ -107,9 +107,9 @@ def __init__( image_contexts = [(''gs://cloud-samples-data/vision/ocr/sign.jpg'', Union[dict, - ``vision.types.ImageContext()``]), + ``vision.ImageContext()``]), (''gs://cloud-samples-data/vision/ocr/sign.jpg'', Union[dict, - ``vision.types.ImageContext()``]),] + ``vision.ImageContext()``]),] context_side_input = ( @@ -152,9 +152,8 @@ def expand(self, pvalue): client_options=self.client_options, metadata=self.metadata))) - @typehints.with_input_types( - Union[str, bytes], Optional[vision.types.ImageContext]) - @typehints.with_output_types(List[vision.types.AnnotateImageRequest]) + @typehints.with_input_types(Union[str, bytes], Optional[vision.ImageContext]) + @typehints.with_output_types(List[vision.AnnotateImageRequest]) def _create_image_annotation_pairs(self, element, context_side_input): if context_side_input: # If we have a side input image context, use that image_context = context_side_input.get(element) @@ -162,13 +161,18 @@ def _create_image_annotation_pairs(self, element, context_side_input): image_context = None if isinstance(element, str): - image = vision.types.Image( - source=vision.types.ImageSource(image_uri=element)) + + image = vision.Image( + {'source': vision.ImageSource({'image_uri': element})}) + else: # Typehint checks only allows str or bytes - image = vision.types.Image(content=element) + image = vision.Image(content=element) - request = vision.types.AnnotateImageRequest( - image=image, features=self.features, image_context=image_context) + request = vision.AnnotateImageRequest({ + 'image': image, + 'features': self.features, + 'image_context': image_context + }) yield request @@ -181,7 +185,7 @@ class AnnotateImageWithContext(AnnotateImage): Element is a tuple of:: (Union[str, bytes], - Optional[``vision.types.ImageContext``]) + Optional[``vision.ImageContext``]) where the former is either an URI (e.g. a GCS URI) or bytes base64-encoded image data. 
@@ -197,7 +201,7 @@ def __init__( metadata=None): """ Args: - features: (List[``vision.types.Feature.enums.Feature``]) Required. + features: (List[``vision.Feature``]) Required. The Vision API features to detect retry: (google.api_core.retry.Retry) Optional. A retry object used to retry requests. @@ -244,25 +248,28 @@ def expand(self, pvalue): metadata=self.metadata))) @typehints.with_input_types( - Tuple[Union[str, bytes], Optional[vision.types.ImageContext]]) - @typehints.with_output_types(List[vision.types.AnnotateImageRequest]) + Tuple[Union[str, bytes], Optional[vision.ImageContext]]) + @typehints.with_output_types(List[vision.AnnotateImageRequest]) def _create_image_annotation_pairs(self, element, **kwargs): element, image_context = element # Unpack (image, image_context) tuple if isinstance(element, str): - image = vision.types.Image( - source=vision.types.ImageSource(image_uri=element)) + image = vision.Image( + {'source': vision.ImageSource({'image_uri': element})}) else: # Typehint checks only allows str or bytes - image = vision.types.Image(content=element) + image = vision.Image({"content": element}) - request = vision.types.AnnotateImageRequest( - image=image, features=self.features, image_context=image_context) + request = vision.AnnotateImageRequest({ + 'image': image, + 'features': self.features, + 'image_context': image_context + }) yield request -@typehints.with_input_types(List[vision.types.AnnotateImageRequest]) +@typehints.with_input_types(List[vision.AnnotateImageRequest]) class _ImageAnnotateFn(DoFn): """A DoFn that sends each input element to the GCP Vision API. - Returns ``google.cloud.vision.types.BatchAnnotateImagesResponse``. + Returns ``google.cloud.vision.BatchAnnotateImagesResponse``. 
""" def __init__(self, features, retry, timeout, client_options, metadata): super().__init__() diff --git a/sdks/python/apache_beam/ml/gcp/visionml_test.py b/sdks/python/apache_beam/ml/gcp/visionml_test.py index f038442468f8..479b3d80e4de 100644 --- a/sdks/python/apache_beam/ml/gcp/visionml_test.py +++ b/sdks/python/apache_beam/ml/gcp/visionml_test.py @@ -45,12 +45,13 @@ def setUp(self): self._mock_client = mock.Mock() self._mock_client.batch_annotate_images.return_value = None - feature_type = vision.enums.Feature.Type.TEXT_DETECTION + feature_type = vision.Feature.Type.TEXT_DETECTION self.features = [ - vision.types.Feature( - type=feature_type, max_results=3, model="builtin/stable") + vision.Feature({ + 'type': feature_type, 'max_results': 3, 'model': "builtin/stable" + }) ] - self.img_ctx = vision.types.ImageContext() + self.img_ctx = vision.ImageContext() self.min_batch_size = 1 self.max_batch_size = 1 diff --git a/sdks/python/apache_beam/ml/gcp/visionml_test_it.py b/sdks/python/apache_beam/ml/gcp/visionml_test_it.py index 4413266dcc5c..ea3fc9768ff5 100644 --- a/sdks/python/apache_beam/ml/gcp/visionml_test_it.py +++ b/sdks/python/apache_beam/ml/gcp/visionml_test_it.py @@ -47,7 +47,8 @@ def test_text_detection_with_language_hint(self): IMAGES_TO_ANNOTATE = [ 'gs://apache-beam-samples/advanced_analytics/vision/sign.jpg' ] - IMAGE_CONTEXT = [vision.types.ImageContext(language_hints=['en'])] + + IMAGE_CONTEXT = [vision.ImageContext({'language_hints': ['en']})] with TestPipeline(is_integration_test=True) as p: contexts = p | 'Create context' >> beam.Create( @@ -57,7 +58,9 @@ def test_text_detection_with_language_hint(self): p | beam.Create(IMAGES_TO_ANNOTATE) | AnnotateImage( - features=[vision.types.Feature(type='TEXT_DETECTION')], + features=[ + vision.Feature({'type_': vision.Feature.Type.TEXT_DETECTION}) + ], context_side_input=beam.pvalue.AsDict(contexts)) | beam.ParDo(extract)) diff --git a/sdks/python/apache_beam/options/pipeline_options.py 
b/sdks/python/apache_beam/options/pipeline_options.py index 54eaaf19ed8e..036613ca5469 100644 --- a/sdks/python/apache_beam/options/pipeline_options.py +++ b/sdks/python/apache_beam/options/pipeline_options.py @@ -1497,9 +1497,10 @@ def _add_argparse_args(cls, parser): 'For example, http://hostname:6066') parser.add_argument( '--spark_version', - default='2', - choices=['2', '3'], - help='Spark major version to use.') + default='3', + choices=['3', '2'], + help='Spark major version to use. ' + 'Note, Spark 2 support is deprecated') class TestOptions(PipelineOptions): diff --git a/sdks/python/apache_beam/options/pipeline_options_test.py b/sdks/python/apache_beam/options/pipeline_options_test.py index 38d362d6870f..3f51c3f52b74 100644 --- a/sdks/python/apache_beam/options/pipeline_options_test.py +++ b/sdks/python/apache_beam/options/pipeline_options_test.py @@ -218,8 +218,6 @@ def _add_argparse_args(cls, parser): parser.add_argument( '--fake_multi_option', action='append', help='fake multi option') - @unittest.skip( - "TODO(https://github.com/apache/beam/issues/21116): Flaky test.") def test_display_data(self): for case in PipelineOptionsTest.TEST_CASES: options = PipelineOptions(flags=case['flags']) diff --git a/sdks/python/apache_beam/portability/common_urns.py b/sdks/python/apache_beam/portability/common_urns.py index 199ce4d7058f..3b47f1ab1e40 100644 --- a/sdks/python/apache_beam/portability/common_urns.py +++ b/sdks/python/apache_beam/portability/common_urns.py @@ -83,3 +83,7 @@ micros_instant = LogicalTypes.Enum.MICROS_INSTANT millis_instant = LogicalTypes.Enum.MILLIS_INSTANT python_callable = LogicalTypes.Enum.PYTHON_CALLABLE +fixed_bytes = LogicalTypes.Enum.FIXED_BYTES +var_bytes = LogicalTypes.Enum.VAR_BYTES +fixed_char = LogicalTypes.Enum.FIXED_CHAR +var_char = LogicalTypes.Enum.VAR_CHAR diff --git a/sdks/python/apache_beam/runners/dask/__init__.py b/sdks/python/apache_beam/runners/dask/__init__.py new file mode 100644 index 000000000000..cce3acad34a4 --- 
/dev/null +++ b/sdks/python/apache_beam/runners/dask/__init__.py @@ -0,0 +1,16 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/sdks/python/apache_beam/runners/dask/dask_runner.py b/sdks/python/apache_beam/runners/dask/dask_runner.py new file mode 100644 index 000000000000..109c4379b45d --- /dev/null +++ b/sdks/python/apache_beam/runners/dask/dask_runner.py @@ -0,0 +1,182 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""DaskRunner, executing remote jobs on Dask.distributed. 
+ +The DaskRunner is a runner implementation that executes a graph of +transformations across processes and workers via Dask distributed's +scheduler. +""" +import argparse +import dataclasses +import typing as t + +from apache_beam import pvalue +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.pipeline import AppliedPTransform +from apache_beam.pipeline import PipelineVisitor +from apache_beam.runners.dask.overrides import dask_overrides +from apache_beam.runners.dask.transform_evaluator import TRANSLATIONS +from apache_beam.runners.dask.transform_evaluator import NoOp +from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner +from apache_beam.runners.runner import PipelineResult +from apache_beam.runners.runner import PipelineState +from apache_beam.utils.interactive_utils import is_in_notebook + + +class DaskOptions(PipelineOptions): + @staticmethod + def _parse_timeout(candidate): + try: + return int(candidate) + except (TypeError, ValueError): + import dask + return dask.config.no_default + + @classmethod + def _add_argparse_args(cls, parser: argparse.ArgumentParser) -> None: + parser.add_argument( + '--dask_client_address', + dest='address', + type=str, + default=None, + help='Address of a dask Scheduler server. Will default to a ' + '`dask.LocalCluster()`.') + parser.add_argument( + '--dask_connection_timeout', + dest='timeout', + type=DaskOptions._parse_timeout, + help='Timeout duration for initial connection to the scheduler.') + parser.add_argument( + '--dask_scheduler_file', + dest='scheduler_file', + type=str, + default=None, + help='Path to a file with scheduler information if available.') + # TODO(alxr): Add options for security. 
+ parser.add_argument( + '--dask_client_name', + dest='name', + type=str, + default=None, + help='Gives the client a name that will be included in logs generated ' + 'on the scheduler for matters relating to this client.') + parser.add_argument( + '--dask_connection_limit', + dest='connection_limit', + type=int, + default=512, + help='The number of open comms to maintain at once in the connection ' + 'pool.') + + +@dataclasses.dataclass +class DaskRunnerResult(PipelineResult): + from dask import distributed + + client: distributed.Client + futures: t.Sequence[distributed.Future] + + def __post_init__(self): + super().__init__(PipelineState.RUNNING) + + def wait_until_finish(self, duration=None) -> str: + try: + if duration is not None: + # Convert milliseconds to seconds + duration /= 1000 + self.client.wait_for_workers(timeout=duration) + self.client.gather(self.futures, errors='raise') + self._state = PipelineState.DONE + except: # pylint: disable=broad-except + self._state = PipelineState.FAILED + raise + return self._state + + def cancel(self) -> str: + self._state = PipelineState.CANCELLING + self.client.cancel(self.futures) + self._state = PipelineState.CANCELLED + return self._state + + def metrics(self): + # TODO(alxr): Collect and return metrics... 
+ raise NotImplementedError('collecting metrics will come later!') + + +class DaskRunner(BundleBasedDirectRunner): + """Executes a pipeline on a Dask distributed client.""" + @staticmethod + def to_dask_bag_visitor() -> PipelineVisitor: + from dask import bag as db + + @dataclasses.dataclass + class DaskBagVisitor(PipelineVisitor): + bags: t.Dict[AppliedPTransform, + db.Bag] = dataclasses.field(default_factory=dict) + + def visit_transform(self, transform_node: AppliedPTransform) -> None: + op_class = TRANSLATIONS.get(transform_node.transform.__class__, NoOp) + op = op_class(transform_node) + + inputs = list(transform_node.inputs) + if inputs: + bag_inputs = [] + for input_value in inputs: + if isinstance(input_value, pvalue.PBegin): + bag_inputs.append(None) + + prev_op = input_value.producer + if prev_op in self.bags: + bag_inputs.append(self.bags[prev_op]) + + if len(bag_inputs) == 1: + self.bags[transform_node] = op.apply(bag_inputs[0]) + else: + self.bags[transform_node] = op.apply(bag_inputs) + + else: + self.bags[transform_node] = op.apply(None) + + return DaskBagVisitor() + + @staticmethod + def is_fnapi_compatible(): + return False + + def run_pipeline(self, pipeline, options): + # TODO(alxr): Create interactive notebook support. + if is_in_notebook(): + raise NotImplementedError('interactive support will come later!') + + try: + import dask.distributed as ddist + except ImportError: + raise ImportError( + 'DaskRunner is not available. 
Please install apache_beam[dask].') + + dask_options = options.view_as(DaskOptions).get_all_options( + drop_default=True) + client = ddist.Client(**dask_options) + + pipeline.replace_all(dask_overrides()) + + dask_visitor = self.to_dask_bag_visitor() + pipeline.visit(dask_visitor) + + futures = client.compute(list(dask_visitor.bags.values())) + return DaskRunnerResult(client, futures) diff --git a/sdks/python/apache_beam/runners/dask/dask_runner_test.py b/sdks/python/apache_beam/runners/dask/dask_runner_test.py new file mode 100644 index 000000000000..d8b3e17d8a56 --- /dev/null +++ b/sdks/python/apache_beam/runners/dask/dask_runner_test.py @@ -0,0 +1,94 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import inspect +import unittest + +import apache_beam as beam +from apache_beam.options.pipeline_options import PipelineOptions +from apache_beam.testing import test_pipeline +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import equal_to + +try: + from apache_beam.runners.dask.dask_runner import DaskOptions + from apache_beam.runners.dask.dask_runner import DaskRunner + import dask + import dask.distributed as ddist +except (ImportError, ModuleNotFoundError): + raise unittest.SkipTest('Dask must be installed to run tests.') + + +class DaskOptionsTest(unittest.TestCase): + def test_parses_connection_timeout__defaults_to_none(self): + default_options = PipelineOptions([]) + default_dask_options = default_options.view_as(DaskOptions) + self.assertEqual(None, default_dask_options.timeout) + + def test_parses_connection_timeout__parses_int(self): + conn_options = PipelineOptions('--dask_connection_timeout 12'.split()) + dask_conn_options = conn_options.view_as(DaskOptions) + self.assertEqual(12, dask_conn_options.timeout) + + def test_parses_connection_timeout__handles_bad_input(self): + err_options = PipelineOptions('--dask_connection_timeout foo'.split()) + dask_err_options = err_options.view_as(DaskOptions) + self.assertEqual(dask.config.no_default, dask_err_options.timeout) + + def test_parser_destinations__agree_with_dask_client(self): + options = PipelineOptions( + '--dask_client_address localhost:8080 --dask_connection_timeout 600 ' + '--dask_scheduler_file foobar.cfg --dask_client_name charlie ' + '--dask_connection_limit 1024'.split()) + dask_options = options.view_as(DaskOptions) + + # Get the argument names for the constructor. 
+ client_args = list(inspect.signature(ddist.Client).parameters) + + for opt_name in dask_options.get_all_options(drop_default=True).keys(): + with self.subTest(f'{opt_name} in dask.distributed.Client constructor'): + self.assertIn(opt_name, client_args) + + +class DaskRunnerRunPipelineTest(unittest.TestCase): + """Test class used to introspect the dask runner via a debugger.""" + def setUp(self) -> None: + self.pipeline = test_pipeline.TestPipeline(runner=DaskRunner()) + + def test_create(self): + with self.pipeline as p: + pcoll = p | beam.Create([1]) + assert_that(pcoll, equal_to([1])) + + def test_create_and_map(self): + def double(x): + return x * 2 + + with self.pipeline as p: + pcoll = p | beam.Create([1]) | beam.Map(double) + assert_that(pcoll, equal_to([2])) + + def test_create_map_and_groupby(self): + def double(x): + return x * 2, x + + with self.pipeline as p: + pcoll = p | beam.Create([1]) | beam.Map(double) | beam.GroupByKey() + assert_that(pcoll, equal_to([(2, [1])])) + + +if __name__ == '__main__': + unittest.main() diff --git a/sdks/python/apache_beam/runners/dask/overrides.py b/sdks/python/apache_beam/runners/dask/overrides.py new file mode 100644 index 000000000000..d07c7cd518af --- /dev/null +++ b/sdks/python/apache_beam/runners/dask/overrides.py @@ -0,0 +1,145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +import dataclasses +import typing as t + +import apache_beam as beam +from apache_beam import typehints +from apache_beam.io.iobase import SourceBase +from apache_beam.pipeline import AppliedPTransform +from apache_beam.pipeline import PTransformOverride +from apache_beam.runners.direct.direct_runner import _GroupAlsoByWindowDoFn +from apache_beam.transforms import ptransform +from apache_beam.transforms.window import GlobalWindows + +K = t.TypeVar("K") +V = t.TypeVar("V") + + +@dataclasses.dataclass +class _Create(beam.PTransform): + values: t.Tuple[t.Any] + + def expand(self, input_or_inputs): + return beam.pvalue.PCollection.from_(input_or_inputs) + + def get_windowing(self, inputs: t.Any) -> beam.Windowing: + return beam.Windowing(GlobalWindows()) + + +@typehints.with_input_types(K) +@typehints.with_output_types(K) +class _Reshuffle(beam.PTransform): + def expand(self, input_or_inputs): + return beam.pvalue.PCollection.from_(input_or_inputs) + + +@dataclasses.dataclass +class _Read(beam.PTransform): + source: SourceBase + + def expand(self, input_or_inputs): + return beam.pvalue.PCollection.from_(input_or_inputs) + + +@typehints.with_input_types(t.Tuple[K, V]) +@typehints.with_output_types(t.Tuple[K, t.Iterable[V]]) +class _GroupByKeyOnly(beam.PTransform): + def expand(self, input_or_inputs): + return beam.pvalue.PCollection.from_(input_or_inputs) + + def infer_output_type(self, input_type): + + key_type, value_type = typehints.trivial_inference.key_value_types( + input_type + ) + return typehints.KV[key_type, typehints.Iterable[value_type]] + + +@typehints.with_input_types(t.Tuple[K, t.Iterable[V]]) +@typehints.with_output_types(t.Tuple[K, t.Iterable[V]]) +class _GroupAlsoByWindow(beam.ParDo): + """Not used yet...""" + def __init__(self, windowing): + super().__init__(_GroupAlsoByWindowDoFn(windowing)) + self.windowing = windowing + + def expand(self, 
input_or_inputs): + return beam.pvalue.PCollection.from_(input_or_inputs) + + +@typehints.with_input_types(t.Tuple[K, V]) +@typehints.with_output_types(t.Tuple[K, t.Iterable[V]]) +class _GroupByKey(beam.PTransform): + def expand(self, input_or_inputs): + return input_or_inputs | "GroupByKey" >> _GroupByKeyOnly() + + +class _Flatten(beam.PTransform): + def expand(self, input_or_inputs): + is_bounded = all(pcoll.is_bounded for pcoll in input_or_inputs) + return beam.pvalue.PCollection(self.pipeline, is_bounded=is_bounded) + + +def dask_overrides() -> t.List[PTransformOverride]: + class CreateOverride(PTransformOverride): + def matches(self, applied_ptransform: AppliedPTransform) -> bool: + return applied_ptransform.transform.__class__ == beam.Create + + def get_replacement_transform_for_applied_ptransform( + self, applied_ptransform: AppliedPTransform) -> ptransform.PTransform: + return _Create(t.cast(beam.Create, applied_ptransform.transform).values) + + class ReshuffleOverride(PTransformOverride): + def matches(self, applied_ptransform: AppliedPTransform) -> bool: + return applied_ptransform.transform.__class__ == beam.Reshuffle + + def get_replacement_transform_for_applied_ptransform( + self, applied_ptransform: AppliedPTransform) -> ptransform.PTransform: + return _Reshuffle() + + class ReadOverride(PTransformOverride): + def matches(self, applied_ptransform: AppliedPTransform) -> bool: + return applied_ptransform.transform.__class__ == beam.io.Read + + def get_replacement_transform_for_applied_ptransform( + self, applied_ptransform: AppliedPTransform) -> ptransform.PTransform: + return _Read(t.cast(beam.io.Read, applied_ptransform.transform).source) + + class GroupByKeyOverride(PTransformOverride): + def matches(self, applied_ptransform: AppliedPTransform) -> bool: + return applied_ptransform.transform.__class__ == beam.GroupByKey + + def get_replacement_transform_for_applied_ptransform( + self, applied_ptransform: AppliedPTransform) -> ptransform.PTransform: + 
return _GroupByKey() + + class FlattenOverride(PTransformOverride): + def matches(self, applied_ptransform: AppliedPTransform) -> bool: + return applied_ptransform.transform.__class__ == beam.Flatten + + def get_replacement_transform_for_applied_ptransform( + self, applied_ptransform: AppliedPTransform) -> ptransform.PTransform: + return _Flatten() + + return [ + CreateOverride(), + ReshuffleOverride(), + ReadOverride(), + GroupByKeyOverride(), + FlattenOverride(), + ] diff --git a/sdks/python/apache_beam/runners/dask/transform_evaluator.py b/sdks/python/apache_beam/runners/dask/transform_evaluator.py new file mode 100644 index 000000000000..c4aac7f2111f --- /dev/null +++ b/sdks/python/apache_beam/runners/dask/transform_evaluator.py @@ -0,0 +1,103 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Transform Beam PTransforms into Dask Bag operations. + +A minimum set of operation substitutions, to adap Beam's PTransform model +to Dask Bag functions. + +TODO(alxr): Translate ops from https://docs.dask.org/en/latest/bag-api.html. 
+""" +import abc +import dataclasses +import typing as t + +import apache_beam +import dask.bag as db +from apache_beam.pipeline import AppliedPTransform +from apache_beam.runners.dask.overrides import _Create +from apache_beam.runners.dask.overrides import _Flatten +from apache_beam.runners.dask.overrides import _GroupByKeyOnly + +OpInput = t.Union[db.Bag, t.Sequence[db.Bag], None] + + +@dataclasses.dataclass +class DaskBagOp(abc.ABC): + applied: AppliedPTransform + + @property + def transform(self): + return self.applied.transform + + @abc.abstractmethod + def apply(self, input_bag: OpInput) -> db.Bag: + pass + + +class NoOp(DaskBagOp): + def apply(self, input_bag: OpInput) -> db.Bag: + return input_bag + + +class Create(DaskBagOp): + def apply(self, input_bag: OpInput) -> db.Bag: + assert input_bag is None, 'Create expects no input!' + original_transform = t.cast(_Create, self.transform) + items = original_transform.values + return db.from_sequence(items) + + +class ParDo(DaskBagOp): + def apply(self, input_bag: db.Bag) -> db.Bag: + transform = t.cast(apache_beam.ParDo, self.transform) + return input_bag.map( + transform.fn.process, *transform.args, **transform.kwargs).flatten() + + +class Map(DaskBagOp): + def apply(self, input_bag: db.Bag) -> db.Bag: + transform = t.cast(apache_beam.Map, self.transform) + return input_bag.map( + transform.fn.process, *transform.args, **transform.kwargs) + + +class GroupByKey(DaskBagOp): + def apply(self, input_bag: db.Bag) -> db.Bag: + def key(item): + return item[0] + + def value(item): + k, v = item + return k, [elm[1] for elm in v] + + return input_bag.groupby(key).map(value) + + +class Flatten(DaskBagOp): + def apply(self, input_bag: OpInput) -> db.Bag: + assert type(input_bag) is list, 'Must take a sequence of bags!' 
+ return db.concat(input_bag) + + +TRANSLATIONS = { + _Create: Create, + apache_beam.ParDo: ParDo, + apache_beam.Map: Map, + _GroupByKeyOnly: GroupByKey, + _Flatten: Flatten, +} diff --git a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py index e16c88ea9ee2..d581c48cee13 100644 --- a/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py +++ b/sdks/python/apache_beam/runners/dataflow/dataflow_runner.py @@ -1207,8 +1207,6 @@ def run_Read(self, transform_node, options): traceback.format_exc()) step.add_property(PropertyNames.SOURCE_STEP_INPUT, source_dict) - elif transform.source.format == 'text': - step.add_property(PropertyNames.FILE_PATTERN, transform.source.path) elif transform.source.format == 'pubsub': if not standard_options.streaming: raise ValueError( @@ -1274,54 +1272,7 @@ def run__NativeWrite(self, transform_node, options): TransformNames.WRITE, transform_node.full_label, transform_node) # TODO(mairbek): refactor if-else tree to use registerable functions. # Initialize the sink specific properties. - if transform.sink.format == 'text': - # Note that it is important to use typed properties (@type/value dicts) - # for non-string properties and also for empty strings. For example, - # in the code below the num_shards must have type and also - # file_name_suffix and shard_name_template (could be empty strings). - step.add_property( - PropertyNames.FILE_NAME_PREFIX, - transform.sink.file_name_prefix, - with_type=True) - step.add_property( - PropertyNames.FILE_NAME_SUFFIX, - transform.sink.file_name_suffix, - with_type=True) - step.add_property( - PropertyNames.SHARD_NAME_TEMPLATE, - transform.sink.shard_name_template, - with_type=True) - if transform.sink.num_shards > 0: - step.add_property( - PropertyNames.NUM_SHARDS, transform.sink.num_shards, with_type=True) - # TODO(silviuc): Implement sink validation. 
- step.add_property(PropertyNames.VALIDATE_SINK, False, with_type=True) - elif transform.sink.format == 'bigquery': - # TODO(silviuc): Add table validation if transform.sink.validate. - step.add_property( - PropertyNames.BIGQUERY_DATASET, - transform.sink.table_reference.datasetId) - step.add_property( - PropertyNames.BIGQUERY_TABLE, transform.sink.table_reference.tableId) - # If project owning the table was not specified then the project owning - # the workflow (current project) will be used. - if transform.sink.table_reference.projectId is not None: - step.add_property( - PropertyNames.BIGQUERY_PROJECT, - transform.sink.table_reference.projectId) - step.add_property( - PropertyNames.BIGQUERY_CREATE_DISPOSITION, - transform.sink.create_disposition) - step.add_property( - PropertyNames.BIGQUERY_WRITE_DISPOSITION, - transform.sink.write_disposition) - if transform.sink.table_schema is not None: - step.add_property( - PropertyNames.BIGQUERY_SCHEMA, transform.sink.schema_as_json()) - if transform.sink.kms_key is not None: - step.add_property( - PropertyNames.BIGQUERY_KMS_KEY, transform.sink.kms_key) - elif transform.sink.format == 'pubsub': + if transform.sink.format == 'pubsub': standard_options = options.view_as(StandardOptions) if not standard_options.streaming: raise ValueError( diff --git a/sdks/python/apache_beam/runners/dataflow/internal/names.py b/sdks/python/apache_beam/runners/dataflow/internal/names.py index 4ec94616d6ab..aa6de4f7085e 100644 --- a/sdks/python/apache_beam/runners/dataflow/internal/names.py +++ b/sdks/python/apache_beam/runners/dataflow/internal/names.py @@ -36,10 +36,10 @@ # Update this version to the next version whenever there is a change that will # require changes to legacy Dataflow worker execution environment. 
-BEAM_CONTAINER_VERSION = 'beam-master-20221018' +BEAM_CONTAINER_VERSION = 'beam-master-20221021' # Update this version to the next version whenever there is a change that # requires changes to SDK harness container or SDK harness launcher. -BEAM_FNAPI_CONTAINER_VERSION = 'beam-master-20221018' +BEAM_FNAPI_CONTAINER_VERSION = 'beam-master-20221021' DATAFLOW_CONTAINER_IMAGE_REPOSITORY = 'gcr.io/cloud-dataflow/v1beta3' diff --git a/sdks/python/apache_beam/runners/portability/expansion_service_test.py b/sdks/python/apache_beam/runners/portability/expansion_service_test.py index e99a7aa90c7c..7aa2e5f16e5b 100644 --- a/sdks/python/apache_beam/runners/portability/expansion_service_test.py +++ b/sdks/python/apache_beam/runners/portability/expansion_service_test.py @@ -376,6 +376,8 @@ def cleanup(unused_signum, unused_frame): def main(unused_argv): + # TODO: use the regular expansion service (expansion_service_main) instead of + # this custom service for testing. PyPIArtifactRegistry.register_artifact('beautifulsoup4', '>=4.9,<5.0') parser = argparse.ArgumentParser() parser.add_argument( @@ -388,8 +390,14 @@ def main(unused_argv): options.fully_qualified_name_glob): server = grpc.server(thread_pool_executor.shared_unbounded_instance()) expansion_servicer = expansion_service.ExpansionServiceServicer( - PipelineOptions( - ["--experiments", "beam_fn_api", "--sdk_location", "container"])) + PipelineOptions([ + "--experiments", + "beam_fn_api", + "--sdk_location", + "container", + "--pickle_library", + "cloudpickle" + ])) update_sklearn_model_dependency(expansion_servicer._default_environment) beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server( expansion_servicer, server) diff --git a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py index eae78747f628..a76cdaf997a5 100644 --- a/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py +++ 
b/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner_test.py @@ -165,6 +165,7 @@ def get_output_batch_type(self, input_element_type): assert_that(res, equal_to([6, 12, 18])) + @unittest.skip('https://github.com/apache/beam/issues/23944') def test_batch_pardo_trigger_flush(self): try: utils.check_compiled('apache_beam.coders.coder_impl') @@ -368,6 +369,7 @@ def infer_output_type(self, input_type): assert_that(res, equal_to([6, 12, 12, 18, 18, 18])) + @unittest.skip('https://github.com/apache/beam/issues/23944') def test_pardo_large_input(self): try: utils.check_compiled('apache_beam.coders.coder_impl') diff --git a/sdks/python/apache_beam/runners/portability/spark_runner.py b/sdks/python/apache_beam/runners/portability/spark_runner.py index b1d754d89836..b4c46c0dac06 100644 --- a/sdks/python/apache_beam/runners/portability/spark_runner.py +++ b/sdks/python/apache_beam/runners/portability/spark_runner.py @@ -88,15 +88,15 @@ def path_to_jar(self): 'Unable to parse jar URL "%s". If using a full URL, make sure ' 'the scheme is specified. If using a local file path, make sure ' 'the file exists; you may have to first build the job server ' - 'using `./gradlew runners:spark:2:job-server:shadowJar`.' % + 'using `./gradlew runners:spark:3:job-server:shadowJar`.' 
% self._jar) return self._jar else: - if self._spark_version == '3': - return self.path_to_beam_jar(':runners:spark:3:job-server:shadowJar') - return self.path_to_beam_jar( - ':runners:spark:2:job-server:shadowJar', - artifact_id='beam-runners-spark-job-server') + if self._spark_version == '2': + return self.path_to_beam_jar( + ':runners:spark:2:job-server:shadowJar', + artifact_id='beam-runners-spark-job-server') + return self.path_to_beam_jar(':runners:spark:3:job-server:shadowJar') def java_arguments( self, job_port, artifact_port, expansion_port, artifacts_dir): diff --git a/sdks/python/apache_beam/runners/portability/spark_runner_test.py b/sdks/python/apache_beam/runners/portability/spark_runner_test.py index 488222f2f2fa..5530caa1e971 100644 --- a/sdks/python/apache_beam/runners/portability/spark_runner_test.py +++ b/sdks/python/apache_beam/runners/portability/spark_runner_test.py @@ -84,7 +84,7 @@ def parse_options(self, request): self.set_spark_job_server_jar( known_args.spark_job_server_jar or job_server.JavaJarJobServer.path_to_beam_jar( - ':runners:spark:2:job-server:shadowJar')) + ':runners:spark:3:job-server:shadowJar')) self.environment_type = known_args.environment_type self.environment_options = known_args.environment_options diff --git a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py index 832f3142cb63..97fa6b629cee 100644 --- a/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py +++ b/sdks/python/apache_beam/runners/portability/spark_uber_jar_job_server.py @@ -69,17 +69,17 @@ def executable_jar(self): 'Unable to parse jar URL "%s". If using a full URL, make sure ' 'the scheme is specified. If using a local file path, make sure ' 'the file exists; you may have to first build the job server ' - 'using `./gradlew runners:spark:2:job-server:shadowJar`.' % + 'using `./gradlew runners:spark:3:job-server:shadowJar`.' 
% self._executable_jar) url = self._executable_jar else: - if self._spark_version == '3': - url = job_server.JavaJarJobServer.path_to_beam_jar( - ':runners:spark:3:job-server:shadowJar') - else: + if self._spark_version == '2': url = job_server.JavaJarJobServer.path_to_beam_jar( ':runners:spark:2:job-server:shadowJar', artifact_id='beam-runners-spark-job-server') + else: + url = job_server.JavaJarJobServer.path_to_beam_jar( + ':runners:spark:3:job-server:shadowJar') return job_server.JavaJarJobServer.local_jar(url) def create_beam_job(self, job_id, job_name, pipeline, options): diff --git a/sdks/python/apache_beam/runners/portability/stager.py b/sdks/python/apache_beam/runners/portability/stager.py index e06c71c917d2..abcef4679c20 100644 --- a/sdks/python/apache_beam/runners/portability/stager.py +++ b/sdks/python/apache_beam/runners/portability/stager.py @@ -224,9 +224,14 @@ def create_job_resources(options, # type: PipelineOptions 'The file %s cannot be found. It was specified in the ' '--requirements_file command line option.' % setup_options.requirements_file) + extra_packages, thinned_requirements_file = ( + Stager._extract_local_packages(setup_options.requirements_file)) + if extra_packages: + setup_options.extra_packages = ( + setup_options.extra_packages or []) + extra_packages resources.append( Stager._create_file_stage_to_artifact( - setup_options.requirements_file, REQUIREMENTS_FILE)) + thinned_requirements_file, REQUIREMENTS_FILE)) # Populate cache with packages from the requirement file option and # stage the files in the cache. 
if not use_beam_default_container: @@ -683,6 +688,25 @@ def _remove_dependency_from_requirements( return tmp_requirements_filename + @staticmethod + def _extract_local_packages(requirements_file): + local_deps = [] + pypi_deps = [] + with open(requirements_file, 'r') as fin: + for line in fin: + dep = line.strip() + if os.path.exists(dep): + local_deps.append(dep) + else: + pypi_deps.append(dep) + if local_deps: + with tempfile.NamedTemporaryFile(suffix='-requirements.txt', + delete=False) as fout: + fout.write('\n'.join(pypi_deps).encode('utf-8')) + return local_deps, fout.name + else: + return [], requirements_file + @staticmethod def _get_platform_for_default_sdk_container(): """ diff --git a/sdks/python/apache_beam/runners/portability/stager_test.py b/sdks/python/apache_beam/runners/portability/stager_test.py index b221bb1ec6f6..c1806c384941 100644 --- a/sdks/python/apache_beam/runners/portability/stager_test.py +++ b/sdks/python/apache_beam/runners/portability/stager_test.py @@ -832,6 +832,49 @@ def test_populate_requirements_cache_with_sdist(self): self.assertTrue('.tar.gz' in f) self.assertTrue('.whl' not in f) + def test_populate_requirements_cache_with_local_files(self): + staging_dir = self.make_temp_dir() + requirements_cache_dir = self.make_temp_dir() + source_dir = self.make_temp_dir() + pkg_dir = self.make_temp_dir() + + options = PipelineOptions() + self.update_options(options) + + options.view_as(SetupOptions).requirements_cache = requirements_cache_dir + options.view_as(SetupOptions).requirements_file = os.path.join( + source_dir, stager.REQUIREMENTS_FILE) + local_package = os.path.join(pkg_dir, 'local_package.tar.gz') + self.create_temp_file(local_package, 'local-package-content') + self.create_temp_file( + os.path.join(source_dir, stager.REQUIREMENTS_FILE), + '\n'.join(['fake_pypi', local_package])) + with mock.patch('apache_beam.runners.portability.stager_test' + '.stager.Stager._populate_requirements_cache', + 
staticmethod(self._populate_requitements_cache_fake)): + options.view_as(SetupOptions).requirements_cache_only_sources = True + resources = self.stager.create_and_stage_job_resources( + options, staging_location=staging_dir)[1] + + self.assertEqual( + sorted([ + stager.REQUIREMENTS_FILE, + stager.EXTRA_PACKAGES_FILE, + 'nothing.tar.gz', + 'local_package.tar.gz' + ]), + sorted(resources)) + + with open(os.path.join(staging_dir, stager.REQUIREMENTS_FILE)) as fin: + requirements_contents = fin.read() + self.assertIn('fake_pypi', requirements_contents) + self.assertNotIn('local_package', requirements_contents) + + with open(os.path.join(staging_dir, stager.EXTRA_PACKAGES_FILE)) as fin: + extra_packages_contents = fin.read() + self.assertNotIn('fake_pypi', extra_packages_contents) + self.assertIn('local_package', extra_packages_contents) + class TestStager(stager.Stager): def stage_artifact(self, local_path_to_artifact, artifact_name, sha256): diff --git a/sdks/python/apache_beam/runners/worker/statecache.py b/sdks/python/apache_beam/runners/worker/statecache.py index e3f37fec1144..dde4243057dd 100644 --- a/sdks/python/apache_beam/runners/worker/statecache.py +++ b/sdks/python/apache_beam/runners/worker/statecache.py @@ -176,7 +176,7 @@ def get_deep_size(*objs): """Calculates the deep size of all the arguments in bytes.""" return objsize.get_deep_size( - objs, + *objs, get_size_func=_size_func, get_referents_func=_get_referents_func, filter_func=_filter_func) @@ -274,6 +274,7 @@ def get(self, key, loading_fn): self._miss_count += 1 loading_value = _LoadingValue() self._cache[key] = loading_value + self._current_weight += loading_value.weight() # Ensure that we unlock the lock while loading to allow for parallel gets self._lock.release() diff --git a/sdks/python/apache_beam/runners/worker/statecache_test.py b/sdks/python/apache_beam/runners/worker/statecache_test.py index 6850cb212840..a5d1ff2e01e3 100644 --- a/sdks/python/apache_beam/runners/worker/statecache_test.py 
+++ b/sdks/python/apache_beam/runners/worker/statecache_test.py @@ -20,11 +20,13 @@ import logging import re +import sys import threading import time import unittest import weakref +import objsize from hamcrest import assert_that from hamcrest import contains_string @@ -32,6 +34,7 @@ from apache_beam.runners.worker.statecache import StateCache from apache_beam.runners.worker.statecache import WeightedValue from apache_beam.runners.worker.statecache import _LoadingValue +from apache_beam.runners.worker.statecache import get_deep_size class StateCacheTest(unittest.TestCase): @@ -356,6 +359,54 @@ def get_referents_for_cache(self): 'used/max 1/5 MB, hit 100.00%, lookups 0, ' 'avg load time 0 ns, loads 0, evictions 0')) + def test_get_deep_size_builtin_objects(self): + """ + `statecache.get_deep_copy` should work same with objsize unless the `objs` + has `CacheAware` or a filtered object. They should return the same size for + built-in objects. + """ + primitive_test_objects = [ + 1, # int + 2.0, # float + 1+1j, # complex + True, # bool + 'hello,world', # str + b'\00\01\02', # bytes + ] + + collection_test_objects = [ + [3, 4, 5], # list + (6, 7), # tuple + {'a', 'b', 'c'}, # set + {'k': 8, 'l': 9}, # dict + ] + + for obj in primitive_test_objects: + self.assertEqual( + get_deep_size(obj), + objsize.get_deep_size(obj), + f'different size for obj: `{obj}`, type: {type(obj)}') + self.assertEqual( + get_deep_size(obj), + sys.getsizeof(obj), + f'different size for obj: `{obj}`, type: {type(obj)}') + + for obj in collection_test_objects: + self.assertEqual( + get_deep_size(obj), + objsize.get_deep_size(obj), + f'different size for obj: `{obj}`, type: {type(obj)}') + + def test_current_weight_between_get_and_put(self): + value = 1234567 + get_cache = StateCache(100) + get_cache.get("key", lambda k: value) + + put_cache = StateCache(100) + put_cache.put("key", value) + + self.assertEqual(get_cache._current_weight, put_cache._current_weight) + if __name__ == '__main__': 
logging.getLogger().setLevel(logging.INFO) diff --git a/sdks/python/apache_beam/runners/worker/worker_status.py b/sdks/python/apache_beam/runners/worker/worker_status.py index 7604bd0867a5..40f0d927c0ee 100644 --- a/sdks/python/apache_beam/runners/worker/worker_status.py +++ b/sdks/python/apache_beam/runners/worker/worker_status.py @@ -230,7 +230,7 @@ def close(self): def _log_lull_in_bundle_processor(self, bundle_process_cache): while True: time.sleep(2 * 60) - if bundle_process_cache.active_bundle_processors: + if bundle_process_cache and bundle_process_cache.active_bundle_processors: for instruction in list( bundle_process_cache.active_bundle_processors.keys()): processor = bundle_process_cache.lookup(instruction) diff --git a/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py b/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py index e4babe5f42e8..04c6d325e194 100644 --- a/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py +++ b/sdks/python/apache_beam/testing/benchmarks/nexmark/nexmark_launcher.py @@ -60,10 +60,15 @@ # pytype: skip-file import argparse +import json import logging +import os import time import uuid +import requests +from requests.auth import HTTPBasicAuth + import apache_beam as beam from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import PipelineOptions @@ -123,6 +128,13 @@ def __init__(self): logging.info('creating sub %s', self.topic_name) sub.create() + self.export_influxdb = self.args.export_summary_to_influxdb + if self.export_influxdb: + self.influx_database = self.args.influx_database + self.influx_host = self.args.influx_host + self.influx_base = self.args.base_influx_measurement + self.influx_retention = self.args.influx_retention_policy + def parse_args(self): parser = argparse.ArgumentParser() @@ -170,6 +182,32 @@ def parse_args(self): choices=['PUBLISH_ONLY', 'SUBSCRIBE_ONLY', 'COMBINED'], help='Pubsub mode used 
in the pipeline.') + parser.add_argument( + '--export_summary_to_influxdb', + default=False, + action='store_true', + help='If set store results in influxdb') + parser.add_argument( + '--influx_database', + type=str, + default='beam_test_metrics', + help='Influx database name') + parser.add_argument( + '--influx_host', + type=str, + default='http://localhost:8086', + help='Influx database url') + parser.add_argument( + '--base_influx_measurement', + type=str, + default='nexmark', + help='Prefix to influx measurement') + parser.add_argument( + '--influx_retention_policy', + type=str, + default='forever', + help='Retention policy for stored results') + self.args, self.pipeline_args = parser.parse_known_args() logging.basicConfig( level=getattr(logging, self.args.loglevel, None), @@ -243,7 +281,8 @@ def read_from_pubsub(self): | 'deserialization' >> beam.ParDo(nexmark_util.ParseJsonEventFn())) return events - def run_query(self, query, query_args, pipeline_options, query_errors): + def run_query( + self, query_num, query, query_args, pipeline_options, query_errors): try: self.pipeline = beam.Pipeline(options=self.pipeline_options) nexmark_util.setup_coder() @@ -269,6 +308,8 @@ def run_query(self, query, query_args, pipeline_options, query_errors): result.wait_until_finish() perf = self.monitor(result, event_monitor, result_monitor) self.log_performance(perf) + if self.export_influxdb: + self.publish_performance_influxdb(query_num, perf) except Exception as exc: query_errors.append(str(exc)) @@ -349,6 +390,47 @@ def log_performance(perf): 'query run took %.1f seconds and processed %.1f events per second' % (perf.runtime_sec, perf.event_per_sec)) + def publish_performance_influxdb(self, query_num, perf): + processingMode = "streaming" if self.streaming else "batch" + measurement = "%s_%d_python_%s" % ( + self.influx_base, query_num, processingMode) + + tags = {'runner': self.pipeline_options.view_as(StandardOptions).runner} + + mt = ','.join([measurement] + [k + "=" + v 
for k, v in tags.items()]) + + fields = { + 'numResults': "%di" % (perf.result_count), + 'runtimeMs': "%di" % (perf.runtime_sec * 1000), + } + + ts = int(time.time()) + payload = '\n'.join( + ["%s %s=%s %d" % (mt, k, v, ts) for k, v in fields.items()]) + + url = '%s/write' % (self.influx_host) + query_str = { + 'db': self.influx_database, + 'rp': self.influx_retention, + 'precision': 's', + } + + user = os.getenv('INFLUXDB_USER') + password = os.getenv('INFLUXDB_USER_PASSWORD') + auth = HTTPBasicAuth(user, password) + + try: + response = requests.post(url, params=query_str, data=payload, auth=auth) + except requests.exceptions.RequestException as e: + logging.warning('Failed to publish metrics to InfluxDB: ' + str(e)) + else: + if response.status_code != 204: + content = json.loads(response.content) + logging.warning( + 'Failed to publish metrics to InfluxDB. Received status code %s ' + 'with an error message: %s' % + (response.status_code, content['error'])) + @staticmethod def get_performance(result, event_monitor, result_monitor): event_count = nexmark_util.get_counter_metric( @@ -429,6 +511,7 @@ def run(self): for i in self.args.query: logging.info('Running query %d', i) self.run_query( + i, queries[i], query_args, self.pipeline_options, diff --git a/sdks/python/apache_beam/testing/test_utils.py b/sdks/python/apache_beam/testing/test_utils.py index 72067fdb5fc4..049ccc0c3d49 100644 --- a/sdks/python/apache_beam/testing/test_utils.py +++ b/sdks/python/apache_beam/testing/test_utils.py @@ -206,3 +206,20 @@ def create_pull_response(responses): res.received_messages.append(received_message) return res + + +def create_file(path, contents): + """Create a file to use as input to test pipelines""" + with FileSystems.create(path) as f: + f.write(str.encode(contents, 'utf-8')) + return path + + +def read_files_from_pattern(file_pattern): + """Reads the files that match a pattern""" + metadata_list = FileSystems.match([file_pattern])[0].metadata_list + output = [] + for 
metadata in metadata_list: + with FileSystems.open(metadata.path) as f: + output.append(f.read().decode('utf-8').strip()) + return '\n'.join(output) diff --git a/sdks/python/apache_beam/transforms/batch_dofn_test.py b/sdks/python/apache_beam/transforms/batch_dofn_test.py index de35c29024a5..d2aceb371492 100644 --- a/sdks/python/apache_beam/transforms/batch_dofn_test.py +++ b/sdks/python/apache_beam/transforms/batch_dofn_test.py @@ -41,12 +41,12 @@ def process_batch(self, batch: List[int], *args, yield [element / 2 for element in batch] -class BatchDoFnNoReturnAnnotation(beam.DoFn): +class NoReturnAnnotation(beam.DoFn): def process_batch(self, batch: List[int], *args, **kwargs): yield [element * 2 for element in batch] -class BatchDoFnOverrideTypeInference(beam.DoFn): +class OverrideTypeInference(beam.DoFn): def process_batch(self, batch, *args, **kwargs): yield [element * 2 for element in batch] @@ -104,7 +104,7 @@ def get_test_class_name(cls, num, params_dict): "expected_output_batch_type": beam.typehints.List[float] }, { - "dofn": BatchDoFnNoReturnAnnotation(), + "dofn": NoReturnAnnotation(), "input_element_type": int, "expected_process_defined": False, "expected_process_batch_defined": True, @@ -112,7 +112,7 @@ def get_test_class_name(cls, num, params_dict): "expected_output_batch_type": beam.typehints.List[int] }, { - "dofn": BatchDoFnOverrideTypeInference(), + "dofn": OverrideTypeInference(), "input_element_type": int, "expected_process_defined": False, "expected_process_batch_defined": True, @@ -168,7 +168,7 @@ def test_can_yield_batches(self): self.assertEqual(self.dofn._can_yield_batches, expected) -class BatchDoFnNoInputAnnotation(beam.DoFn): +class NoInputAnnotation(beam.DoFn): def process_batch(self, batch, *args, **kwargs): yield [element * 2 for element in batch] @@ -198,6 +198,12 @@ def process_batch(self, batch: List[int], *args, **kwargs) -> Iterator[int]: yield batch[0] +class NoElementOutputAnnotation(beam.DoFn): + def process_batch(self, batch: 
List[int], *args, + **kwargs) -> Iterator[List[int]]: + yield [element * 2 for element in batch] + + class BatchDoFnTest(unittest.TestCase): def test_map_pardo(self): # verify batch dofn accessors work well with beam.Map generated DoFn @@ -213,12 +219,11 @@ def test_no_input_annotation_raises(self): p = beam.Pipeline() pc = p | beam.Create([1, 2, 3]) - with self.assertRaisesRegex(TypeError, - r'BatchDoFnNoInputAnnotation.process_batch'): - _ = pc | beam.ParDo(BatchDoFnNoInputAnnotation()) + with self.assertRaisesRegex(TypeError, r'NoInputAnnotation.process_batch'): + _ = pc | beam.ParDo(NoInputAnnotation()) def test_unsupported_dofn_param_raises(self): - class BatchDoFnBadParam(beam.DoFn): + class BadParam(beam.DoFn): @no_type_check def process_batch(self, batch: List[int], key=beam.DoFn.KeyParam): yield batch * key @@ -226,9 +231,8 @@ def process_batch(self, batch: List[int], key=beam.DoFn.KeyParam): p = beam.Pipeline() pc = p | beam.Create([1, 2, 3]) - with self.assertRaisesRegex(NotImplementedError, - r'BatchDoFnBadParam.*KeyParam'): - _ = pc | beam.ParDo(BatchDoFnBadParam()) + with self.assertRaisesRegex(NotImplementedError, r'BadParam.*KeyParam'): + _ = pc | beam.ParDo(BadParam()) def test_mismatched_batch_producer_raises(self): p = beam.Pipeline() @@ -256,6 +260,27 @@ def test_mismatched_element_producer_raises(self): r'(?ms)MismatchedElementProducingDoFn.*process:.*process_batch:'): _ = pc | beam.ParDo(MismatchedElementProducingDoFn()) + def test_cant_infer_batchconverter_input_raises(self): + p = beam.Pipeline() + pc = p | beam.Create(['a', 'b', 'c']) + + with self.assertRaisesRegex( + TypeError, + # Error should mention "input", and the name of the DoFn + r'input.*BatchDoFn.*'): + _ = pc | beam.ParDo(BatchDoFn()) + + def test_cant_infer_batchconverter_output_raises(self): + p = beam.Pipeline() + pc = p | beam.Create([1, 2, 3]) + + with self.assertRaisesRegex( + TypeError, + # Error should mention "output", the name of the DoFn, and suggest + # overriding 
DoFn.infer_output_type + r'output.*NoElementOutputAnnotation.*DoFn\.infer_output_type'): + _ = pc | beam.ParDo(NoElementOutputAnnotation()) + def test_element_to_batch_dofn_typehint(self): # Verify that element to batch DoFn sets the correct typehint on the output # PCollection. diff --git a/sdks/python/apache_beam/transforms/core.py b/sdks/python/apache_beam/transforms/core.py index 50ff32e57a33..69c003ee5b8c 100644 --- a/sdks/python/apache_beam/transforms/core.py +++ b/sdks/python/apache_beam/transforms/core.py @@ -1511,10 +1511,17 @@ def infer_batch_converters(self, input_element_type): "process_batch method on {self.fn!r} does not have " "an input type annoation") - # Generate a batch converter to convert between the input type and the - # (batch) input type of process_batch - self.fn.input_batch_converter = BatchConverter.from_typehints( - element_type=input_element_type, batch_type=input_batch_type) + try: + # Generate a batch converter to convert between the input type and the + # (batch) input type of process_batch + self.fn.input_batch_converter = BatchConverter.from_typehints( + element_type=input_element_type, batch_type=input_batch_type) + except TypeError as e: + raise TypeError( + "Failed to find a BatchConverter for the input types of DoFn " + f"{self.fn!r} (element_type={input_element_type!r}, " + f"batch_type={input_batch_type!r}).") from e + else: self.fn.input_batch_converter = None @@ -1530,8 +1537,16 @@ def infer_batch_converters(self, input_element_type): # Generate a batch converter to convert between the output type and the # (batch) output type of process_batch output_element_type = self.infer_output_type(input_element_type) - self.fn.output_batch_converter = BatchConverter.from_typehints( - element_type=output_element_type, batch_type=output_batch_type) + + try: + self.fn.output_batch_converter = BatchConverter.from_typehints( + element_type=output_element_type, batch_type=output_batch_type) + except TypeError as e: + raise TypeError( + 
"Failed to find a BatchConverter for the *output* types of DoFn " + f"{self.fn!r} (element_type={output_element_type!r}, " + f"batch_type={output_batch_type!r}). Maybe you need to override " + "DoFn.infer_output_type to set the output element type?") from e else: self.fn.output_batch_converter = None diff --git a/sdks/python/apache_beam/transforms/util.py b/sdks/python/apache_beam/transforms/util.py index cb4b86245e00..dca5628118d6 100644 --- a/sdks/python/apache_beam/transforms/util.py +++ b/sdks/python/apache_beam/transforms/util.py @@ -83,6 +83,7 @@ 'Distinct', 'Keys', 'KvSwap', + 'LogElements', 'Regex', 'Reify', 'RemoveDuplicates', @@ -1105,6 +1106,49 @@ def Iterables(delimiter=None): Kvs = Iterables +@typehints.with_input_types(T) +@typehints.with_output_types(T) +class LogElements(PTransform): + """ + PTransform for printing the elements of a PCollection. + """ + class _LoggingFn(DoFn): + def __init__(self, prefix='', with_timestamp=False, with_window=False): + super().__init__() + self.prefix = prefix + self.with_timestamp = with_timestamp + self.with_window = with_window + + def process( + self, + element, + timestamp=DoFn.TimestampParam, + window=DoFn.WindowParam, + **kwargs): + log_line = self.prefix + str(element) + + if self.with_timestamp: + log_line += ', timestamp=' + repr(timestamp.to_rfc3339()) + + if self.with_window: + log_line += ', window(start=' + window.start.to_rfc3339() + log_line += ', end=' + window.end.to_rfc3339() + ')' + + print(log_line) + yield element + + def __init__( + self, label=None, prefix='', with_timestamp=False, with_window=False): + super().__init__(label) + self.prefix = prefix + self.with_timestamp = with_timestamp + self.with_window = with_window + + def expand(self, input): + return input | ParDo( + self._LoggingFn(self.prefix, self.with_timestamp, self.with_window)) + + class Reify(object): """PTransforms for converting between explicit and implicit form of various Beam values.""" diff --git 
a/sdks/python/apache_beam/transforms/util_test.py b/sdks/python/apache_beam/transforms/util_test.py index f236e3ce768e..180b7ae5f8e7 100644 --- a/sdks/python/apache_beam/transforms/util_test.py +++ b/sdks/python/apache_beam/transforms/util_test.py @@ -26,8 +26,10 @@ import time import unittest import warnings +from datetime import datetime import pytest +import pytz import apache_beam as beam from apache_beam import GroupByKey @@ -1063,6 +1065,46 @@ def test_tostring_kvs_empty_delimeter(self): assert_that(result, equal_to(["one1", "two2"])) +class LogElementsTest(unittest.TestCase): + @pytest.fixture(scope="function") + def _capture_stdout_log(request, capsys): + with TestPipeline() as p: + result = ( + p | beam.Create([ + TimestampedValue( + "event", + datetime(2022, 10, 1, 0, 0, 0, 0, + tzinfo=pytz.UTC).timestamp()), + TimestampedValue( + "event", + datetime(2022, 10, 2, 0, 0, 0, 0, + tzinfo=pytz.UTC).timestamp()), + ]) + | beam.WindowInto(FixedWindows(60)) + | util.LogElements( + prefix='prefix_', with_window=True, with_timestamp=True)) + + request.captured_stdout = capsys.readouterr().out + return result + + @pytest.mark.usefixtures("_capture_stdout_log") + def test_stdout_logs(self): + assert self.captured_stdout == \ + ("prefix_event, timestamp='2022-10-01T00:00:00Z', " + "window(start=2022-10-01T00:00:00Z, end=2022-10-01T00:01:00Z)\n" + "prefix_event, timestamp='2022-10-02T00:00:00Z', " + "window(start=2022-10-02T00:00:00Z, end=2022-10-02T00:01:00Z)\n"), \ + f'Received from stdout: {self.captured_stdout}' + + def test_ptransform_output(self): + with TestPipeline() as p: + result = ( + p + | beam.Create(['a', 'b', 'c']) + | util.LogElements(prefix='prefix_')) + assert_that(result, equal_to(['a', 'b', 'c'])) + + class ReifyTest(unittest.TestCase): def test_timestamp(self): l = [ diff --git a/sdks/python/apache_beam/typehints/__init__.py b/sdks/python/apache_beam/typehints/__init__.py index 46a8579c6c65..81ffc9f307d9 100644 --- 
a/sdks/python/apache_beam/typehints/__init__.py +++ b/sdks/python/apache_beam/typehints/__init__.py @@ -29,3 +29,10 @@ pass else: from apache_beam.typehints.pandas_type_compatibility import * + +try: + import pyarrow as _ +except ImportError: + pass +else: + from apache_beam.typehints.arrow_type_compatibility import * diff --git a/sdks/python/apache_beam/typehints/arrow_batching_microbenchmark.py b/sdks/python/apache_beam/typehints/arrow_batching_microbenchmark.py new file mode 100644 index 000000000000..d17a9b1a1818 --- /dev/null +++ b/sdks/python/apache_beam/typehints/arrow_batching_microbenchmark.py @@ -0,0 +1,78 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""A microbenchmark for pyarrow batch creation. + +This microbenchmark exercises the PyarrowBatchConverter.produce_batch method +for different batch sizes. 
+""" + +import argparse +import logging + +import pyarrow as pa + +from apache_beam.portability.api import schema_pb2 +from apache_beam.tools import utils +from apache_beam.typehints.arrow_type_compatibility import PyarrowBatchConverter +from apache_beam.typehints.arrow_type_compatibility import beam_schema_from_arrow_schema +from apache_beam.typehints.schemas import typing_from_runner_api + + +def benchmark_produce_batch(size): + batch = pa.Table.from_pydict({ + 'foo': pa.array(range(size), type=pa.int64()), + 'bar': pa.array([i / size for i in range(size)], type=pa.float64()), + 'baz': pa.array([str(i) for i in range(size)], type=pa.string()), + }) + beam_schema = beam_schema_from_arrow_schema(batch.schema) + element_type = typing_from_runner_api( + schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + + batch_converter = PyarrowBatchConverter.from_typehints(element_type, pa.Table) + elements = list(batch_converter.explode_batch(batch)) + + def _do_benchmark(): + _ = batch_converter.produce_batch(elements) + + return _do_benchmark + + +def run_benchmark( + starting_point=1, num_runs=10, num_elements_step=300, verbose=True): + suite = [ + utils.LinearRegressionBenchmarkConfig( + benchmark_produce_batch, starting_point, num_elements_step, num_runs) + ] + return utils.run_benchmarks(suite, verbose=verbose) + + +if __name__ == '__main__': + logging.basicConfig() + + parser = argparse.ArgumentParser() + parser.add_argument('--num_runs', default=10, type=int) + parser.add_argument('--starting_point', default=50, type=int) + parser.add_argument('--increment', default=1000, type=int) + parser.add_argument('--verbose', default=True, type=bool) + options = parser.parse_args() + + run_benchmark( + options.starting_point, + options.num_runs, + options.increment, + options.verbose) diff --git a/sdks/python/apache_beam/typehints/arrow_type_compatibility.py b/sdks/python/apache_beam/typehints/arrow_type_compatibility.py new file mode 100644 index 
000000000000..cad6ac8751ca --- /dev/null +++ b/sdks/python/apache_beam/typehints/arrow_type_compatibility.py @@ -0,0 +1,384 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Utilities for converting between Beam and Arrow schemas. + +For internal use only, no backward compatibility guarantees. +""" + +from functools import partial +from typing import Dict +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple + +import pyarrow as pa + +from apache_beam.portability.api import schema_pb2 +from apache_beam.typehints.batch import BatchConverter +from apache_beam.typehints.row_type import RowTypeConstraint +from apache_beam.typehints.schemas import typing_from_runner_api +from apache_beam.typehints.schemas import typing_to_runner_api +from apache_beam.utils import proto_utils + +__all__ = [] + +# Get major, minor version +PYARROW_VERSION = tuple(map(int, pa.__version__.split('.')[0:2])) + +BEAM_SCHEMA_ID_KEY = b'beam:schema_id' +# We distinguish between schema and field options, because they have to be +# combined into arrow Field-level metadata for nested structs. 
+BEAM_SCHEMA_OPTION_KEY_PREFIX = b'beam:schema_option:' +BEAM_FIELD_OPTION_KEY_PREFIX = b'beam:field_option:' + + +def _hydrate_beam_option(encoded_option: bytes) -> schema_pb2.Option: + return proto_utils.parse_Bytes(encoded_option, schema_pb2.Option) + + +def beam_schema_from_arrow_schema(arrow_schema: pa.Schema) -> schema_pb2.Schema: + if arrow_schema.metadata: + schema_id = arrow_schema.metadata.get(BEAM_SCHEMA_ID_KEY, None) + schema_options = [ + _hydrate_beam_option(value) for key, + value in arrow_schema.metadata.items() + if key.startswith(BEAM_SCHEMA_OPTION_KEY_PREFIX) + ] + else: + schema_id = None + schema_options = [] + + return schema_pb2.Schema( + fields=[ + _beam_field_from_arrow_field(arrow_schema.field(i)) + for i in range(len(arrow_schema.types)) + ], + options=schema_options, + id=schema_id) + + +def _beam_field_from_arrow_field(arrow_field: pa.Field) -> schema_pb2.Field: + beam_fieldtype = _beam_fieldtype_from_arrow_field(arrow_field) + + if arrow_field.metadata: + field_options = [ + _hydrate_beam_option(value) for key, + value in arrow_field.metadata.items() + if key.startswith(BEAM_FIELD_OPTION_KEY_PREFIX) + ] + if isinstance(arrow_field.type, pa.StructType): + beam_fieldtype.row_type.schema.options.extend([ + _hydrate_beam_option(value) for key, + value in arrow_field.metadata.items() + if key.startswith(BEAM_SCHEMA_OPTION_KEY_PREFIX) + ]) + if BEAM_SCHEMA_ID_KEY in arrow_field.metadata: + beam_fieldtype.row_type.schema.id = arrow_field.metadata[ + BEAM_SCHEMA_ID_KEY] + + else: + field_options = None + + return schema_pb2.Field( + name=arrow_field.name, + type=beam_fieldtype, + options=field_options, + ) + + +def _beam_fieldtype_from_arrow_field( + arrow_field: pa.Field) -> schema_pb2.FieldType: + beam_fieldtype = _beam_fieldtype_from_arrow_type(arrow_field.type) + beam_fieldtype.nullable = arrow_field.nullable + + return beam_fieldtype + + +def _beam_fieldtype_from_arrow_type( + arrow_type: pa.DataType) -> schema_pb2.FieldType: + if 
arrow_type in PYARROW_TO_ATOMIC_TYPE: + return schema_pb2.FieldType(atomic_type=PYARROW_TO_ATOMIC_TYPE[arrow_type]) + elif isinstance(arrow_type, pa.ListType): + return schema_pb2.FieldType( + array_type=schema_pb2.ArrayType( + element_type=_beam_fieldtype_from_arrow_field( + arrow_type.value_field))) + elif isinstance(arrow_type, pa.MapType): + return schema_pb2.FieldType(map_type=_arrow_map_to_beam_map(arrow_type)) + elif isinstance(arrow_type, pa.StructType): + return schema_pb2.FieldType( + row_type=schema_pb2.RowType( + schema=schema_pb2.Schema( + fields=[ + _beam_field_from_arrow_field(arrow_type[i]) + for i in range(len(arrow_type)) + ], + ))) + + else: + raise ValueError(f"Unrecognized arrow type: {arrow_type!r}") + + +def _option_as_arrow_metadata(beam_option: schema_pb2.Option, *, + prefix: bytes) -> Tuple[bytes, bytes]: + return ( + prefix + beam_option.name.encode('UTF-8'), + beam_option.SerializeToString()) + + +_field_option_as_arrow_metadata = partial( + _option_as_arrow_metadata, prefix=BEAM_FIELD_OPTION_KEY_PREFIX) +_schema_option_as_arrow_metadata = partial( + _option_as_arrow_metadata, prefix=BEAM_SCHEMA_OPTION_KEY_PREFIX) + + +def arrow_schema_from_beam_schema(beam_schema: schema_pb2.Schema) -> pa.Schema: + return pa.schema( + [_arrow_field_from_beam_field(field) for field in beam_schema.fields], + { + BEAM_SCHEMA_ID_KEY: beam_schema.id, + **dict( + _schema_option_as_arrow_metadata(option) for option in beam_schema.options) # pylint: disable=line-too-long + }, + ) + + +def _arrow_field_from_beam_field(beam_field: schema_pb2.Field) -> pa.Field: + return _arrow_field_from_beam_fieldtype( + beam_field.type, name=beam_field.name, field_options=beam_field.options) + + +_ARROW_PRIMITIVE_MAPPING = [ + # TODO(https://github.com/apache/beam/issues/23816): Support unsigned ints + # and float16 + (schema_pb2.BYTE, pa.int8()), + (schema_pb2.INT16, pa.int16()), + (schema_pb2.INT32, pa.int32()), + (schema_pb2.INT64, pa.int64()), + (schema_pb2.FLOAT, 
pa.float32()), + (schema_pb2.DOUBLE, pa.float64()), + (schema_pb2.BOOLEAN, pa.bool_()), + (schema_pb2.STRING, pa.string()), + (schema_pb2.BYTES, pa.binary()), +] +ATOMIC_TYPE_TO_PYARROW = { + beam: arrow + for beam, arrow in _ARROW_PRIMITIVE_MAPPING +} +PYARROW_TO_ATOMIC_TYPE = { + arrow: beam + for beam, arrow in _ARROW_PRIMITIVE_MAPPING +} + + +def _arrow_field_from_beam_fieldtype( + beam_fieldtype: schema_pb2.FieldType, + name=b'', + field_options: Sequence[schema_pb2.Option] = None) -> pa.DataType: + arrow_type = _arrow_type_from_beam_fieldtype(beam_fieldtype) + if field_options is not None: + metadata = dict( + _field_option_as_arrow_metadata(field_option) + for field_option in field_options) + else: + metadata = {} + + type_info = beam_fieldtype.WhichOneof("type_info") + if type_info == "row_type": + schema = beam_fieldtype.row_type.schema + metadata.update( + dict( + _schema_option_as_arrow_metadata(schema_option) + for schema_option in schema.options)) + if schema.id: + metadata[BEAM_SCHEMA_ID_KEY] = schema.id + + return pa.field( + name=name, + type=arrow_type, + nullable=beam_fieldtype.nullable, + metadata=metadata, + ) + + +if PYARROW_VERSION < (6, 0): + # In pyarrow < 6.0.0 we cannot construct a MapType object from Field + # instances, pa.map_ will only accept DataType instances. This makes it + # impossible to propagate nullability. + # + # Note this was changed in: + # https://github.com/apache/arrow/commit/64bef2ad8d9cd2fea122cfa079f8ca3fea8cdf5d + # + # Here we define a custom arrow map conversion function to handle these cases + # and error as appropriate. + + def _make_arrow_map(beam_map_type: schema_pb2.MapType): + if beam_map_type.key_type.nullable: + raise TypeError('Arrow map key field cannot be nullable') + elif beam_map_type.value_type.nullable: + raise TypeError( + "pyarrow<6 does not support creating maps with nullable " + "values. 
Please use pyarrow>=6.0.0") + + return pa.map_( + _arrow_type_from_beam_fieldtype(beam_map_type.key_type), + _arrow_type_from_beam_fieldtype(beam_map_type.value_type)) + + def _arrow_map_to_beam_map(arrow_map_type): + return schema_pb2.MapType( + key_type=_beam_fieldtype_from_arrow_type(arrow_map_type.key_type), + value_type=_beam_fieldtype_from_arrow_type(arrow_map_type.item_type)) + +else: + + def _make_arrow_map(beam_map_type: schema_pb2.MapType): + return pa.map_( + _arrow_field_from_beam_fieldtype(beam_map_type.key_type), + _arrow_field_from_beam_fieldtype(beam_map_type.value_type)) + + def _arrow_map_to_beam_map(arrow_map_type): + return schema_pb2.MapType( + key_type=_beam_fieldtype_from_arrow_field(arrow_map_type.key_field), + value_type=_beam_fieldtype_from_arrow_field(arrow_map_type.item_field)) + + +def _arrow_type_from_beam_fieldtype( + beam_fieldtype: schema_pb2.FieldType, +) -> Tuple[pa.DataType, Optional[Dict[bytes, bytes]]]: + # Note this function is not concerned with beam_fieldtype.nullable, as + # nullability is a property of the Field in Arrow. + type_info = beam_fieldtype.WhichOneof("type_info") + if type_info == 'atomic_type': + try: + output_arrow_type = ATOMIC_TYPE_TO_PYARROW[beam_fieldtype.atomic_type] + except KeyError: + raise ValueError( + "Unsupported atomic type: {0}".format(beam_fieldtype.atomic_type)) + elif type_info == "array_type": + output_arrow_type = pa.list_( + _arrow_field_from_beam_fieldtype( + beam_fieldtype.array_type.element_type)) + elif type_info == "map_type": + output_arrow_type = _make_arrow_map(beam_fieldtype.map_type) + elif type_info == "row_type": + schema = beam_fieldtype.row_type.schema + # Note schema id and options are handled at the arrow field level, they are + # added at field-level metadata. 
+ output_arrow_type = pa.struct( + [_arrow_field_from_beam_field(field) for field in schema.fields]) + elif type_info == "logical_type": + # TODO(https://github.com/apache/beam/issues/23817): Add support for logical + # types. + raise NotImplementedError( + "Beam logical types are not currently supported " + "in arrow_type_compatibility.") + else: + raise ValueError(f"Unrecognized type_info: {type_info!r}") + + return output_arrow_type + + +class PyarrowBatchConverter(BatchConverter): + def __init__(self, element_type: RowTypeConstraint): + super().__init__(pa.Table, element_type) + self._beam_schema = typing_to_runner_api(element_type).row_type.schema + arrow_schema = arrow_schema_from_beam_schema(self._beam_schema) + + self._arrow_schema = arrow_schema + + @staticmethod + @BatchConverter.register + def from_typehints(element_type, + batch_type) -> Optional['PyarrowBatchConverter']: + if isinstance(element_type, RowTypeConstraint) and batch_type == pa.Table: + return PyarrowBatchConverter(element_type) + + return None + + def produce_batch(self, elements): + arrays = [ + pa.array([getattr(el, name) for el in elements], + type=self._arrow_schema.field(name).type) for name, + _ in self._element_type._fields + ] + return pa.Table.from_arrays(arrays, schema=self._arrow_schema) + + def explode_batch(self, batch: pa.Table): + """Convert an instance of B to Generator[E].""" + for row_values in zip(*batch.columns): + yield self._element_type.user_type( + **{ + name: val.as_py() + for name, + val in zip(self._arrow_schema.names, row_values) + }) + + def combine_batches(self, batches: List[pa.Table]): + return pa.concat_tables(batches) + + def get_length(self, batch: pa.Table): + return batch.num_rows + + def estimate_byte_size(self, batch: pa.Table): + return batch.nbytes + + @staticmethod + def _from_serialized_schema(serialized_schema): + beam_schema = proto_utils.parse_Bytes(serialized_schema, schema_pb2.Schema) + element_type = typing_from_runner_api( + 
schema_pb2.FieldType(row_type=schema_pb2.RowType(schema=beam_schema))) + return PyarrowBatchConverter(element_type) + + def __reduce__(self): + return self._from_serialized_schema, ( + self._beam_schema.SerializeToString(), ) + + +class PyarrowArrayBatchConverter(BatchConverter): + def __init__(self, element_type: type): + super().__init__(pa.Array, element_type) + self._element_type = element_type + beam_fieldtype = typing_to_runner_api(element_type) + self._arrow_type = _arrow_type_from_beam_fieldtype(beam_fieldtype) + + @staticmethod + @BatchConverter.register + def from_typehints(element_type, + batch_type) -> Optional['PyarrowArrayBatchConverter']: + if batch_type == pa.Array: + return PyarrowArrayBatchConverter(element_type) + + return None + + def produce_batch(self, elements): + return pa.array(list(elements), type=self._arrow_type) + + def explode_batch(self, batch: pa.Array): + """Convert an instance of B to Generator[E].""" + for val in batch: + yield val.as_py() + + def combine_batches(self, batches: List[pa.Array]): + return pa.concat_arrays(batches) + + def get_length(self, batch: pa.Array): + return batch.num_rows + + def estimate_byte_size(self, batch: pa.Array): + return batch.nbytes diff --git a/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py b/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py new file mode 100644 index 000000000000..6a8649cff1ea --- /dev/null +++ b/sdks/python/apache_beam/typehints/arrow_type_compatibility_test.py @@ -0,0 +1,197 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Tests for arrow_type_compatibility.""" + +import logging +import unittest +from typing import Optional + +import pyarrow as pa +import pytest +from parameterized import parameterized +from parameterized import parameterized_class + +from apache_beam.typehints import row_type +from apache_beam.typehints import typehints +from apache_beam.typehints.arrow_type_compatibility import arrow_schema_from_beam_schema +from apache_beam.typehints.arrow_type_compatibility import beam_schema_from_arrow_schema +from apache_beam.typehints.batch import BatchConverter +from apache_beam.typehints.batch_test import temp_seed +from apache_beam.typehints.schemas_test import get_test_beam_schemas_protos + + +@pytest.mark.uses_pyarrow +class ArrowTypeCompatibilityTest(unittest.TestCase): + @parameterized.expand([(beam_schema, ) + for beam_schema in get_test_beam_schemas_protos()]) + def test_beam_schema_survives_roundtrip(self, beam_schema): + roundtripped = beam_schema_from_arrow_schema( + arrow_schema_from_beam_schema(beam_schema)) + + self.assertEqual(beam_schema, roundtripped) + +@parameterized_class([ + { + 'batch_typehint': pa.Table, + 'element_typehint': row_type.RowTypeConstraint.from_fields([ + ('foo', Optional[int]), + ('bar', Optional[float]), + ('baz', Optional[str]), + ]), + 'batch': pa.Table.from_pydict({ + 'foo': pa.array(range(100), type=pa.int64()), + 'bar': pa.array([i / 100 for i in range(100)], type=pa.float64()), + 'baz': pa.array([str(i) for i in range(100)], type=pa.string()), + }), + }, + { + 'batch_typehint': pa.Table, + 'element_typehint': 
row_type.RowTypeConstraint.from_fields([ + ('foo', Optional[int]), + ( + 'nested', + Optional[row_type.RowTypeConstraint.from_fields([ + ("bar", Optional[float]), # noqa: F821 + ("baz", Optional[str]), # noqa: F821 + ])]), + ]), + 'batch': pa.Table.from_pydict({ + 'foo': pa.array(range(100), type=pa.int64()), + 'nested': pa.array([ + None if i % 11 else { + 'bar': i / 100, 'baz': str(i) + } for i in range(100) + ]), + }), + }, + { + 'batch_typehint': pa.Array, + 'element_typehint': int, + 'batch': pa.array(range(100), type=pa.int64()), + }, + { + 'batch_typehint': pa.Array, + 'element_typehint': row_type.RowTypeConstraint.from_fields([ + ("bar", Optional[float]), # noqa: F821 + ("baz", Optional[str]), # noqa: F821 + ]), + 'batch': pa.array([ + { + 'bar': i / 100, 'baz': str(i) + } if i % 7 else None for i in range(100) + ]), + } +]) +@pytest.mark.uses_pyarrow +class ArrowBatchConverterTest(unittest.TestCase): + def create_batch_converter(self): + return BatchConverter.from_typehints( + element_type=self.element_typehint, batch_type=self.batch_typehint) + + def setUp(self): + self.converter = self.create_batch_converter() + self.normalized_batch_typehint = typehints.normalize(self.batch_typehint) + self.normalized_element_typehint = typehints.normalize( + self.element_typehint) + + def equality_check(self, left, right): + if isinstance(left, pa.Array): + self.assertTrue(left.equals(right)) + else: + self.assertEqual(left, right) + + def test_typehint_validates(self): + typehints.validate_composite_type_param(self.batch_typehint, '') + typehints.validate_composite_type_param(self.element_typehint, '') + + def test_type_check(self): + typehints.check_constraint(self.normalized_batch_typehint, self.batch) + + def test_type_check_element(self): + for element in self.converter.explode_batch(self.batch): + typehints.check_constraint(self.normalized_element_typehint, element) + + def test_explode_rebatch(self): + exploded = list(self.converter.explode_batch(self.batch)) + 
rebatched = self.converter.produce_batch(exploded) + + typehints.check_constraint(self.normalized_batch_typehint, rebatched) + self.equality_check(self.batch, rebatched) + + def test_estimate_byte_size_implemented(self): + # Just verify that we can call byte size + self.assertGreater(self.converter.estimate_byte_size(self.batch), 0) + + @parameterized.expand([ + (2, ), + (3, ), + (10, ), + ]) + def test_estimate_byte_size_partitions(self, N): + elements = list(self.converter.explode_batch(self.batch)) + + # Split elements into N contiguous partitions, create a batch out of each + batches = [ + self.converter.produce_batch( + elements[len(elements) * i // N:len(elements) * (i + 1) // N]) + for i in range(N) + ] + + # Some estimate_byte_size implementations use random samples, + # set a seed temporarily to make this test deterministic + with temp_seed(12345): + partitioned_size_estimate = sum( + self.converter.estimate_byte_size(batch) for batch in batches) + size_estimate = self.converter.estimate_byte_size(self.batch) + + # Assert that size estimate for partitions is within 10% of size estimate + # for the whole partition. 
+ self.assertLessEqual( + abs(partitioned_size_estimate / size_estimate - 1), 0.1) + + @parameterized.expand([ + (2, ), + (3, ), + (10, ), + ]) + def test_combine_batches(self, N): + elements = list(self.converter.explode_batch(self.batch)) + + # Split elements into N contiguous partitions, create a batch out of each + batches = [ + self.converter.produce_batch( + elements[len(elements) * i // N:len(elements) * (i + 1) // N]) + for i in range(N) + ] + + # Combine the batches, output should be equivalent to the original batch + combined = self.converter.combine_batches(batches) + + self.equality_check(self.batch, combined) + + def test_equals(self): + self.assertTrue(self.converter == self.create_batch_converter()) + self.assertTrue(self.create_batch_converter() == self.converter) + + def test_hash(self): + self.assertEqual(hash(self.create_batch_converter()), hash(self.converter)) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + unittest.main() diff --git a/sdks/python/apache_beam/typehints/batch.py b/sdks/python/apache_beam/typehints/batch.py index 322ba717a05c..de6c7fb71572 100644 --- a/sdks/python/apache_beam/typehints/batch.py +++ b/sdks/python/apache_beam/typehints/batch.py @@ -39,6 +39,8 @@ from apache_beam import coders from apache_beam.typehints import typehints +__all__ = ['BatchConverter'] + B = TypeVar('B') E = TypeVar('E') diff --git a/sdks/python/apache_beam/typehints/schemas.py b/sdks/python/apache_beam/typehints/schemas.py index 4371ca2de0da..55335bbd6b6e 100644 --- a/sdks/python/apache_beam/typehints/schemas.py +++ b/sdks/python/apache_beam/typehints/schemas.py @@ -34,13 +34,23 @@ bytes <-----> BYTES ByteString ------> BYTES Timestamp <-----> LogicalType(urn="beam:logical_type:micros_instant:v1") - Timestamp <------ LogicalType(urn="beam:logical_type:millis_instant:v1") Decimal <-----> LogicalType(urn="beam:logical_type:fixed_decimal:v1") Mapping <-----> MapType Sequence <-----> ArrayType NamedTuple <-----> RowType 
beam.Row ------> RowType +One direction mapping of Python types from Beam portable schemas: + + bytes + <------ LogicalType(urn="beam:logical_type:fixed_bytes:v1") + <------ LogicalType(urn="beam:logical_type:var_bytes:v1") + str + <------ LogicalType(urn="beam:logical_type:fixed_char:v1") + <------ LogicalType(urn="beam:logical_type:var_char:v1") + Timestamp + <------ LogicalType(urn="beam:logical_type:millis_instant:v1") + Note that some of these mappings are provided as conveniences, but they are lossy and will not survive a roundtrip from python to Beam schemas and back. For example, the Python type :code:`int` will map to :code:`INT64` in @@ -57,6 +67,7 @@ # pytype: skip-file import decimal +import logging from typing import Any from typing import ByteString from typing import Dict @@ -116,6 +127,8 @@ float: schema_pb2.DOUBLE, }) +_LOGGER = logging.getLogger(__name__) + def named_fields_to_schema( names_and_types: Union[Dict[str, type], Sequence[Tuple[str, type]]], @@ -171,6 +184,25 @@ def typing_from_runner_api( schema_registry=schema_registry).typing_from_runner_api(fieldtype_proto) +def value_to_runner_api( + type_proto: schema_pb2.FieldType, + value, + schema_registry: SchemaTypeRegistry = SCHEMA_REGISTRY +) -> schema_pb2.FieldValue: + return SchemaTranslation(schema_registry=schema_registry).value_to_runner_api( + type_proto, value) + + +def value_from_runner_api( + type_proto: schema_pb2.FieldType, + value_proto: schema_pb2.FieldValue, + schema_registry: SchemaTypeRegistry = SCHEMA_REGISTRY +) -> schema_pb2.FieldValue: + return SchemaTranslation( + schema_registry=schema_registry).value_from_runner_api( + type_proto, value_proto) + + def option_to_runner_api( option: Tuple[str, Any], schema_registry: SchemaTypeRegistry = SCHEMA_REGISTRY) -> schema_pb2.Option: @@ -269,59 +301,114 @@ def typing_to_runner_api(self, type_: type) -> schema_pb2.FieldType: logical_type=schema_pb2.LogicalType(urn=PYTHON_ANY_URN), nullable=True) else: - if 
logical_type.argument_type() is None: - return schema_pb2.FieldType( - logical_type=schema_pb2.LogicalType( - urn=logical_type.urn(), - representation=self.typing_to_runner_api( - logical_type.representation_type()))) - else: - # TODO(https://github.com/apache/beam/issues/23373): Complete support - # for logical types that require arguments. - # This include implement SchemaTranslation.value_to_runner_api (see Java - # SDK's SchemaTranslation.fieldValueToProto) - return schema_pb2.FieldType( - logical_type=schema_pb2.LogicalType( - urn=logical_type.urn(), - representation=self.typing_to_runner_api( - logical_type.representation_type()), - argument_type=self.typing_to_runner_api( - logical_type.argument_type()))) + argument_type = None + argument = None + if logical_type.argument_type() is not None: + argument_type = self.typing_to_runner_api(logical_type.argument_type()) + try: + argument = self.value_to_runner_api( + argument_type, logical_type.argument()) + except ValueError: + # TODO(https://github.com/apache/beam/issues/23373): Complete support + # for logical types that require arguments beyond atomic type. + # For now, skip arguments. 
+ argument = None + return schema_pb2.FieldType( + logical_type=schema_pb2.LogicalType( + urn=logical_type.urn(), + representation=self.typing_to_runner_api( + logical_type.representation_type()), + argument_type=argument_type, + argument=argument)) + + def atomic_value_from_runner_api( + self, + atomic_type: schema_pb2.AtomicType, + atomic_value: schema_pb2.AtomicTypeValue): + if atomic_type == schema_pb2.BYTE: + value = np.int8(atomic_value.byte) + elif atomic_type == schema_pb2.INT16: + value = np.int16(atomic_value.int16) + elif atomic_type == schema_pb2.INT32: + value = np.int32(atomic_value.int32) + elif atomic_type == schema_pb2.INT64: + value = np.int64(atomic_value.int64) + elif atomic_type == schema_pb2.FLOAT: + value = np.float32(atomic_value.float) + elif atomic_type == schema_pb2.DOUBLE: + value = np.float64(atomic_value.double) + elif atomic_type == schema_pb2.STRING: + value = atomic_value.string + elif atomic_type == schema_pb2.BOOLEAN: + value = atomic_value.boolean + elif atomic_type == schema_pb2.BYTES: + value = atomic_value.bytes + else: + raise ValueError( + f"Unrecognized atomic_type ({atomic_type}) " + f"when decoding value {atomic_value!r}") + + return value + + def atomic_value_to_runner_api( + self, atomic_type: schema_pb2.AtomicType, + value) -> schema_pb2.AtomicTypeValue: + if atomic_type == schema_pb2.BYTE: + atomic_value = schema_pb2.AtomicTypeValue(byte=value) + elif atomic_type == schema_pb2.INT16: + atomic_value = schema_pb2.AtomicTypeValue(int16=value) + elif atomic_type == schema_pb2.INT32: + atomic_value = schema_pb2.AtomicTypeValue(int32=value) + elif atomic_type == schema_pb2.INT64: + atomic_value = schema_pb2.AtomicTypeValue(int64=value) + elif atomic_type == schema_pb2.FLOAT: + atomic_value = schema_pb2.AtomicTypeValue(float=value) + elif atomic_type == schema_pb2.DOUBLE: + atomic_value = schema_pb2.AtomicTypeValue(double=value) + elif atomic_type == schema_pb2.STRING: + atomic_value = 
schema_pb2.AtomicTypeValue(string=value) + elif atomic_type == schema_pb2.BOOLEAN: + atomic_value = schema_pb2.AtomicTypeValue(boolean=value) + elif atomic_type == schema_pb2.BYTES: + atomic_value = schema_pb2.AtomicTypeValue(bytes=value) + else: + raise ValueError( + "Unrecognized atomic_type {atomic_type} when encoding value {value}") - def option_from_runner_api( - self, option_proto: schema_pb2.Option) -> Tuple[str, Any]: - if not option_proto.HasField('type'): - return option_proto.name, None + return atomic_value - fieldtype_proto = option_proto.type - if fieldtype_proto.WhichOneof("type_info") != "atomic_type": + def value_from_runner_api( + self, + type_proto: schema_pb2.FieldType, + value_proto: schema_pb2.FieldValue): + if type_proto.WhichOneof("type_info") != "atomic_type": + # TODO: Allow other value types raise ValueError( "Encounterd option with unsupported type. Only " - f"atomic_type options are supported: {option_proto}") - - if fieldtype_proto.atomic_type == schema_pb2.BYTE: - value = np.int8(option_proto.value.atomic_value.byte) - elif fieldtype_proto.atomic_type == schema_pb2.INT16: - value = np.int16(option_proto.value.atomic_value.int16) - elif fieldtype_proto.atomic_type == schema_pb2.INT32: - value = np.int32(option_proto.value.atomic_value.int32) - elif fieldtype_proto.atomic_type == schema_pb2.INT64: - value = np.int64(option_proto.value.atomic_value.int64) - elif fieldtype_proto.atomic_type == schema_pb2.FLOAT: - value = np.float32(option_proto.value.atomic_value.float) - elif fieldtype_proto.atomic_type == schema_pb2.DOUBLE: - value = np.float64(option_proto.value.atomic_value.double) - elif fieldtype_proto.atomic_type == schema_pb2.STRING: - value = option_proto.value.atomic_value.string - elif fieldtype_proto.atomic_type == schema_pb2.BOOLEAN: - value = option_proto.value.atomic_value.boolean - elif fieldtype_proto.atomic_type == schema_pb2.BYTES: - value = option_proto.value.atomic_value.bytes - else: + f"atomic_type options are 
supported: {type_proto}") + + value = self.atomic_value_from_runner_api( + type_proto.atomic_type, value_proto.atomic_value) + return value + + def value_to_runner_api(self, typing_proto: schema_pb2.FieldType, value): + if typing_proto.WhichOneof("type_info") != "atomic_type": + # TODO: Allow other value types raise ValueError( - f"Unrecognized atomic_type ({fieldtype_proto.atomic_type}) " - f"when decoding option {option_proto!r}") + "Only atomic_type option values are currently supported in Python. " + f"Got {value!r}, which maps to fieldtype {typing_proto!r}.") + + atomic_value = self.atomic_value_to_runner_api( + typing_proto.atomic_type, value) + value_proto = schema_pb2.FieldValue(atomic_value=atomic_value) + return value_proto + + def option_from_runner_api( + self, option_proto: schema_pb2.Option) -> Tuple[str, Any]: + if not option_proto.HasField('type'): + return option_proto.name, None + value = self.value_from_runner_api(option_proto.type, option_proto.value) return option_proto.name, value def option_to_runner_api(self, option: Tuple[str, Any]) -> schema_pb2.Option: @@ -332,41 +419,9 @@ def option_to_runner_api(self, option: Tuple[str, Any]) -> schema_pb2.Option: # Don't set type, value return schema_pb2.Option(name=name) - fieldtype_proto = self.typing_to_runner_api(type(value)) - if fieldtype_proto.WhichOneof("type_info") != "atomic_type": - # TODO: Allow other value types - raise ValueError( - "Only atomic_type option values are currently supported in Python. 
" - f"Got {value!r}, which maps to fieldtype {fieldtype_proto!r}.") - - if fieldtype_proto.atomic_type == schema_pb2.BYTE: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(byte=value) - elif fieldtype_proto.atomic_type == schema_pb2.INT16: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(int16=value) - elif fieldtype_proto.atomic_type == schema_pb2.INT32: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(int32=value) - elif fieldtype_proto.atomic_type == schema_pb2.INT64: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(int64=value) - elif fieldtype_proto.atomic_type == schema_pb2.FLOAT: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(float=value) - elif fieldtype_proto.atomic_type == schema_pb2.DOUBLE: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(double=value) - elif fieldtype_proto.atomic_type == schema_pb2.STRING: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(string=value) - elif fieldtype_proto.atomic_type == schema_pb2.BOOLEAN: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(boolean=value) - elif fieldtype_proto.atomic_type == schema_pb2.BYTES: - atomictypevalue_proto = schema_pb2.AtomicTypeValue(bytes=value) - else: - raise ValueError( - "Unrecognized atomic_type in fieldtype_proto=" - f"{fieldtype_proto!r} when encoding option {option!r}") - - return schema_pb2.Option( - name=name, - type=fieldtype_proto, - value=schema_pb2.FieldValue(atomic_value=atomictypevalue_proto), - ) + type_proto = self.typing_to_runner_api(type(value)) + value_proto = self.value_to_runner_api(type_proto, value) + return schema_pb2.Option(name=name, type=type_proto, value=value_proto) def typing_from_runner_api( self, fieldtype_proto: schema_pb2.FieldType) -> type: @@ -463,12 +518,10 @@ def named_tuple_from_schema(self, schema: schema_pb2.Schema) -> type: # Define a reduce function, otherwise these types can't be pickled # (See BEAM-9574) - def __reduce__(self): - return ( - _hydrate_namedtuple_instance, - (schema.SerializeToString(), 
tuple(self))) - - setattr(user_type, '__reduce__', __reduce__) + setattr( + user_type, + '__reduce__', + _named_tuple_reduce_method(schema.SerializeToString())) self.schema_registry.add(user_type, schema) coders.registry.register_coder(user_type, coders.RowCoder) @@ -476,6 +529,13 @@ def __reduce__(self): return user_type +def _named_tuple_reduce_method(serialized_schema): + def __reduce__(self): + return _hydrate_namedtuple_instance, (serialized_schema, tuple(self)) + + return __reduce__ + + def _hydrate_namedtuple_instance(encoded_schema, values): return named_tuple_from_schema( proto_utils.parse_Bytes(encoded_schema, schema_pb2.Schema))(*values) @@ -643,8 +703,24 @@ def from_runner_api(cls, logical_type_proto): if logical_type is None: raise ValueError( "No logical type registered for URN '%s'" % logical_type_proto.urn) - # TODO(bhulette): Use argument - return logical_type() + if not logical_type_proto.HasField( + "argument_type") or not logical_type_proto.HasField("argument"): + # logical type_proto without argument + return logical_type() + else: + try: + argument = value_from_runner_api( + logical_type_proto.argument_type, logical_type_proto.argument) + except ValueError: + # TODO(https://github.com/apache/beam/issues/23373): Complete support + # for logical types that require arguments beyond atomic type. + # For now, skip arguments. + _LOGGER.warning( + 'Logical type %s with argument is currently unsupported. ' + 'Argument values are omitted', + logical_type_proto.urn) + return logical_type() + return logical_type(argument) class NoArgumentLogicalType(LogicalType[LanguageT, RepresentationT, None]): @@ -666,6 +742,28 @@ def _from_typing(cls, typ): return cls() +class PassThroughLogicalType(LogicalType[LanguageT, LanguageT, ArgT]): + """A base class for LogicalTypes that use the same type as the underlying + representation type. 
+ """ + def to_language_type(self, value): + return value + + @classmethod + def representation_type(cls): + # type: () -> type + return cls.language_type() + + def to_representation_type(self, value): + return value + + @classmethod + def _from_typing(cls, typ): + # type: (type) -> LogicalType + # TODO(https://github.com/apache/beam/issues/23373): enable argument + return cls() + + MicrosInstantRepresentation = NamedTuple( 'MicrosInstantRepresentation', [('seconds', np.int64), ('micros', np.int64)]) @@ -851,3 +949,129 @@ def _from_typing(cls, typ): # TODO(yathu,BEAM-10722): Investigate and resolve conflicts in logical type # registration when more than one logical types sharing the same language type LogicalType.register_logical_type(DecimalLogicalType) + + +@LogicalType.register_logical_type +class FixedBytes(PassThroughLogicalType[bytes, np.int32]): + """A logical type for fixed-length bytes.""" + @classmethod + def urn(cls): + return common_urns.fixed_bytes.urn + + def __init__(self, length: np.int32): + self.length = length + + @classmethod + def language_type(cls) -> type: + return bytes + + def to_language_type(self, value: bytes): + length = len(value) + if length > self.length: + raise ValueError( + "value length {} > allowed length {}".format(length, self.length)) + elif length < self.length: + # padding at the end + value = value + b'\0' * (self.length - length) + + return value + + @classmethod + def argument_type(cls): + return np.int32 + + def argument(self): + return self.length + + +@LogicalType.register_logical_type +class VariableBytes(PassThroughLogicalType[bytes, np.int32]): + """A logical type for variable-length bytes with specified maximum length.""" + @classmethod + def urn(cls): + return common_urns.var_bytes.urn + + def __init__(self, max_length: np.int32 = np.iinfo(np.int32).max): + self.max_length = max_length + + @classmethod + def language_type(cls) -> type: + return bytes + + def to_language_type(self, value: bytes): + length = 
len(value) + if length > self.max_length: + raise ValueError( + "value length {} > allowed length {}".format(length, self.max_length)) + + return value + + @classmethod + def argument_type(cls): + return np.int32 + + def argument(self): + return self.max_length + + +@LogicalType.register_logical_type +class FixedString(PassThroughLogicalType[str, np.int32]): + """A logical type for fixed-length string.""" + @classmethod + def urn(cls): + return common_urns.fixed_char.urn + + def __init__(self, length: np.int32): + self.length = length + + @classmethod + def language_type(cls) -> type: + return str + + def to_language_type(self, value: str): + length = len(value) + if length > self.length: + raise ValueError( + "value length {} > allowed length {}".format(length, self.length)) + elif length < self.length: + # padding at the end + value = value + ' ' * (self.length - length) + + return value + + @classmethod + def argument_type(cls): + return np.int32 + + def argument(self): + return self.length + + +@LogicalType.register_logical_type +class VariableString(PassThroughLogicalType[str, np.int32]): + """A logical type for variable-length string with specified maximum length.""" + @classmethod + def urn(cls): + return common_urns.var_char.urn + + def __init__(self, max_length: np.int32 = np.iinfo(np.int32).max): + self.max_length = max_length + + @classmethod + def language_type(cls) -> type: + return str + + def to_language_type(self, value: str): + length = len(value) + if length > self.max_length: + raise ValueError( + "value length {} > allowed length {}".format(length, self.max_length)) + + return value + + @classmethod + def argument_type(cls): + return np.int32 + + def argument(self): + return self.max_length diff --git a/sdks/python/apache_beam/typehints/schemas_test.py b/sdks/python/apache_beam/typehints/schemas_test.py index 5d9434345cbc..de2ed829ab36 100644 --- a/sdks/python/apache_beam/typehints/schemas_test.py +++ 
b/sdks/python/apache_beam/typehints/schemas_test.py @@ -266,6 +266,39 @@ def get_test_beam_fieldtype_protos(): string='str'))), ]) for i, typ in enumerate(all_primitives) + ] + [ + schema_pb2.Field( + name='nested', + type=schema_pb2.FieldType( + row_type=schema_pb2.RowType( + schema=schema_pb2.Schema( + fields=[ + schema_pb2.Field( + name='nested_field', + type=schema_pb2.FieldType( + atomic_type=schema_pb2.INT64, + ), + options=[ + schema_pb2.Option( + name='a_nested_field_flag' + ), + ]), + ], + options=[ + schema_pb2.Option( + name='a_nested_schema_flag'), + schema_pb2.Option( + name='a_str', + type=schema_pb2.FieldType( + atomic_type=schema_pb2.STRING + ), + value=schema_pb2.FieldValue( + atomic_value=schema_pb2. + AtomicTypeValue( + string='str'))), + ], + ))), + ), ]))), schema_pb2.FieldType( row_type=schema_pb2.RowType( @@ -303,7 +336,7 @@ def get_test_beam_fieldtype_protos(): atomic_type=schema_pb2 .DOUBLE)))), ])))) - ]))) + ]))), ] return all_primitives + \ @@ -633,8 +666,10 @@ def test_generated_class_pickle_instance(self): self.assertEqual(instance, self.pickler.loads(self.pickler.dumps(instance))) - @unittest.skip("https://github.com/apache/beam/issues/22714") def test_generated_class_pickle(self): + if self.pickler in [pickle, dill]: + self.skipTest('https://github.com/apache/beam/issues/22714') + schema = schema_pb2.Schema( id="some-uuid", fields=[ diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index aeb8d0d990ba..651a332f909f 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -66,7 +66,7 @@ google-cloud-pubsublite==1.5.0 google-cloud-recommendations-ai==0.7.1 google-cloud-spanner==3.22.2 google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 +google-cloud-vision==3.1.4 google-crc32c==1.5.0 google-pasta==0.2.0 google-resumable-media==2.4.0 @@ -102,7 +102,7 @@ overrides==6.5.0 
packaging==21.3 pandas==1.4.4 parameterized==0.8.1 -pbr==5.10.0 +pbr==5.11.0 pluggy==1.0.0 proto-plus==1.22.1 protobuf==3.19.6 @@ -131,7 +131,7 @@ requests-mock==1.10.0 requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.1.2 -scipy==1.9.2 +scipy==1.9.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.3.2.post1 diff --git a/sdks/python/container/py37/base_image_requirements.txt b/sdks/python/container/py37/base_image_requirements.txt index 53041ca821f0..2dbf7d1827db 100644 --- a/sdks/python/container/py37/base_image_requirements.txt +++ b/sdks/python/container/py37/base_image_requirements.txt @@ -70,7 +70,7 @@ google-cloud-recommendations-ai==0.7.1 google-cloud-spanner==3.22.2 google-cloud-storage==2.5.0 google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 +google-cloud-vision==3.1.4 google-crc32c==1.5.0 google-pasta==0.2.0 google-python-cloud-debugger==3.1 @@ -109,7 +109,7 @@ overrides==6.5.0 packaging==21.3 pandas==1.3.5 parameterized==0.8.1 -pbr==5.10.0 +pbr==5.11.0 pluggy==1.0.0 proto-plus==1.22.1 protobuf==3.19.6 diff --git a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index e5f7fec58485..ca45ef30f856 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -70,7 +70,7 @@ google-cloud-recommendations-ai==0.7.1 google-cloud-spanner==3.22.2 google-cloud-storage==2.5.0 google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 +google-cloud-vision==3.1.4 google-crc32c==1.5.0 google-pasta==0.2.0 google-python-cloud-debugger==3.1 @@ -109,7 +109,7 @@ overrides==6.5.0 packaging==21.3 pandas==1.4.4 parameterized==0.8.1 -pbr==5.10.0 +pbr==5.11.0 pluggy==1.0.0 proto-plus==1.22.1 protobuf==3.19.6 @@ -138,7 +138,7 @@ requests-mock==1.10.0 requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.1.2 -scipy==1.9.2 +scipy==1.9.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.3.2.post1 diff --git 
a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index bc624e7df12c..f4b9cdca16a5 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -70,7 +70,7 @@ google-cloud-recommendations-ai==0.7.1 google-cloud-spanner==3.22.2 google-cloud-storage==2.5.0 google-cloud-videointelligence==1.16.3 -google-cloud-vision==1.0.2 +google-cloud-vision==3.1.4 google-crc32c==1.5.0 google-pasta==0.2.0 google-python-cloud-debugger==3.1 @@ -109,7 +109,7 @@ overrides==6.5.0 packaging==21.3 pandas==1.4.4 parameterized==0.8.1 -pbr==5.10.0 +pbr==5.11.0 pluggy==1.0.0 proto-plus==1.22.1 protobuf==3.19.6 @@ -138,7 +138,7 @@ requests-mock==1.10.0 requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.1.2 -scipy==1.9.2 +scipy==1.9.3 six==1.16.0 sortedcontainers==2.4.0 soupsieve==2.3.2.post1 diff --git a/sdks/python/mypy.ini b/sdks/python/mypy.ini index 9309120a8cab..a628036d6682 100644 --- a/sdks/python/mypy.ini +++ b/sdks/python/mypy.ini @@ -89,6 +89,9 @@ ignore_errors = true [mypy-apache_beam.runners.direct.*] ignore_errors = true +[mypy-apache_beam.runners.dask.*] +ignore_errors = true + [mypy-apache_beam.runners.interactive.*] ignore_errors = true diff --git a/sdks/python/setup.py b/sdks/python/setup.py index c91fb2e71a85..61858fa5d978 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -314,7 +314,7 @@ def get_portability_package_data(): 'google-cloud-dlp>=3.0.0,<4', 'google-cloud-language>=1.3.0,<2', 'google-cloud-videointelligence>=1.8.0,<2', - 'google-cloud-vision>=0.38.0,<2', + 'google-cloud-vision>=2,<4', 'google-cloud-recommendations-ai>=0.1.0,<0.8.0' ], 'interactive': [ @@ -324,7 +324,7 @@ def get_portability_package_data(): 'ipython>=7,<8;python_version<="3.7"', 'ipython>=8,<9;python_version>"3.7"', 'ipykernel>=6,<7', - 'ipywidgets>=7.6.5,<8', + 'ipywidgets>=8,<9', # Skip version 6.1.13 due to # 
https://github.com/jupyter/jupyter_client/issues/637 'jupyter-client>=6.1.11,<6.1.13', @@ -350,7 +350,11 @@ def get_portability_package_data(): # This can be removed once dill is updated to version > 0.3.5.1 # Issue: https://github.com/apache/beam/issues/23566 'dataframe': ['pandas>=1.0,<1.5;python_version<"3.10"', - 'pandas>=1.4.3,<1.5;python_version>="3.10"'] + 'pandas>=1.4.3,<1.5;python_version>="3.10"'], + 'dask': [ + 'dask >= 2022.6', + 'distributed >= 2022.6', + ], }, zip_safe=False, # PyPI package information. diff --git a/sdks/python/test-suites/portable/common.gradle b/sdks/python/test-suites/portable/common.gradle index 79770e893256..0eae96c8bec9 100644 --- a/sdks/python/test-suites/portable/common.gradle +++ b/sdks/python/test-suites/portable/common.gradle @@ -172,15 +172,15 @@ task samzaValidatesRunner() { def createSparkRunnerTestTask(String workerType) { def taskName = "sparkCompatibilityMatrix${workerType}" - // `project(':runners:spark:2:job-server').shadowJar.archivePath` is not resolvable until runtime, so hard-code it here. - def jobServerJar = "${rootDir}/runners/spark/2/job-server/build/libs/beam-runners-spark-job-server-${version}.jar" + // `project(':runners:spark:3:job-server').shadowJar.archivePath` is not resolvable until runtime, so hard-code it here. 
+ def jobServerJar = "${rootDir}/runners/spark/3/job-server/build/libs/beam-runners-spark-3-job-server-${version}.jar" def options = "--spark_job_server_jar=${jobServerJar} --environment_type=${workerType}" if (workerType == 'PROCESS') { options += " --environment_options=process_command=${buildDir.absolutePath}/sdk_worker.sh" } def task = toxTask(taskName, 'spark-runner-test', options) task.configure { - dependsOn ':runners:spark:2:job-server:shadowJar' + dependsOn ':runners:spark:3:job-server:shadowJar' if (workerType == 'DOCKER') { dependsOn pythonContainerTask } else if (workerType == 'PROCESS') { @@ -208,7 +208,7 @@ project.tasks.register("preCommitPy${pythonVersionSuffix}") { project.tasks.register("postCommitPy${pythonVersionSuffix}") { dependsOn = ['setupVirtualenv', "postCommitPy${pythonVersionSuffix}IT", - ':runners:spark:2:job-server:shadowJar', + ':runners:spark:3:job-server:shadowJar', 'portableLocalRunnerJuliaSetWithSetupPy', 'portableWordCountSparkRunnerBatch', 'portableLocalRunnerTestWithRequirementsFile'] @@ -248,13 +248,13 @@ project.tasks.register("sparkExamples") { dependsOn = [ 'setupVirtualenv', 'installGcpTest', - ':runners:spark:2:job-server:shadowJar' + ':runners:spark:3:job-server:shadowJar' ] doLast { def testOpts = [ "--log-cli-level=INFO", ] - def jobServerJar = "${rootDir}/runners/spark/2/job-server/build/libs/beam-runners-spark-job-server-${version}.jar" + def jobServerJar = "${rootDir}/runners/spark/3/job-server/build/libs/beam-runners-spark-3-job-server-${version}.jar" def pipelineOpts = [ "--runner=SparkRunner", "--project=apache-beam-testing", @@ -350,6 +350,13 @@ project.tasks.register("xlangSpannerIOIT") { "--environment_type=LOOPBACK", "--temp_location=gs://temp-storage-for-end-to-end-tests/temp-it", "--flink_job_server_jar=${project(":runners:flink:${latestFlinkVersion}:job-server").shadowJar.archivePath}", + '--sdk_harness_log_level_overrides=' + + // suppress info level flink.runtime log flood + 
'{\\"org.apache.flink.runtime\\":\\"WARN\\",' + + // suppress full __metricscontainers log printed in FlinkPipelineRunner.createPortablePipelineResult + '\\"org.apache.beam.runners.flink.FlinkPipelineRunner\\":\\"WARN\\",' + + // suppress metric name collision warning logs + '\\"org.apache.flink.runtime.metrics.groups\\":\\"ERROR\\"}' ] def cmdArgs = mapToArgString([ "test_opts": testOpts, @@ -388,7 +395,7 @@ def addTestJavaJarCreator(String runner, Task jobServerJarTask) { // TODO(BEAM-11333) Update and test multiple Flink versions. addTestJavaJarCreator("FlinkRunner", tasks.getByPath(":runners:flink:${latestFlinkVersion}:job-server:shadowJar")) -addTestJavaJarCreator("SparkRunner", tasks.getByPath(":runners:spark:2:job-server:shadowJar")) +addTestJavaJarCreator("SparkRunner", tasks.getByPath(":runners:spark:3:job-server:shadowJar")) def addTestFlinkUberJar(boolean saveMainSession) { project.tasks.register("testUberJarFlinkRunner${saveMainSession ? 'SaveMainSession' : ''}") { diff --git a/sdks/python/test-suites/tox/common.gradle b/sdks/python/test-suites/tox/common.gradle index 99afc1d72557..61802ac9c45e 100644 --- a/sdks/python/test-suites/tox/common.gradle +++ b/sdks/python/test-suites/tox/common.gradle @@ -24,6 +24,9 @@ test.dependsOn "testPython${pythonVersionSuffix}" toxTask "testPy${pythonVersionSuffix}Cloud", "py${pythonVersionSuffix}-cloud" test.dependsOn "testPy${pythonVersionSuffix}Cloud" +toxTask "testPy${pythonVersionSuffix}Dask", "py${pythonVersionSuffix}-dask" +test.dependsOn "testPy${pythonVersionSuffix}Dask" + toxTask "testPy${pythonVersionSuffix}Cython", "py${pythonVersionSuffix}-cython" test.dependsOn "testPy${pythonVersionSuffix}Cython" diff --git a/sdks/python/tox.ini b/sdks/python/tox.ini index 138a5410ead0..33ec39c41892 100644 --- a/sdks/python/tox.ini +++ b/sdks/python/tox.ini @@ -17,7 +17,7 @@ [tox] # new environments will be excluded by default unless explicitly added to envlist. 
-envlist = py37,py38,py39,py310,py37-{cloud,cython,lint,mypy},py38-{cloud,cython,docs,cloudcoverage},py39-{cloud,cython},py310-{cloud,cython},whitespacelint +envlist = py37,py38,py39,py310,py37-{cloud,cython,lint,mypy,dask},py38-{cloud,cython,docs,cloudcoverage,dask},py39-{cloud,cython},py310-{cloud,cython,dask},whitespacelint toxworkdir = {toxinidir}/target/{env:ENV_NAME:.tox} [pycodestyle] @@ -92,6 +92,10 @@ extras = test,gcp,interactive,dataframe,aws,azure commands = {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" +[testenv:py{37,38,39}-dask] +extras = test,dask +commands = + {toxinidir}/scripts/run_pytest.sh {envname} "{posargs}" [testenv:py38-cloudcoverage] deps = codecov @@ -129,6 +133,8 @@ commands = deps = -r build-requirements.txt mypy==0.782 + dask==2022.01.0 + distributed==2022.01.0 # make extras available in case any of these libs are typed extras = gcp @@ -136,8 +142,9 @@ commands = mypy --version python setup.py mypy + [testenv:py38-docs] -extras = test,gcp,docs,interactive,dataframe +extras = test,gcp,docs,interactive,dataframe,dask deps = Sphinx==1.8.5 sphinx_rtd_theme==0.4.3 diff --git a/website/ADD_CASE_STUDY.md b/website/ADD_CASE_STUDY.md new file mode 100644 index 000000000000..f5fd454cad8c --- /dev/null +++ b/website/ADD_CASE_STUDY.md @@ -0,0 +1,72 @@ + + +# How to add a new case study + +1. Fork [Apache Beam](https://github.com/apache/beam) repository +2. This [case study draft template](https://docs.google.com/document/d/1qRpXW-WM4jtlcy5VaqDaXgYap9KI1ii27Uwp641UOBM/edit#heading=h.l6lphj20eacs) provides some helpful tips, questions and ideas to prepare and organize your case study content +3. Copy [case study md template](https://github.com/apache/beam/tree/master/website/CASE_STUDY_TEMPLATE.md) to the `case-studies` folder and name your file with company or project name e.g., `beam/website/www/site/content/en/case-studies/YOUR_CASE_STUDY_NAME.md` +4. Add your case study content to the md file you just created. 
See [Case study md file recommendations](#case-study-md-file-recommendations) +5. Add images to the image folder [beam/website/www/site/static/images/case-study](https://github.com/apache/beam/tree/master/website/www/site/static/images/case-study)/company-name according to [Case study images recommendations](#case-study-images-recommendations) +6. Add case study quote card for the [Apache Beam](https://beam.apache.org/) website homepage `Case Studies Powered by Apache Beam` section. See [Add case study card to the Apache Beam website homepage](#add-case-study-card-to-the-apache-beam-website-homepage) +7. Create pull request to the Apache Beam repository with your changes + + +## Case study md file recommendations + +The following properties determine how your case study will look on the [Apache Beam case studies](https://beam.apache.org/case-studies/) listing and on the case study page itself. + +| Field | Description | +|-------------------|---------------------------------------------------------------------------------------------------------| +| `title` | Case study title, usually 4-12 words | +| `name` | Company or project name | +| `icon` | Relative path to the company/project logo e.g. "/images/logos/powered-by/company_name.png" | +| `category` | `study` for case studies | +| `cardTitle` | Case study card title for Apache Beam [case studies](https://beam.apache.org/case-studies/) page | +| `cardDescription` | Description for [case studies](https://beam.apache.org/case-studies/) page, usually 30-40 words | +| `authorName` | Case study author | +| `authorPosition` | Case study author role | +| `authorImg` | Relative path for case study author photo, e.g. "/images/case-study/company/authorImg.png" | +| `publishDate` | Case study publish date for sorting at [case studies](https://beam.apache.org/case-studies/), e.g.
`2022-10-14T01:56:00+00:00` | + +Other sections of the [case study md template](https://github.com/apache/beam/blob/master/website/CASE_STUDY_TEMPLATE.md) are organized to present the case study content. + +## Case study images recommendations + +1. Add case study company/project logo to the [images/logos/powered-by](https://github.com/apache/beam/tree/master/website/www/site/static/images/logos/powered-by) folder. Please use your company/project name e.g. `ricardo.png` +2. Create your company/project folder to group images used in your case study e.g., `beam/website/www/site/static/images/case-study/company-name` folder +3. Add author photo to `beam/website/www/site/static/images/case-study/company-name` folder +4. Add other images that your case study is using to `beam/website/www/site/static/images/case-study/company-name` folder + + +## Add case study card to the Apache Beam website homepage + +To add a new case study card to the Apache Beam website homepage, add the new case study entry to the [quotes.yaml](https://github.com/apache/beam/blob/master/website/www/site/data/en/quotes.yaml) using the following format: + +| Field | Description | +|-------------------|---------------------------------------------------------------------------------------------------------| +| `text` | Homepage case study text, recommended up to 215 characters or so | +| `icon` | Relative path to quotation marks logo, by default `icons/quote-icon.svg` | +| `logoUrl` | Relative path for company/project logo, e.g. `images/logos/powered-by/company_name.png` | +| `linkUrl` | Relative path to the case study web page, e.g., `case-studies/YOUR_CASE_STUDY_NAME/index.html` | +| `linkText` | Link text, by default using `Learn more` | + +Example: +``` + text: Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s. 
// recommendation to use no more than 215 symbols in the text + icon: icons/quote-icon.svg + logoUrl: images/logos/powered-by/company_name.png + linkUrl: case-studies/YOUR_CASE_STUDY_NAME/index.html + linkText: Learn more +``` diff --git a/website/ADD_LOGO.md b/website/ADD_LOGO.md index a478ab867641..b6beddf05bca 100644 --- a/website/ADD_LOGO.md +++ b/website/ADD_LOGO.md @@ -18,27 +18,31 @@ --> # How to add your logo - +Please follow these steps to add your company or project logo to Apache Beam [case studies](https://beam.apache.org/case-studies/) page: 1. Fork [Apache Beam](https://github.com/apache/beam) repository 2. Add file with company or project name to the [case-studies](https://github.com/apache/beam/tree/master/website/www/site/content/en/case-studies) folder e.g., `company.md` -3. Add project/company logo to +3. Add company/project logo to the [images/logos/powered-by](https://github.com/apache/beam/tree/master/website/www/site/static/images/logos/powered-by) folder. Please use your company/project name e.g. `ricardo.png` 4. Copy template below to the created file and replace next fields with your data -| Field | Name | -|-----------------|--------------------------------------------------| -| title | Project/Company name | -| icon | Path to the logo e.g. "/images/company_name.png" | -| cardDescription | Description of the project | +| Field | Description | +|-------------------|---------------------------------------------------------------------------------------------------------| +| `title` | Company/project name | +| `icon` | Path to the company/project logo e.g. 
"/images/logos/powered-by/company_name.png" | +| `hasNav` | Specified logo page has space for left & right nav menu | +| `hasLink` | Links logo image to the company/project website instead of displaying cardDescription, optional | +| `cardDescription` | Company or project description, optional | ``` --- title: "Cloud Dataflow" -icon: /images/company_name.png -cardDescription: "Project/Company description" +icon: /images/logos/powered-by/company_name.png +hasNav: true +hasLink: false +cardDescription: "Google Cloud Dataflow is a fully managed service for executing Apache Beam pipelines within the Google Cloud Platform ecosystem." --- ``` -5. Create pull request to the apache beam repository with your changes +5. Create pull request to the Apache Beam repository with your changes \ No newline at end of file diff --git a/website/CASE_STUDY_TEMPLATE.md b/website/CASE_STUDY_TEMPLATE.md new file mode 100644 index 000000000000..4c1faa1265df --- /dev/null +++ b/website/CASE_STUDY_TEMPLATE.md @@ -0,0 +1,97 @@ +--- +title: "Case study title" +name: "Company/project name" +icon: /images/logos/powered-by/company_name.png +hasNav: true +category: study +cardTitle: "Case study title (different for the Case Studies page listing)" +cardDescription: "Case study description for Case Studies page listing" +authorName: "Name LastName" +authorPosition: "Software Engineer @ companyName" +authorImg: /images/case-study/company/authorImg.png +publishDate: 2022-02-15T01:56:00+00:00 +--- + + +

    +
    + +
    +
    +

    + “Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book.” +

    +
    +
    + +
    +
    +
    + Name LastName +
    +
    + Software Engineer @ companyName +
    +
    +
    +
    +
    + + +
    + +# Case Study Title + +## Background + +[Lorem Ipsum](https://www.lipsum.com/) is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. + +## Quote Section + +
    +

    + Lorem Ipsum is simply dummy text of the printing and typesetting industry +

    +
    +
    + +
    +
    +
    + Name LastName +
    +
    + Software Engineer @ companyName +
    +
    +
    +
    + +## Content Section + +Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. + +
    + +
    + +## Results + +Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. + + +{{< case_study_feedback Template >}} +
    +
    diff --git a/website/CONTRIBUTE.md b/website/CONTRIBUTE.md index 31a4550be237..b54139280df9 100644 --- a/website/CONTRIBUTE.md +++ b/website/CONTRIBUTE.md @@ -30,12 +30,15 @@ This guide consists of: - [Define TableOfContents](#define-tableofcontents) - [Language switching](#language-switching) - [Code highlighting](#code-highlighting) - - [Adding class to markdown text](#paragraph) + - [Adding class to markdown text](#adding-class-to-markdown-text) - [Table](#table) - [Github sample](#github-sample) - [Others](#others) + - [How to add relative links to JavaScript or CSS](#how-to-add-relative-links-to-javascript-or-css) - [What to be replaced in Jekyll](#what-to-be-replaced-in-jekyll) - [Translation guide](#translation-guide) +- [How to add new case study](#how-to-add-a-new-apache-beam-case-study) +- [How to add new logo](#how-to-add-a-new-logo-to-case-studies-page) ## Project structure @@ -45,10 +48,13 @@ www/ ├── site │   ├── archetypes # frontmatter template │   ├── assets -│ │ └── scss # styles +│ │ ├── icons # svg icons +│ │ ├── js # scripts +│ │ ├── scss # styles │   ├── content # pages │ │ └── en │ │ ├── blog +│ │ ├── case-studies │ │ ├── community │ │ ├── contribute │ │ ├── documentation @@ -63,7 +69,6 @@ www/ │ │ ├── downloads # downloaded files │ │ └── fonts │ │ └── images -│ │ └── js │   └── themes │ └── docsy ├── build_code_samples.sh @@ -162,7 +167,7 @@ $ hugo new about/_index.md $ hugo new -c content/pl about/_index.md ``` -## How to write in Hugo +## How to write in Hugo way This section will guide you how to use Hugo shortcodes in Apache Beam website. Please refer to the [Hugo documentation](https://gohugo.io/content-management/shortcodes/) for more details of usage. @@ -280,7 +285,7 @@ A table markdown here. {{< /table >}} ``` -### Code sample +### Github sample To retrieve a piece of code from Beam project. 
@@ -312,6 +317,15 @@ To get branch of the repository in markdown: To render capability matrix, please take a look at [this example](/www/site/content/en/documentation/runners/capability-matrix/#beam-capability-matrix). +### How to add relative links to JavaScript or CSS +Please note that links to JavaScript or CSS files should be written as relative paths using Hugo syntax, so that they resolve to correct absolute links on localhost, staging and production. Examples: +``` +/themes/docsy/assets/js/search.js # var searchPage = "{{ "search/" | absURL }}?q=" + query; +/assets/js/page-nav.js # img.src = "{{ "images/arrow-expandable.svg" | absURL }}"; +/assets/js/copy-to-clipboard.js # +/assets/scss/_case_study.scss # background-image: url('{{ "images/open-quote.svg" | absURL }}'); +``` + ## What to be replaced in Jekyll This section will briefly let you know the replaced features of Jekyll in terms of writing a new blog post or documentation in Hugo. @@ -420,3 +434,9 @@ Now from your template: Similar to markdown content translation, there are two separated section menus `/www/site/layouts/partials/section-menu` corresponding to your languages. Your job is to take the section menus in `en` directory, translate and place them inside your `pl` directory. **Note**: if you get stuck at adding translation, please refer to [our example](https://github.com/PolideaInternal/beam/tree/example/i18n/). + +## How to add a new Apache Beam case study +Please follow this guide to [add a new case study](https://github.com/apache/beam/tree/master/website/ADD_CASE_STUDY.md). + +## How to add a new logo to case studies page +Please follow this guide to add [a new logo](https://github.com/apache/beam/tree/master/website/ADD_LOGO.md) to the [case studies](https://beam.apache.org/case-studies/) page.
diff --git a/website/www/site/assets/js/shuffle-elements.js b/website/www/site/assets/js/shuffle-elements.js new file mode 100644 index 000000000000..2e5c4bc06dcd --- /dev/null +++ b/website/www/site/assets/js/shuffle-elements.js @@ -0,0 +1,25 @@ +// Licensed under the Apache License, Version 2.0 (the 'License'); you may not +// use this file except in compliance with the License. You may obtain a copy of +// the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations under +// the License. + +$(document).ready(function() { + const logos = document.querySelector(".case-study-list--additional"); + const temp = logos.cloneNode(true); + let i = temp.children.length + 1; + + while( i-- > 0 ) { + temp.appendChild( temp.children[Math.random() * i |0] ); + } + + logos.parentNode.replaceChild(temp, logos); + + document.querySelector(".case-study-list--additional").style.visibility = "visible"; +}); diff --git a/website/www/site/assets/scss/_calendar.scss b/website/www/site/assets/scss/_calendar.scss index 8e27fccbd6ea..68d08872c961 100644 --- a/website/www/site/assets/scss/_calendar.scss +++ b/website/www/site/assets/scss/_calendar.scss @@ -94,6 +94,9 @@ padding: 24px 19.2px 24.7px 20px; margin-bottom: 24px; } + @media (max-width: $ak-breakpoint-xs) { + width: 260px; + } &:hover { text-decoration: none; box-shadow: 0 4px 20px 0 rgba(0, 0, 0, 0.24), @@ -251,6 +254,9 @@ max-width: 327px; height: 356px; padding: 32px 20px; + @media (max-width: $ak-breakpoint-xs) { + max-width: 260px; + } .calendar-card-big-title { margin-top: 35px; diff --git a/website/www/site/assets/scss/_case_study.scss b/website/www/site/assets/scss/_case_study.scss index 00ed9fd6c933..cd9e809b1997 
100644 --- a/website/www/site/assets/scss/_case_study.scss +++ b/website/www/site/assets/scss/_case_study.scss @@ -44,6 +44,7 @@ } .case-study-list--additional { + visibility: hidden; @media (min-width: $mobile) and (max-width: $tablet) { justify-content: center; } @@ -88,11 +89,19 @@ .case-study-used-by-card--responsive { @media (min-width: $mobile){ - width: 23%; + width: 18%; margin-right: 0; } } +.case-study-used-by-card--with-link { + &:hover { + .case-study-used-by-card-img { + display: block; + } + } +} + .case-study-card { padding: 16px; display: flex; diff --git a/website/www/site/assets/scss/_local.scss b/website/www/site/assets/scss/_local.scss index a297bc15bde8..f638f64a1185 100644 --- a/website/www/site/assets/scss/_local.scss +++ b/website/www/site/assets/scss/_local.scss @@ -15,8 +15,24 @@ * limitations under the License. */ +@import "media"; + .paragraph-wrap { a { word-break: break-word; } } + +.calendar-mobile--twitter { + @media (max-width: $ak-breakpoint-xs) { + iframe { + width: 260px !important; + } + } +} + +.calendar-mobile--events { + @media (max-width: $ak-breakpoint-xs) { + overflow-x: auto; + } +} diff --git a/website/www/site/content/en/case-studies/Amazon.md b/website/www/site/content/en/case-studies/Amazon.md new file mode 100644 index 000000000000..2fba8b727aa4 --- /dev/null +++ b/website/www/site/content/en/case-studies/Amazon.md @@ -0,0 +1,18 @@ +--- +title: "Amazon" +icon: /images/logos/powered-by/Amazon.png +--- + + diff --git a/website/www/site/content/en/case-studies/ML6.md b/website/www/site/content/en/case-studies/ML6.md new file mode 100644 index 000000000000..6f41ce5405ce --- /dev/null +++ b/website/www/site/content/en/case-studies/ML6.md @@ -0,0 +1,18 @@ +--- +title: "ML6" +icon: /images/logos/powered-by/ML6.jpg +--- + + diff --git a/website/www/site/content/en/case-studies/Strivr.md b/website/www/site/content/en/case-studies/Strivr.md new file mode 100644 index 000000000000..4443d771f5b2 --- /dev/null +++ 
b/website/www/site/content/en/case-studies/Strivr.md @@ -0,0 +1,17 @@ +--- +title: "Strivr" +icon: /images/logos/powered-by/Strivr.png +--- + diff --git a/website/www/site/content/en/case-studies/TrustPilot.md b/website/www/site/content/en/case-studies/TrustPilot.md new file mode 100644 index 000000000000..09053e72f208 --- /dev/null +++ b/website/www/site/content/en/case-studies/TrustPilot.md @@ -0,0 +1,17 @@ +--- +title: "TrustPilot" +icon: /images/logos/powered-by/Trustpilot.png +--- + diff --git a/website/www/site/content/en/case-studies/Twitter.md b/website/www/site/content/en/case-studies/Twitter.md new file mode 100644 index 000000000000..16ce225c4e74 --- /dev/null +++ b/website/www/site/content/en/case-studies/Twitter.md @@ -0,0 +1,17 @@ +--- +title: "Twitter" +icon: /images/logos/powered-by/Twitter.png +--- + diff --git a/website/www/site/content/en/case-studies/Wayfair.md b/website/www/site/content/en/case-studies/Wayfair.md new file mode 100644 index 000000000000..88210a020508 --- /dev/null +++ b/website/www/site/content/en/case-studies/Wayfair.md @@ -0,0 +1,17 @@ +--- +title: "Wayfair" +icon: /images/logos/powered-by/Wayfair.png +--- + diff --git a/website/www/site/content/en/case-studies/Wizeline.md b/website/www/site/content/en/case-studies/Wizeline.md new file mode 100644 index 000000000000..e971353f667f --- /dev/null +++ b/website/www/site/content/en/case-studies/Wizeline.md @@ -0,0 +1,17 @@ +--- +title: "Wizeline" +icon: /images/logos/powered-by/Wizeline.png +--- + diff --git a/website/www/site/content/en/case-studies/akvelon.md b/website/www/site/content/en/case-studies/akvelon.md index ad95073fb772..b119bafc770b 100644 --- a/website/www/site/content/en/case-studies/akvelon.md +++ b/website/www/site/content/en/case-studies/akvelon.md @@ -2,7 +2,7 @@ title: "Akvelon" icon: /images/logos/powered-by/akvelon.png hasNav: true -cardDescription: "Akvelon is a software engineering company that helps start-ups, SMBs, and Fortune 500 companies unlock the 
full potential of cloud, data, and AI/ML to empower their strategic advantage. Akvelon team has deep expertise in integrating Apache Beam with diverse data processing ecosystems and is an enthusiastic Apache Beam community contributor." +cardDescription: "

    Akvelon is a software engineering company that helps start-ups, SMBs, and Fortune 500 companies unlock the full potential of cloud, data, and AI/ML to empower their strategic advantage. Akvelon team has deep expertise in integrating Apache Beam with diverse data processing ecosystems and is an enthusiastic Apache Beam community contributor.

    " --- -Kio is a set of Kotlin extensions for Apache Beam to implement fluent-like API for Java SDK. +
    + +# Kio is a set of Kotlin extensions for Apache Beam to implement fluent-like API for Java SDK. ## Word Count example @@ -41,3 +43,5 @@ kio.execute().waitUntilDone() ## Documentation For more information about Kio, please see the documentation here: [https://code.chermenin.ru/kio](https://code.chermenin.ru/kio). +
    +
    diff --git a/website/www/site/content/en/contribute/release-guide.md b/website/www/site/content/en/contribute/release-guide.md index c5ea4442145b..5b48068d05a2 100644 --- a/website/www/site/content/en/contribute/release-guide.md +++ b/website/www/site/content/en/contribute/release-guide.md @@ -562,18 +562,18 @@ See the source of the script for more details, or to run commands manually in ca 1. Verify Docker images are published. How to find images: 1. Visit [https://hub.docker.com/u/apache](https://hub.docker.com/search?q=apache%2Fbeam&type=image) 2. Visit each repository and navigate to *tags* tab. - 3. Verify images are pushed with tags: ${RELEASE}_rc{RC_NUM} + 3. Verify images are pushed with tags: ${RELEASE_VERSION}_rc{RC_NUM} 1. Verify that third party licenses are included in Docker containers by logging in to the images. - For Python SDK images, there should be around 80 ~ 100 dependencies. Please note that dependencies for the SDKs with different Python versions vary. Need to verify all Python images by replacing `${ver}` with each supported Python version `X.Y`. ``` - docker run --rm -it --entrypoint=/bin/bash apache/beam_python${ver}_sdk:${RELEASE}_rc{RC_NUM} + docker run --rm -it --entrypoint=/bin/bash apache/beam_python${ver}_sdk:${RELEASE_VERSION}_rc{RC_NUM} ls -al /opt/apache/beam/third_party_licenses/ | wc -l ``` - For Java SDK images, there should be around 200 dependencies. ``` - docker run --rm -it --entrypoint=/bin/bash apache/beam_java${ver}_sdk:${RELEASE}_rc{RC_NUM} + docker run --rm -it --entrypoint=/bin/bash apache/beam_java${ver}_sdk:${RELEASE_VERSION}_rc{RC_NUM} ls -al /opt/apache/beam/third_party_licenses/ | wc -l ``` 1. Publish staging artifacts @@ -740,7 +740,7 @@ all major features and bug fixes, and all known issues. 1. Maven artifacts deployed to the staging repository of [repository.apache.org](https://repository.apache.org/content/repositories/) 1. 
Source distribution deployed to the dev repository of [dist.apache.org](https://dist.apache.org/repos/dist/dev/beam/) 1. Website pull request proposed to list the [release](https://beam.apache.org/get-started/downloads/), publish the [Java API reference manual](https://beam.apache.org/releases/javadoc/), and publish the [Python API reference manual](https://beam.apache.org/releases/pydoc/). -1. Docker images are published to [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags: {RELEASE}_rc{RC_NUM}. +1. Docker images are published to [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags: {RELEASE_VERSION}_rc{RC_NUM}. You can (optionally) also do additional verification by: 1. Check that Python zip file contains the `README.md`, `NOTICE`, and `LICENSE` files. @@ -787,8 +787,9 @@ Here’s an email template; please adjust as you see fit. * website pull request listing the release [6], the blog post [6], and publishing the API reference manual [7]. * Java artifacts were built with Gradle GRADLE_VERSION and OpenJDK/Oracle JDK JDK_VERSION. * Python artifacts are deployed along with the source release to the dist.apache.org [2] and PyPI[8]. - * Validation sheet with a tab for 1.2.3 release to help with validation [9]. - * Docker images published to Docker Hub [10]. + * Go artifacts and documentation are available at pkg.go.dev [9] + * Validation sheet with a tab for 1.2.3 release to help with validation [10]. + * Docker images published to Docker Hub [11]. The vote will be open for at least 72 hours. It is adopted by majority approval, with at least 3 PMC affirmative votes. @@ -805,8 +806,9 @@ Here’s an email template; please adjust as you see fit. [6] https://github.com/apache/beam/pull/... [7] https://github.com/apache/beam-site/pull/... [8] https://pypi.org/project/apache-beam/1.2.3rc3/ - [9] https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw/edit#gid=... 
- [10] https://hub.docker.com/search?q=apache%2Fbeam&type=image + [9] https://pkg.go.dev/github.com/apache/beam/sdks/v2@v1.2.3-RC3/go/pkg/beam + [10] https://docs.google.com/spreadsheets/d/1qk-N5vjXvbcEk68GjbkSZTR8AGqyNUM-oLFo_ZXBpJw/edit#gid=... + [11] https://hub.docker.com/search?q=apache%2Fbeam&type=image If there are any issues found in the release candidate, reply on the vote thread to cancel the vote. There’s no need to wait 72 hours. @@ -877,7 +879,7 @@ _Note_: -Prepourl and -Pver can be found in the RC vote email sent by Release Ma ``` **Spark Local Runner** ``` - ./gradlew :runners:spark:2:runQuickstartJavaSpark \ + ./gradlew :runners:spark:3:runQuickstartJavaSpark \ -Prepourl=https://repository.apache.org/content/repositories/orgapachebeam-${KEY} \ -Pver=${RELEASE_VERSION} ``` @@ -1145,8 +1147,8 @@ All wheels should be published, in addition to the zip of the release source. ./beam/release/src/main/scripts/publish_docker_images.sh ``` * **Verify that:** - * Images are published at [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags {RELEASE} and *latest*. - * Images with *latest* tag are pointing to current release by confirming the digest of the image with *latest* tag is the same as the one with {RELEASE} tag. + * Images are published at [DockerHub](https://hub.docker.com/search?q=apache%2Fbeam&type=image) with tags {RELEASE_VERSION} and *latest*. + * Images with *latest* tag are pointing to current release by confirming the digest of the image with *latest* tag is the same as the one with {RELEASE_VERSION} tag. (Optional) Clean up any unneeded local images afterward to save disk space. @@ -1165,7 +1167,7 @@ Create and push a new signed tag for the released version by copying the tag for # Optional: unlock the signing key by signing an arbitrary file. 
gpg --output ~/doc.sig --sign ~/.bashrc -VERSION_TAG="v${RELEASE}" +VERSION_TAG="v${RELEASE_VERSION}" # Tag for Go SDK git tag -s "sdks/$VERSION_TAG" "$RC_TAG" diff --git a/website/www/site/content/en/documentation/basics.md b/website/www/site/content/en/documentation/basics.md index e38b4c6b0dfd..548850246a9f 100644 --- a/website/www/site/content/en/documentation/basics.md +++ b/website/www/site/content/en/documentation/basics.md @@ -93,7 +93,7 @@ For more information about pipelines, see the following pages: * [Beam Programming Guide: Overview](/documentation/programming-guide/#overview) * [Beam Programming Guide: Creating a pipeline](/documentation/programming-guide/#creating-a-pipeline) * [Design your pipeline](/documentation/pipelines/design-your-pipeline) - * [Create your pipeline](/documentation/pipeline/create-your-pipeline) + * [Create your pipeline](/documentation/pipelines/create-your-pipeline) ### PCollection diff --git a/website/www/site/content/en/documentation/ml/data-processing.md b/website/www/site/content/en/documentation/ml/data-processing.md index 6304b6d4fe1b..70e72e1c983d 100755 --- a/website/www/site/content/en/documentation/ml/data-processing.md +++ b/website/www/site/content/en/documentation/ml/data-processing.md @@ -53,6 +53,8 @@ ib.collect(beam_df.describe()) ib.collect(beam_df.isnull()) ``` +For a full end-to-end example on how to implement data exploration and data preprocessing with Beam and the DataFrame API for your AI/ML project, you can follow the [Beam Dataframe API tutorial for AI/ML](https://github.com/apache/beam/tree/master/examples/notebooks/beam-ml/dataframe_api_preprocessing.ipynb). + ## Data pipeline for ML A typical data preprocessing pipeline consists of the following steps: 1. Reading and writing data: read/write the data from your filesystem, database or messaging queue. Beam has a rich set of [IO connectors](https://beam.apache.org/documentation/io/built-in/) for ingesting and writing data. 
diff --git a/website/www/site/content/en/documentation/ml/multi-model-pipelines.md b/website/www/site/content/en/documentation/ml/multi-model-pipelines.md index be614e4b5000..ad1f5ff80f46 100644 --- a/website/www/site/content/en/documentation/ml/multi-model-pipelines.md +++ b/website/www/site/content/en/documentation/ml/multi-model-pipelines.md @@ -90,7 +90,7 @@ with pipeline as p: ``` In -this [notebook](https://github.com/apache/beam/tree/master/examples/notebooks/beam-ml/run-inference-multi-model.ipynb) +this [notebook](https://github.com/apache/beam/tree/master/examples/notebooks/beam-ml/run_inference_multi_model.ipynb) , we show an end-to-end example of a cascade pipeline used for generating and ranking image captions. The solution consists of two open-source models: diff --git a/website/www/site/content/en/documentation/ml/orchestration.md b/website/www/site/content/en/documentation/ml/orchestration.md new file mode 100644 index 000000000000..e3f7b7169e40 --- /dev/null +++ b/website/www/site/content/en/documentation/ml/orchestration.md @@ -0,0 +1,223 @@ +--- +title: "Orchestration" +--- + + +# Workflow orchestration + +## Understanding the Beam DAG + + +Apache Beam is an open source, unified model for defining both batch and streaming data-parallel processing pipelines. One of the central concepts to the Beam programming model is the DAG (= Directed Acyclic Graph). Each Beam pipeline is a DAG that can be constructed through the Beam SDK in your programming language of choice (from the set of supported beam SDKs). Each node of this DAG represents a processing step (PTransform) that accepts a collection of data as input (PCollection) and outputs a transformed collection of data (PCollection). The edges define how data flows through the pipeline from one processing step to another. The image below shows an example of such a pipeline. 
+ +![A standalone beam pipeline](/images/standalone-beam-pipeline.svg) + +Note that simply defining a pipeline and the corresponding DAG does not mean that data will start flowing through the pipeline. To actually execute the pipeline, it has to be deployed to one of the [supported Beam runners](https://beam.apache.org/documentation/runners/capability-matrix/). These distributed processing back-ends include Apache Flink, Apache Spark and Google Cloud Dataflow. A [Direct Runner](https://beam.apache.org/documentation/runners/direct/) is also provided to execute the pipeline locally on your machine for development and debugging purposes. Make sure to check out the [runner capability matrix](https://beam.apache.org/documentation/runners/capability-matrix/) to guarantee that the chosen runner supports the data processing steps defined in your pipeline, especially when using the Direct Runner. + +## Orchestrating frameworks + +Successfully delivering machine learning projects is about a lot more than training a model and calling it a day. A full ML workflow will often contain a range of other steps including data ingestion, data validation, data preprocessing, model evaluation, model deployment, data drift detection, etc. Furthermore, it’s essential to keep track of metadata and artifacts from your experiments to answer important questions like: +- What data was this model trained on and with which training parameters? +- When was this model deployed and what accuracy did it get on a test dataset? +Without this knowledge at your disposal, it will become increasingly difficult to troubleshoot, monitor and improve your ML solutions as they grow in size. + +The solution: MLOps. MLOps is an umbrella term used to describe best practices and guiding principles that aim to make the development and maintenance of machine learning systems seamless and efficient. Simply put, MLOps is most often about automating machine learning workflows throughout the model and data lifecycle. 
Popular frameworks to create these workflow DAGs are [Kubeflow Pipelines](https://www.kubeflow.org/docs/components/pipelines/introduction/), [Apache Airflow](https://airflow.apache.org/docs/apache-airflow/stable/index.html) and [TFX](https://www.tensorflow.org/tfx/guide). + +So what does all of this have to do with Beam? Well, since we established that Beam is a great tool for a range of ML tasks, a Beam pipeline can either be used as a standalone data processing job or can be part of a larger sequence of steps in such a workflow. In the latter case, the Beam DAG is just one node in the overarching DAG composed by the workflow orchestrator. This results in a DAG in a DAG, as illustrated by the example below. + +![A Beam pipeline as part of a larger orchestrated workflow](/images/orchestrated-beam-pipeline.svg) + +It is important to understand the key difference between the Beam DAG and the orchestrating DAG. The Beam DAG processes data and passes that data between the nodes of its DAG. The focus of Beam is on parallelization and enabling both batch and streaming jobs. In contrast, the orchestration DAG schedules and monitors steps in the workflow, and what it passes between the nodes of the DAG are execution parameters, metadata and artifacts. An example of such an artifact could be a trained model or a dataset. Such artifacts are often passed by a reference URI and not by value. + +Note: TFX creates a workflow DAG, which needs an orchestrator of its own to be executed. [Natively supported orchestrators for TFX](https://www.tensorflow.org/tfx/guide/custom_orchestrator) are Airflow, Kubeflow Pipelines and, here’s the kicker, Beam itself! As mentioned by the [TFX docs](https://www.tensorflow.org/tfx/guide/beam_orchestrator): + +> "Several TFX components rely on Beam for distributed data processing. In addition, TFX can use Apache Beam to orchestrate and execute the pipeline DAG.
Beam orchestrator uses a different BeamRunner than the one which is used for component data processing." + +Caveat: The Beam orchestrator is not meant to be a TFX orchestrator to be used in production environments. It simply enables debugging TFX pipelines locally on Beam’s DirectRunner without the need for the extra setup that is needed for Airflow or Kubeflow. + +## Preprocessing example + +Let’s get practical and take a look at two such orchestrated ML workflows, one with Kubeflow Pipelines (KFP) and one with Tensorflow Extended (TFX). These two frameworks achieve the same goal of creating workflows, but have their own distinct advantages and disadvantages: KFP requires you to create your workflow components from scratch and requires a user to explicitly indicate which artifacts should be passed between components and in what way. In contrast, TFX offers a number of prebuilt components and takes care of the artifact passing more implicitly. Clearly, there is a trade-off to be considered between flexibility and programming overhead when choosing between the two frameworks. We will start by looking at an example with KFP and then transition to TFX to show how TFX takes care of a lot of functionality that we had to define by hand in the KFP example. + +For simplicity, we will showcase workflows with only three components: data ingestion, data preprocessing and model training. Depending on the scenario, a range of extra components could be added such as model evaluation, model deployment, etc. We will focus our attention on the preprocessing component, since it showcases how to use Apache Beam in an ML workflow for efficient and parallel processing of your ML data. + +The dataset we will use consists of image-caption pairs, i.e. images paired with a textual caption describing the content of the image. These pairs are taken from the captions subset of the [MSCOCO 2014 dataset](https://cocodataset.org/#home).
This multi-modal data (image + text) gives us the opportunity to experiment with preprocessing operations for both modalities. + +### Kubeflow pipelines (KFP) + +In order to execute our ML workflow with KFP we must perform three steps: + +1. Create the KFP components by specifying the interface to the components and by writing and containerizing the implementation of the component logic. +2. Create the KFP pipeline by connecting the created components, specifying how inputs and outputs should be passed between components, and compiling the pipeline definition to a full pipeline definition. +3. Execute the KFP pipeline by submitting it to a KFP client endpoint. + +The full example code can be found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/ml-orchestration/kfp). + +#### Create the KFP components + +This is our target file structure: + + kfp + ├── pipeline.py + ├── components + │ ├── ingestion + │ │ ├── Dockerfile + │ │ ├── component.yaml + │ │ ├── requirements.txt + │ │ └── src + │ │ └── ingest.py + │ ├── preprocessing + │ │ ├── Dockerfile + │ │ ├── component.yaml + │ │ ├── requirements.txt + │ │ └── src + │ │ └── preprocess.py + │ └── train + │ ├── Dockerfile + │ ├── component.yaml + │ ├── requirements.txt + │ └── src + │ └── train.py + └── requirements.txt + +Let’s start with the component specifications. The full preprocessing component specification is illustrated below. The inputs are the path where the ingested dataset was saved by the ingest component and a path to a directory where the component can store artifacts. Additionally, there are some inputs that specify how and where the Beam pipeline should run.
The specifications for the ingestion and train component are similar and can be found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/ingestion/component.yaml) and [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/ml-orchestration/kfp/components/train/component.yaml), respectively. + +>Note: we are using the KFP v1 SDK, because v2 is still in [beta](https://www.kubeflow.org/docs/started/support/#application-status). The v2 SDK introduces some new options for specifying the component interface with more native support for input and output artifacts. To see how to migrate components from v1 to v2, consult the [KFP docs](https://www.kubeflow.org/docs/components/pipelines/sdk-v2/v2-component-io/). + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/component.yaml" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/component.yaml" preprocessing_component_definition >}} +{{< /highlight >}} + +In this case, each component shares an identical Dockerfile but extra component-specific dependencies could be added where necessary. + +{{< highlight language="Dockerfile" file="sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/Dockerfile" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/Dockerfile" component_dockerfile >}} +{{< /highlight >}} + +With the component specification and containerization out of the way we can look at the actual implementation of the preprocessing component. + +Since KFP provides the input and output arguments as command-line arguments, an `argumentparser` is needed. 
+ +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" preprocess_component_argparse >}} +{{< /highlight >}} + +The implementation of the `preprocess_dataset` function contains the Beam pipeline code and the Beam pipeline options to select the desired runner. The executed preprocessing involves downloading the image bytes from their URL, converting them to a Torch Tensor and resizing to the desired size. The caption undergoes a series of string manipulations to ensure that our model receives clean uniform image descriptions (Tokenization is not yet done here, but could be included here as well if the vocabulary is known). Finally, each element is serialized and written to [Avro](https://avro.apache.org/docs/1.2.0/) files (Alternative file formats could be used as well, e.g. TFRecords). + + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" deploy_preprocessing_beam_pipeline >}} +{{< /highlight >}} + +It also contains the necessary code to perform the component IO. First, a target path is constructed to store the preprocessed dataset based on the component input parameter `base_artifact_path` and a timestamp. Output values from components can only be returned as files so we write the value of the constructed target path to an output file that was provided by KFP to our component.
+ +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/components/preprocessing/src/preprocess.py" kfp_component_input_output >}} +{{< /highlight >}} + +Since we are mainly interested in the preprocessing component to show how a Beam pipeline can be integrated into a larger ML workflow, we will not cover the implementation of the ingestion and train component in depth. Implementations of dummy components that mock their behavior are provided in the full example code. + +#### Create the pipeline definition + +`pipeline.py` first loads the created components from their specification `.yaml` file. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" load_kfp_components >}} +{{< /highlight >}} + +After that, the pipeline is created and the required components inputs and outputs are specified manually. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" define_kfp_pipeline >}} +{{< /highlight >}} + +Finally, the defined pipeline is compiled and a `pipeline.json` specification file is generated. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" compile_kfp_pipeline >}} +{{< /highlight >}} + + +#### Execute the KFP pipeline + +Using the specification file and the snippet below with the necessary [requirements](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/ml-orchestration/kfp/requirements.txt) installed, the pipeline can now be executed. 
Consult the [docs](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.client.html#kfp.Client.run_pipeline) for more information. Note that, before executing the pipeline, a container for each component must be built and pushed to a container registry that can be accessed by your pipeline execution. Also make sure that the component specification `.yaml` files are updated to point to the correct container image. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/kfp/pipeline.py" execute_kfp_pipeline >}} +{{< /highlight >}} + + +### Tensorflow Extended (TFX) + +The way of working for TFX is similar to the approach for KFP as illustrated above: Define the individual workflow components, connect them in a pipeline object and run the pipeline in the target environment. However, what makes TFX different is that it has already built a set of Python packages that are libraries to create workflow components. So unlike the KFP example, we do not need to start from scratch by writing and containerizing our code. What is left for the users to do is pick which of those TFX components are relevant to their specific workflow and adapt their functionality to the specific use case using the library. The image below shows the available components and their corresponding libraries. The link with Apache Beam is that TFX relies heavily on it to implement data-parallel pipelines in these libraries. This means that components created with these libraries will need to be run on one of the supported Beam runners. The full example code can again be found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/ml-orchestration/tfx). + + +![TFX libraries and components](https://www.tensorflow.org/static/tfx/guide/images/libraries_components.png) + +We will work out a small example in a similar fashion as for KFP.
There we used ingestion, preprocessing and trainer components. Translating this to TFX, we will need the ExampleGen, Transform and Trainer libraries. + +This time we will start by looking at the pipeline definition. Note that this looks very similar to our previous example. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py" tfx_pipeline >}} +{{< /highlight >}} + +We will use the same data input as last time, i.e. a couple of image-captions pairs extracted from the [MSCOCO 2014 dataset](https://cocodataset.org/#home). This time, however, in CSV format because the ExampleGen component does not by default have support for jsonlines. (The formats that are supported out of the box are listed [here](https://www.tensorflow.org/tfx/guide/examplegen#data_sources_and_formats). Alternatively, it’s possible to write a [custom ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen#custom_examplegen) as well.) + +Copy the snippet below to an input data csv file: + +{{< highlight >}} +image_id,id,caption,image_url,image_name,image_license +318556,255,"An angled view of a beautifully decorated bathroom.","http://farm4.staticflickr.com/3133/3378902101_3c9fa16b84_z.jpg","COCO_train2014_000000318556.jpg","Attribution-NonCommercial-ShareAlike License" +476220,14,"An empty kitchen with white and black appliances.","http://farm7.staticflickr.com/6173/6207941582_b69380c020_z.jpg","COCO_train2014_000000476220.jpg","Attribution-NonCommercial License" +{{< /highlight >}} + +So far, we have only imported standard TFX components and chained them together into a pipeline. Both the Transform and Trainer components have a `module_file` argument defined. That’s where we define the behavior we want from these standard components. 
+ +#### Preprocess + +The Transform component searches the `module_file` for a definition of the function `preprocessing_fn`. This function is the central concept of the `tf.transform` library. As per the [TFX docs](https://www.tensorflow.org/tfx/transform/get_started#define_a_preprocessing_function): + +> The preprocessing function is the most important concept of tf.Transform. The preprocessing function is a logical description of a transformation of the dataset. The preprocessing function accepts and returns a dictionary of tensors, where a tensor means Tensor or SparseTensor. There are two kinds of functions used to define the preprocessing function: +>1. Any function that accepts and returns tensors. These add TensorFlow operations to the graph that transform raw data into transformed data. +>2. Any of the analyzers provided by tf.Transform. Analyzers also accept and return tensors, but unlike TensorFlow functions, they do not add operations to the graph. Instead, analyzers cause tf.Transform to compute a full-pass operation outside of TensorFlow. They use the input tensor values over the entire dataset to generate a constant tensor that is returned as the output. For example, tft.min computes the minimum of a tensor over the dataset. tf.Transform provides a fixed set of analyzers, but this will be extended in future versions. + +So our `preprocessing_fn` can contain all tf operations that accept and return tensors and also specific `tf.transform` operations. In our simple example below we use the former to convert all incoming captions to lowercase letters only, while the latter does a full pass on all the data in our dataset to compute the average length of the captions to be used for a follow-up preprocessing step.
+ +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" tfx_preprocessing_fn >}} +{{< /highlight >}} + +However, this function only defines the logical steps that have to be performed during preprocessing and needs a concrete implementation before it can be executed. One such implementation is provided by `tf.Transform` using Apache Beam and provides a PTransform `tft_beam.AnalyzeAndTransformDataset` to process the data. We can test this `preprocessing_fn` outside of the TFX Transform component using this PTransform explicitly. Calling the `preprocessing_fn` in such a way is not necessary when using `tf.Transform` in combination with the TFX Transform component. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" tfx_analyze_and_transform >}} +{{< /highlight >}} + +#### Train + +Finally, the Trainer component behaves in a similar way as the Transform component, but instead of looking for a `preprocessing_fn` it requires a `run_fn` function to be present in the specified `module_file`. Our simple implementation creates a stub model using `tf.Keras` and saves the resulting model to a directory. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_utils.py" tfx_run_fn >}} +{{< /highlight >}} + +#### Executing the pipeline + +To launch the pipeline, two configurations must be provided: The orchestrator for the TFX pipeline and the pipeline options to run Beam pipelines. In this case we use the `LocalDagRunner` for orchestration to run the pipeline locally without extra setup dependencies.
The created pipeline can specify Beam’s pipeline options as usual through the `beam_pipeline_args` argument. + +{{< highlight file="sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py" >}} +{{< code_sample "sdks/python/apache_beam/examples/ml-orchestration/tfx/coco_captions_local.py" tfx_execute_pipeline >}} +{{< /highlight >}} diff --git a/website/www/site/content/en/documentation/ml/overview.md b/website/www/site/content/en/documentation/ml/overview.md old mode 100755 new mode 100644 index 0423686329c8..628dd009b75b --- a/website/www/site/content/en/documentation/ml/overview.md +++ b/website/www/site/content/en/documentation/ml/overview.md @@ -47,12 +47,26 @@ Further reading: ## Inference -There are several ways to use and deploy your model: -1. Making it available for online predictions via an API -2. Running it in real-time as new data becomes available in a pipeline -3. Running it in batch on an existing dataset +Beam provides different ways of implementing inference as part of your pipeline. This way you can run your ML model directly in your pipeline and apply it on large-scale datasets, both in batch and streaming pipelines. + +### RunInference +The recommended way to implement inference is by using the [RunInference API](https://beam.apache.org/documentation/sdks/python-machine-learning/). RunInference takes advantage of existing Apache Beam concepts, such as the `BatchElements` transform and the `Shared` class, to enable you to use models in your pipelines to create transforms optimized for machine learning inferences. The ability to create arbitrarily complex workflow graphs also allows you to build multi-model pipelines. + +You can easily integrate your model in your pipeline by using the corresponding model handlers. A `ModelHandler` is an object that wraps the underlying model and allows you to configure its parameters. Model handlers are available for PyTorch, Scikit-learn and TensorFlow.
Examples of how to use RunInference for PyTorch, Scikit-learn and TensorFlow are shown in this [notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb). + +GPUs are optimized for training artificial intelligence and deep learning models as they can process multiple computations simultaneously. RunInference also allows you to use GPUs for significant inference speedup. An example of how to use RunInference with GPUs is demonstrated [here](/documentation/ml/runinference-metrics). + +### Custom Inference +As of now, the RunInference API doesn't support making remote inference calls (e.g. Natural Language API, Cloud Vision API and others). Therefore, in order to use these remote APIs with Beam, one needs to write a custom inference call. The [notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/custom_remote_inference.ipynb) shows how you can implement such a custom remote inference call using `beam.DoFn`. While implementing such a remote inference for real life projects, you need to think about the following: + +* API quotas and the heavy load you might incur on your external API. For optimizing the calls to the external API, you can configure `PipelineOptions` to limit the parallel calls to the external remote API. + +* You must be prepared to encounter, identify, and handle failure as gracefully as possible. We recommend using techniques like `Exponential backoff` and `Dead letter queues`. + +* When running inference with an external API, you should batch your input together to allow for more efficient execution. + +* You should consider monitoring and measuring performance of a pipeline when deploying since monitoring can provide insight into the status and health of the application. -Beam is ideally suitable for the last 2 use cases.
In this case your data will run through a pipeline (streaming or batch), and you can obtain predictions by running inference in one of the steps of your pipeline. Beam provides the [RunInference API](https://beam.apache.org/documentation/sdks/python-machine-learning/) to facilitate the integration of your model into a pipeline step. When running your model, a common requirement is to enable GPU execution. Beam also provides support for this. ## Orchestrators @@ -61,6 +75,7 @@ In order to automate and track the AI/ML workflows throughout your project, you ## Examples You can find examples of end-to-end AI/ML pipelines for several use cases: -* [Multi model pipelines in Beam](/documentation/ml/multi-model-pipelines) -* [Online Clustering in Beam](/documentation/ml/online-clustering) -* [Anomaly Detection in Beam](/documentation/ml/anomaly-detection) +* [ML Workflow Orchestration](/documentation/ml/orchestration): illustrates how ML workflows consisting of multiple steps can be orchestrated by using Kubeflow Pipelines and Tensorflow Extended. +* [Multi model pipelines in Beam](/documentation/ml/multi-model-pipelines): explains how multi-model pipelines work and gives an overview of what you need to know to build one using the RunInference API. +* [Online Clustering in Beam](/documentation/ml/online-clustering): demonstrates how to set up a realtime clustering pipeline that can read text from PubSub, convert the text into an embedding using a transformer based language model with the RunInference API, and cluster them using BIRCH with Stateful Processing. +* [Anomaly Detection in Beam](/documentation/ml/anomaly-detection): demonstrates how to set up an anomaly detection pipeline that reads text from PubSub in real-time, and then detects anomalies using a trained HDBSCAN clustering model with the RunInference API.
\ No newline at end of file diff --git a/website/www/site/content/en/documentation/ml/runinference-metrics.md b/website/www/site/content/en/documentation/ml/runinference-metrics.md new file mode 100644 index 000000000000..e58cf20c0d42 --- /dev/null +++ b/website/www/site/content/en/documentation/ml/runinference-metrics.md @@ -0,0 +1,102 @@ +--- +title: "RunInference Metrics" +--- + + +# RunInference Metrics Example + +The main purpose of the example is to demonstrate and explain different metrics that are available when using the [RunInference](https://beam.apache.org/documentation/transforms/python/elementwise/runinference/) transform to perform inference using a machine learning model. We use a pipeline that reads a list of sentences, tokenizes the text, and uses a Transformer based model `distilbert-base-uncased-finetuned-sst-2-english` for classifying the texts into two different classes using `RunInference`. + +We showcase different RunInference metrics when the pipeline is executed using the Dataflow Runner on CPU and GPU. The full example code can be found [here](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference/runinference_metrics/). + + +The file structure for the entire pipeline is: + + runinference_metrics/ + ├── pipeline/ + │ ├── __init__.py + │ ├── options.py + │ └── transformations.py + ├── __init__.py + ├── config.py + ├── main.py + └── setup.py + +`pipeline/transformations.py` contains the code for `beam.DoFn` and additional functions that are used for the pipeline + +`pipeline/options.py` contains the pipeline options to configure the Dataflow pipeline + +`config.py` defines some variables like GCP `PROJECT_ID`, `NUM_WORKERS` that are used multiple times + +`setup.py` defines the packages/requirements for the pipeline to run + +`main.py` contains the pipeline code and some additional functions used for running the pipeline + + +### How to Run the Pipeline +First, make sure you have installed the required packages. 
One should have access to a Google Cloud Project and then correctly configure the GCP variables like `PROJECT_ID`, `REGION`, and others in `config.py`. To use GPUs, follow the setup instructions [here](https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/dataflow/gpu-examples/pytorch-minimal). + + +1. Dataflow with CPU: `python main.py --mode cloud --device CPU` +2. Dataflow with GPU: `python main.py --mode cloud --device GPU` + +The pipeline can be broken down into few simple steps: +1. Create a list of texts to use as an input using `beam.Create` +2. Tokenize the text +3. Use RunInference to do inference +4. Postprocess the output of RunInference + +{{< highlight >}} + with beam.Pipeline(options=pipeline_options) as pipeline: + _ = ( + pipeline + | "Create inputs" >> beam.Create(inputs) + | "Tokenize" >> beam.ParDo(Tokenize(cfg.TOKENIZER_NAME)) + | "Inference" >> + RunInference(model_handler=KeyedModelHandler(model_handler)) + | "Decode Predictions" >> beam.ParDo(PostProcessor())) +{{< /highlight >}} + + +## RunInference Metrics + +As mentioned above, we benchmarked the performance of RunInference using Dataflow on both CPU and GPU. These metrics can be seen in the GCP UI and can also be printed using + +{{< highlight >}} +metrics = pipeline.result.metrics().query(beam.metrics.MetricsFilter()) +{{< /highlight >}} + + +A snapshot of different metrics from GCP UI when using Dataflow on GPU: + + ![RunInference GPU metrics rendered on Dataflow](/images/runinference_metrics_snapshot.svg) + +Some metrics commonly used for benchmarking are: + +* `num_inferences`: represents the total number of elements passed to `run_inference()`. + +* `inference_batch_latency_micro_secs_MEAN`: represents the average time taken to perform the inference across all batches of examples, measured in microseconds. 
+ +* `inference_request_batch_size_COUNT`: represents the total number of samples across all batches of examples (created from `beam.BatchElements`) to be passed to run_inference() + +* `inference_request_batch_byte_size_MEAN`: represents the average size of all elements for all samples in all batches of examples (created from `beam.BatchElements`) to be passed to run_inference(). This is measured in bytes. + +* `model_byte_size_MEAN`: represents the average memory consumed to load and initialize the model. This is measured in bytes. + +* `load_model_latency_milli_secs_MEAN`: represents the average time taken to load and initialize the model. This is measured in milliseconds. + +One can derive other relevant metrics like +* `Total time taken for inference` = `num_inferences x inference_batch_latency_micro_secs_MEAN` + diff --git a/website/www/site/content/en/documentation/programming-guide.md b/website/www/site/content/en/documentation/programming-guide.md index 4276e7dc3658..8d242cc60703 100644 --- a/website/www/site/content/en/documentation/programming-guide.md +++ b/website/www/site/content/en/documentation/programming-guide.md @@ -3792,39 +3792,89 @@ the user ids from a `PCollection` of purchases one would write (using the `Selec purchases.apply(Select.fieldNames("userId")); {{< /highlight >}} +{{< highlight py >}} +input_pc = ... # {"user_id": ...,"bank": ..., "purchase_amount": ...} +output_pc = input_pc | beam.Select("user_id") +{{< /highlight >}} + ##### **Nested fields** +{{< paragraph class="language-py" >}} +Support for Nested fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for Nested fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-java" >}} Individual nested fields can be specified using the dot operator. 
For example, to select just the postal code from the shipping address one would write +{{< /paragraph >}} {{< highlight java >}} purchases.apply(Select.fieldNames("shippingAddress.postCode")); {{< /highlight >}} + ##### **Wildcards** +{{< paragraph class="language-py" >}} +Support for wildcards hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for wildcards hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-java" >}} The * operator can be specified at any nesting level to represent all fields at that level. For example, to select all shipping-address fields one would write +{{< /paragraph >}} {{< highlight java >}} purchases.apply(Select.fieldNames("shippingAddress.*")); {{< /highlight >}} + ##### **Arrays** +{{< paragraph class="language-java" >}} An array field, where the array element type is a row, can also have subfields of the element type addressed. When selected, the result is an array of the selected subfield type. For example +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +Support for Array fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for Array fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} {{< highlight java >}} purchases.apply(Select.fieldNames("transactions[].bank")); {{< /highlight >}} +{{< paragraph class="language-java" >}} Will result in a row containing an array field with element-type string, containing the list of banks for each transaction. +{{< /paragraph >}} +{{< paragraph class="language-java" >}} While the use of [] brackets in the selector is recommended, to make it clear that array elements are being selected, they can be omitted for brevity. In the future, array slicing will be supported, allowing selection of portions of the array. 
+{{< /paragraph >}} + ##### **Maps** @@ -3858,6 +3908,14 @@ The following purchasesByType.apply(Select.fieldNames("purchases{}.userId")); {{< /highlight >}} +{{< paragraph class="language-py" >}} +Support for Map fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for Map fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + Will result in a row containing a map field with key-type string and value-type string. The selected map will contain all of the keys from the original map, and the values will be the userId contained in the purchase record. @@ -3882,6 +3940,14 @@ could select only the userId and streetAddress fields as follows purchases.apply(Select.fieldNames("userId", "shippingAddress.streetAddress")); {{< /highlight >}} +{{< paragraph class="language-py" >}} +Support for Nested fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for Nested fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + The resulting `PCollection` will have the following schema @@ -3910,6 +3976,14 @@ The same is true for wildcard selections. The following purchases.apply(Select.fieldNames("userId", "shippingAddress.*")); {{< /highlight >}} +{{< paragraph class="language-py" >}} +Support for Wildcards hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for Wildcards hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + Will result in the following schema
    @@ -3956,6 +4030,15 @@ selected field will appear as its own array field. For example purchases.apply(Select.fieldNames( "transactions.bank", "transactions.purchaseAmount")); {{< /highlight >}} +{{< paragraph class="language-py" >}} +Support for nested fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for nested fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-java" >}} Will result in the following schema
    @@ -3976,6 +4059,7 @@ Will result in the following schema

    +{{< /paragraph >}} Wildcard selections are equivalent to separately selecting each field. @@ -3993,6 +4077,15 @@ Another use of the Select transform is to flatten a nested schema into a single purchases.apply(Select.flattenedSchema()); {{< /highlight >}} +{{< paragraph class="language-py" >}} +Support for nested fields hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for nested fields hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-java" >}} Will result in the following schema @@ -4045,21 +4138,48 @@ Will result in the following schema

    +{{< /paragraph >}} ##### **Grouping aggregations** +{{< paragraph class="language-java" >}} The `Group` transform allows simply grouping data by any number of fields in the input schema, applying aggregations to those groupings, and storing the result of those aggregations in a new schema field. The output of the `Group` transform has a schema with one field corresponding to each aggregation performed. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +The `GroupBy` transform allows simply grouping data by any number of fields in the input schema, applying aggregations to +those groupings, and storing the result of those aggregations in a new schema field. The output of the `GroupBy` transform +has a schema with one field corresponding to each aggregation performed. +{{< /paragraph >}} +{{< paragraph class="language-java" >}} The simplest usage of `Group` specifies no aggregations, in which case all inputs matching the provided set of fields are grouped together into an `ITERABLE` field. For example +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +The simplest usage of `GroupBy` specifies no aggregations, in which case all inputs matching the provided set of fields +are grouped together into an `ITERABLE` field. For example +{{< /paragraph >}} {{< highlight java >}} -purchases.apply(Group.byFieldNames("userId", "shippingAddress.streetAddress")); +purchases.apply(Group.byFieldNames("userId", "bank")); {{< /highlight >}} +{{< highlight py >}} +input_pc = ... # {"user_id": ...,"bank": ..., "purchase_amount": ...} +output_pc = input_pc | beam.GroupBy('user_id','bank') +{{< /highlight >}} + +{{< paragraph class="language-go" >}} +Support for schema-aware grouping hasn't been developed for the Go SDK yet. 
+{{< /paragraph >}} + +{{< paragraph class="language-java" >}} The output schema of this is: +{{< /paragraph >}} @@ -4071,7 +4191,7 @@ The output schema of this is: - + @@ -4104,6 +4224,18 @@ purchases.apply(Group.byFieldNames("userId") .aggregateField("costCents", Top.largestLongsFn(10), "topPurchases")); {{< /highlight >}} +{{< highlight py >}} +input_pc = ... # {"user_id": ..., "item_id": ..., "cost_cents": ...} +output_pc = input_pc | beam.GroupBy("user_id") + .aggregate_field("item_id", CountCombineFn, "num_purchases") + .aggregate_field("cost_cents", sum, "total_spendcents") + .aggregate_field("cost_cents", TopCombineFn, "top_purchases") +{{< /highlight >}} + +{{< paragraph class="language-go" >}} +Support for schema-aware grouping hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + The result of this aggregation will have the following schema:
    keyROW{userId:STRING, streetAddress:STRING}ROW{userId:STRING, bank:STRING}
    values
    @@ -4135,6 +4267,14 @@ that are likely associated with that transaction (both the user and product matc "natural join" - one in which the same field names are used on both the left-hand and right-hand sides of the join - and is specified with the `using` keyword: +{{< paragraph class="language-py" >}} +Support for joins hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for joins hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + {{< highlight java >}} PCollection transactions = readTransactions(); PCollection reviews = readReviews(); @@ -4142,6 +4282,7 @@ PCollection joined = transactions.apply( Join.innerJoin(reviews).using("userId", "productId")); {{< /highlight >}} +{{< paragraph class="language-java" >}} The resulting schema is the following:
    @@ -4162,12 +4303,21 @@ The resulting schema is the following:

    +{{< /paragraph >}} Each resulting row contains one Transaction and one Review that matched the join condition. If the fields to match in the two schemas have different names, then the on function can be used. For example, if the Review schema named those fields differently than the Transaction schema, then we could write the following: +{{< paragraph class="language-py" >}} +Support for joins hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for joins hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + {{< highlight java >}} PCollection joined = transactions.apply( Join.innerJoin(reviews).on( @@ -4188,6 +4338,14 @@ can optionally be expanded - providing individual joined records, as in the `Joi processed in unexpanded format - providing the join key along with Iterables of all records from each input that matched that key. +{{< paragraph class="language-py" >}} +Support for joins hasn't been developed for the Python SDK yet. +{{< /paragraph >}} + +{{< paragraph class="language-go" >}} +Support for joins hasn't been developed for the Go SDK yet. +{{< /paragraph >}} + ##### **Filtering events** The `Filter` transform can be configured with a set of predicates, each one based one specified fields. Only records for @@ -7282,7 +7440,18 @@ To create an SDK wrapper for use in a Python pipeline, do the following: #### 13.1.2. Creating cross-language Python transforms -To make your Python transform usable with different SDK languages, you must create a Python module that registers an existing Python transform as a cross-language transform for use with the Python expansion service and calls into that existing transform to perform its intended operation. +Any Python transforms defined in the scope of the expansion service should be accessible by specifying their fully qualified names. 
For example, you could use Python's `ReadFromText` transform in a Java pipeline with its fully qualified name `apache_beam.io.ReadFromText`: + +```java +p.apply("Read", + PythonExternalTransform.>from("apache_beam.io.ReadFromText") + .withKwarg("file_pattern", options.getInputFile()) + .withKwarg("validate", false)) +``` + + > **Note:** `PythonExternalTransform` has other useful methods such as `withExtraPackages` for staging PyPI package dependencies and `withOutputCoder` for setting an output coder. + +Alternatively, you may want to create a Python module that registers an existing Python transform as a cross-language transform for use with the Python expansion service and calls into that existing transform to perform its intended operation. A registered URN can be used later in an expansion request for indicating an expansion target. **Defining the Python module** @@ -7336,7 +7505,7 @@ $ export PORT_FOR_EXPANSION_SERVICE=12345 3. Import any modules that contain transforms to be made available using the expansion service. {{< highlight >}} -$ python -m apache_beam.runners.portability.expansion_service_test -p $PORT_FOR_EXPANSION_SERVICE +$ python -m apache_beam.runners.portability.expansion_service_test -p $PORT_FOR_EXPANSION_SERVICE --pickle_library=cloudpickle {{< /highlight >}} 4. This expansion service is now ready to serve up transforms on the address `localhost:$PORT_FOR_EXPANSION_SERVICE`. @@ -7393,7 +7562,29 @@ Depending on the SDK language of the pipeline, you can use a high-level SDK-wrap #### 13.2.1. Using cross-language transforms in a Java pipeline -Currently, to access cross-language transforms from the Java SDK, you have to use the lower-level [External](https://github.com/apache/beam/blob/master/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java) class. +Users have three options to use cross-language transforms in a Java pipeline. 
At the highest level of abstraction, some popular Python transforms are accessible through dedicated Java wrapper transforms. For example, the Java SDK has the `DataframeTransform` class, which uses the Python SDK's `DataframeTransform`, and it has the `RunInference` class, which uses the Python SDK's `RunInference`, and so on. When an SDK-specific wrapper transform is not available for a target Python transform, you can use the lower-level [PythonExternalTransform](https://github.com/apache/beam/blob/master/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonExternalTransform.java) class instead by specifying the fully qualified name of the Python transform. If you want to try external transforms from SDKs other than Python (including Java SDK itself), you can also use the lowest-level [External](https://github.com/apache/beam/blob/master/runners/core-construction-java/src/main/java/org/apache/beam/runners/core/construction/External.java) class. + +**Using an SDK wrapper** + +To use a cross-language transform through an SDK wrapper, import the module for the SDK wrapper and call it from your pipeline, as shown in the example: + +```java +import org.apache.beam.sdk.extensions.python.transforms.DataframeTransform; + +input.apply(DataframeTransform.of("lambda df: df.groupby('a').sum()").withIndexes()) +``` + +**Using the PythonExternalTransform class** + +When an SDK-specific wrapper is not available, you can access the Python cross-language transform through the `PythonExternalTransform` class by specifying the fully qualified name and the constructor arguments of the target Python transform. 
+ +```java +input.apply( + PythonExternalTransform., PCollection>from( + "apache_beam.dataframe.transforms.DataframeTransform") + .withKwarg("func", PythonCallableSource.of("lambda df: df.groupby('a').sum()")) + .withKwarg("include_indexes", true)) +``` **Using the External class** @@ -7608,3 +7799,238 @@ Dataflow supports multi-language pipelines through the Dataflow Runner v2 backen ### 13.4 Tips and Troubleshooting {#x-lang-transform-tips-troubleshooting} For additional tips and troubleshooting information, see [here](https://cwiki.apache.org/confluence/display/BEAM/Multi-language+Pipelines+Tips). + +## 14 Batched DoFns {#batched-dofns} +{{< language-switcher java py go typescript >}} + +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +Batched DoFns enable users to create modular, composable components that +operate on batches of multiple logical elements. These DoFns can leverage +vectorized Python libraries, like numpy, scipy, and pandas, which operate on +batches of data for efficiency. +{{< /paragraph >}} + +### 14.1 Basics {#batched-dofn-basics} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +A trivial Batched DoFn might look like this: +{{< /paragraph >}} + +{{< highlight py >}} +class MultiplyByTwo(beam.DoFn): +  # Type +  def process_batch(self, batch: np.ndarray) -> Iterator[np.ndarray]: +    yield batch * 2 + +  # Declare what the element-wise output type is +  def infer_output_type(self, input_element_type): +    return input_element_type +{{< /highlight >}} + +{{< paragraph class="language-py" >}} +This DoFn can be used in a Beam pipeline that otherwise operates on individual +elements. 
Beam will implicitly buffer elements and create numpy arrays on the +input side, and on the output side it will explode the numpy arrays back into +individual elements: +{{< /paragraph >}} + +{{< highlight py >}} +(p | beam.Create([1, 2, 3, 4]).with_output_types(np.int64) +   | beam.ParDo(MultiplyByTwo()) # Implicit buffering and batch creation +   | beam.Map(lambda x: x/3))  # Implicit batch explosion +{{< /highlight >}} + +{{< paragraph class="language-py" >}} +Note that we use +[`PTransform.with_output_types`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.ptransform.html#apache_beam.transforms.ptransform.PTransform.with_output_types) to +set the _element-wise_ typehint for the output of `beam.Create`. Then, when +`MultiplyByTwo` is applied to this `PCollection`, Beam recognizes that +`np.ndarray` is an acceptable batch type to use in conjunction with `np.int64` +elements. We will use numpy typehints like these throughout this guide, but +Beam supports typehints from other libraries as well, see [Supported Batch +Types](#batched-dofn-types). +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +In the previous case, Beam will implicitly create and explode batches at the +input and output boundaries. However, if Batched DoFns with equivalent types are +chained together, this batch creation and explosion will be elided. The batches +will be passed straight through! This makes it much simpler to efficiently +compose transforms that operate on batches. +{{< /paragraph >}} + +{{< highlight py >}} +(p | beam.Create([1, 2, 3, 4]).with_output_types(np.int64) +   | beam.ParDo(MultiplyByTwo()) # Implicit buffering and batch creation +   | beam.ParDo(MultiplyByTwo()) # Batches passed through +   | beam.ParDo(MultiplyByTwo())) +{{< /highlight >}} + +### 14.2 Element-wise Fallback {#batched-dofn-elementwise} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. 
+{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +For some DoFns you may be able to provide both a batched and an element-wise +implementation of your desired logic. You can do this by simply defining both +`process` and `process_batch`: +{{< /paragraph >}} + +{{< highlight py >}} +class MultiplyByTwo(beam.DoFn): +  def process(self, element: np.int64) -> Iterator[np.int64]: + # Multiply an individual int64 by 2 +    yield element * 2 + +  def process_batch(self, batch: np.ndarray) -> Iterator[np.ndarray]: + # Multiply a _batch_ of int64s by 2 +    yield batch * 2 +{{< /highlight >}} + +{{< paragraph class="language-py" >}} +When executing this DoFn, Beam will select the best implementation to use given +the context. Generally, if the inputs to a DoFn are already batched Beam will +use the batched implementation; otherwise it will use the element-wise +implementation defined in the `process` method. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +Note that, in this case, there is no need to define `infer_output_type`. This is +because Beam can get the output type from the typehint on `process`. +{{< /paragraph >}} + + + +### 14.3 Batch Production vs. Batch Consumption {#batched-dofn-batch-production} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +By convention, Beam assumes that the `process_batch` method, which consumes +batched inputs, will also produce batched outputs. Similarly, Beam assumes the +`process` method will produce individual elements. This can be overridden with 
This can be overridden with +the [`@beam.DoFn.yields_elements`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.yields_elements) and +[`@beam.DoFn.yields_batches`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.yields_batches) decorators. For example: +{{< /paragraph >}} + +{{< highlight py >}} +# Consumes elements, produces batches +class ReadFromFile(beam.DoFn): + +  @beam.DoFn.yields_batches +  def process(self, path: str) -> Iterator[np.ndarray]: +    ... +    yield array +   + +  # Declare what the element-wise output type is +  def infer_output_type(self): +    return np.int64 + +# Consumes batches, produces elements +class WriteToFile(beam.DoFn): +  @beam.DoFn.yields_elements +  def process_batch(self, batch: np.ndarray) -> Iterator[str]: +    ... +    yield output_path +{{< /highlight >}} + +### 14.4 Supported Batch Types {#batched-dofn-types} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +We’ve used numpy types in the Batched DoFn implementations in this guide – +`np.int64 ` as the element typehint and `np.ndarray` as the corresponding +batch typehint – but Beam supports typehints from other libraries as well. +{{< /paragraph >}} + +#### [numpy](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/typehints/batch.py) +| Element Typehint | Batch Typehint | +| ---------------- | -------------- | +| Numeric types (`int`, `np.int32`, `bool`, ...) | np.ndarray (or NumpyArray) | + +#### [pandas](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/typehints/pandas_type_compatibility.py) +| Element Typehint | Batch Typehint | +| ---------------- | -------------- | +| Numeric types (`int`, `np.int32`, `bool`, ...) 
| `pd.Series` | +| `bytes` | | +| `Any` | | +| [Beam Schema Types](#schemas) | `pd.DataFrame` | + +#### Other types? +If there are other batch types you would like to use with Batched DoFns, please +[file an issue](https://github.com/apache/beam/issues/new/choose). + +### 14.5 Dynamic Batch Input and Output Types {#batched-dofn-dynamic-types} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. +{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +For some Batched DoFns, it may not be sufficient to declare batch types +statically, with typehints on `process` and/or `process_batch`. You may need to +declare these types dynamically. You can do this by overriding the +[`get_input_batch_type`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.get_input_batch_type) +and +[`get_output_batch_type`](https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.core.html#apache_beam.transforms.core.DoFn.get_output_batch_type) +methods on your DoFn: +{{< /paragraph >}} + +{{< highlight py >}} +# Utilize Beam's parameterized NumpyArray typehint +from apache_beam.typehints.batch import NumpyArray + +class MultiplyByTwo(beam.DoFn): + # No typehints needed +  def process_batch(self, batch): +    yield batch * 2 + +  def get_input_batch_type(self, input_element_type): +    return NumpyArray[input_element_type] + +  def get_output_batch_type(self, input_element_type): +    return NumpyArray[input_element_type] + +  def infer_output_type(self, input_element_type): +    return input_element_type +{{< /highlight >}} + +### 14.6 Batches and Event-time Semantics {#batched-dofn-event-time} +{{< paragraph class="language-go language-java language-typescript" >}} +Batched DoFns are currently a Python-only feature. 
+{{< /paragraph >}} + +{{< paragraph class="language-py" >}} +Currently, batches must have a single set of timing information (event time, +windows, etc...) that applies to every logical element in the batch. There is +currently no mechanism to create batches that span multiple timestamps. However, +it is possible to retrieve this timing information in Batched DoFn +implementations. This information can be accessed by using the conventional +`DoFn.*Param` attributes: +{{< /paragraph >}} + +{{< highlight py >}} +class RetrieveTimingDoFn(beam.DoFn): + +  def process_batch( +    self, +    batch: np.ndarray, +    timestamp=beam.DoFn.TimestampParam, +    pane_info=beam.DoFn.PaneInfoParam, +   ) -> Iterator[np.ndarray]: +     ... + +  def infer_output_type(self, input_type): +    return input_type +{{< /highlight >}} diff --git a/website/www/site/content/en/documentation/runners/spark.md b/website/www/site/content/en/documentation/runners/spark.md index ff6fa3cc47a8..b7283f0cbe1b 100644 --- a/website/www/site/content/en/documentation/runners/spark.md +++ b/website/www/site/content/en/documentation/runners/spark.md @@ -293,7 +293,7 @@ python -m apache_beam.examples.wordcount \ - `--runner`(required): `SparkRunner`. - `--output_executable_path`(required): path for the bundle jar to be created. - `--output`(required): where output shall be written. -- `--spark_version`(optional): select spark version 2 (default) or 3. +- `--spark_version`(optional): select spark version 3 (default) or 2 (deprecated!). 5. Submit spark job to Dataproc cluster's master node. 
diff --git a/website/www/site/content/en/documentation/sdks/java-multi-language-pipelines.md b/website/www/site/content/en/documentation/sdks/java-multi-language-pipelines.md index 5f1b971f2046..fe1fba52d17f 100644 --- a/website/www/site/content/en/documentation/sdks/java-multi-language-pipelines.md +++ b/website/www/site/content/en/documentation/sdks/java-multi-language-pipelines.md @@ -138,26 +138,27 @@ default Beam SDK, you might need to run your own expansion service. In such cases, [start the expansion service](#advanced-start-an-expansion-service) before running your pipeline. -Here we've provided commands for running the example pipeline using -Gradle on a [Beam HEAD Git clone](https://github.com/apache/beam). -If you need a more stable environment, please -[setup a Java project](/get-started/quickstart-java/) that uses the latest -released Beam version and include the necessary dependencies. +### Run with Dataflow runner at HEAD (Beam 2.41.0 and later) -### Run with Dataflow runner +> **Note:** Due to [issue#23717](https://github.com/apache/beam/issues/23717), +> Beam 2.42.0 requires manually starting up an expansion service (see +> [these instructions](https://beam.apache.org/documentation/sdks/java-multi-language-pipelines/#advanced-start-an-expansion-service)) +> and using the additional pipeline option `--expansionService=localhost:` +> when executing the pipeline. The following script runs the example multi-language pipeline on Dataflow, using example text from a Cloud Storage bucket. You’ll need to adapt the script to your environment. 
``` +export GCP_PROJECT= export OUTPUT_BUCKET= export GCP_REGION= export TEMP_LOCATION=gs://$OUTPUT_BUCKET/tmp -export PYTHON_VERSION= ./gradlew :examples:multi-language:pythonDataframeWordCount --args=" \ --runner=DataflowRunner \ +--project=$GCP_PROJECT \ --output=gs://${OUTPUT_BUCKET}/count \ --region=${GCP_REGION}" ``` @@ -187,15 +188,20 @@ python -m apache_beam.runners.portability.local_job_service_main -p $JOB_SERVER_ (this guide requires that your JAVA_HOME is set to Java 11). ``` -./gradlew :sdks:java:container:java11:docker +./gradlew :sdks:java:container:java11:docker -Pjava11Home=$JAVA_HOME ``` 5. Run the pipeline. +> **Note:** Due to [issue#23717](https://github.com/apache/beam/issues/23717), +> Beam 2.42.0 requires manually starting up an expansion service (see +> [these instructions](https://beam.apache.org/documentation/sdks/java-multi-language-pipelines/#advanced-start-an-expansion-service)) +> and using the additional pipeline option `--expansionService=localhost:` +> when executing the pipeline. + ``` export JOB_SERVER_PORT= # Same port as before export OUTPUT_FILE= -export PYTHON_VERSION= ./gradlew :examples:multi-language:pythonDataframeWordCount --args=" \ --runner=PortableRunner \ @@ -226,19 +232,64 @@ For example, to start the standard expansion service for a Python transform, [ExpansionServiceServicer](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/runners/portability/expansion_service.py), follow these steps: -1. Activate a Python virtual environment and install Apache Beam, as described - in the [Python quick start](/get-started/quickstart-py/). -2. In the **beam/sdks/python** directory of the Beam source code, run the - following command: +1. Activate a new virtual environment following +[these instructions](https://beam.apache.org/get-started/quickstart-py/#create-and-activate-a-virtual-environment). + +2. Install Apache Beam with `gcp` and `dataframe` packages. 
+ +``` +pip install apache-beam[gcp,dataframe] +``` - ``` - python apache_beam/runners/portability/expansion_service_main.py -p 18089 --fully_qualified_name_glob "*" - ``` +3. Run the following command: + +``` +python -m apache_beam.runners.portability.expansion_service_main -p <PORT> --fully_qualified_name_glob "*" +``` The command runs [expansion_service_main.py](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/runners/portability/expansion_service_main.py), which starts the standard expansion service. When you use Gradle to run your Java pipeline, you can specify the expansion service with the -`expansionService` option. For example: `--expansionService=localhost:18089`. +`expansionService` option. For example: `--expansionService=localhost:<PORT>`. + +### Run with Dataflow runner using a Beam release (Beam 2.43.0 and later) + +> **Note:** Due to [issue#23717](https://github.com/apache/beam/issues/23717), +> Beam 2.42.0 requires manually starting up an expansion service (see +> [these instructions](https://beam.apache.org/documentation/sdks/java-multi-language-pipelines/#advanced-start-an-expansion-service)) +> and using the additional pipeline option `--expansionService=localhost:<PORT>` +> when executing the pipeline. + +* Check out the Beam examples Maven archetype for the relevant Beam version. + +``` +export BEAM_VERSION= + +mvn archetype:generate \ + -DarchetypeGroupId=org.apache.beam \ + -DarchetypeArtifactId=beam-sdks-java-maven-archetypes-examples \ + -DarchetypeVersion=$BEAM_VERSION \ + -DgroupId=org.example \ + -DartifactId=multi-language-beam \ + -Dversion="0.1" \ + -Dpackage=org.apache.beam.examples \ + -DinteractiveMode=false +``` + +* Run the pipeline. 
+ +``` +export GCP_PROJECT= +export GCP_BUCKET= +export GCP_REGION= + +mvn compile exec:java -Dexec.mainClass=org.apache.beam.examples.multilanguage.PythonDataframeWordCount \ + -Dexec.args="--runner=DataflowRunner --project=$GCP_PROJECT \ + --region=$GCP_REGION \ + --gcpTempLocation=gs://$GCP_BUCKET/multi-language-beam/tmp \ + --output=gs://$GCP_BUCKET/multi-language-beam/output" \ + -Pdataflow-runner +``` ## Next steps diff --git a/website/www/site/content/en/documentation/sdks/java/testing/nexmark.md b/website/www/site/content/en/documentation/sdks/java/testing/nexmark.md index da5378034d8e..74ca4f2caaaa 100644 --- a/website/www/site/content/en/documentation/sdks/java/testing/nexmark.md +++ b/website/www/site/content/en/documentation/sdks/java/testing/nexmark.md @@ -494,7 +494,7 @@ configure logging. Batch Mode: ./gradlew :sdks:java:testing:nexmark:run \ - -Pnexmark.runner=":runners:spark:2" \ + -Pnexmark.runner=":runners:spark:3" \ -Pnexmark.args=" --runner=SparkRunner --suite=SMOKE @@ -506,7 +506,7 @@ Batch Mode: Streaming Mode: ./gradlew :sdks:java:testing:nexmark:run \ - -Pnexmark.runner=":runners:spark:2" \ + -Pnexmark.runner=":runners:spark:3" \ -Pnexmark.args=" --runner=SparkRunner --suite=SMOKE diff --git a/website/www/site/content/en/documentation/sdks/python-machine-learning.md b/website/www/site/content/en/documentation/sdks/python-machine-learning.md index b899e5149642..b35ab347a8b9 100644 --- a/website/www/site/content/en/documentation/sdks/python-machine-learning.md +++ b/website/www/site/content/en/documentation/sdks/python-machine-learning.md @@ -77,6 +77,9 @@ You need to provide a path to a file that contains the model's saved weights. Th 1. Download the pre-trained weights and host them in a location that the pipeline can access. 2. Pass the path of the model weights to the PyTorch `ModelHandler` by using the following code: `state_dict_path=`. 
+See [this notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch.ipynb) +that illustrates running PyTorch models with Apache Beam. + #### Scikit-learn You need to provide a path to a file that contains the pickled Scikit-learn model. This path must be accessible by the pipeline. To use pre-trained models with the RunInference API and the Scikit-learn framework, complete the following steps: @@ -86,6 +89,9 @@ You need to provide a path to a file that contains the pickled Scikit-learn mode `model_uri=` and `model_file_type: `, where you can specify `ModelFileType.PICKLE` or `ModelFileType.JOBLIB`, depending on how the model was serialized. +See [this notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_sklearn.ipynb) +that illustrates running Scikit-learn models with Apache Beam. + #### TensorFlow To use TensorFlow with the RunInference API, you need to do the following: @@ -94,48 +100,8 @@ To use TensorFlow with the RunInference API, you need to do the following: * Create a model handler using `tfx_bsl.public.beam.run_inference.CreateModelHandler()`. * Use the model handler with the [`apache_beam.ml.inference.base.RunInference`](/releases/pydoc/current/apache_beam.ml.inference.base.html) transform. 
-A sample pipeline might look like the following example: - -``` -import apache_beam as beam -from apache_beam.ml.inference.base import RunInference -from tensorflow_serving.apis import prediction_log_pb2 -from tfx_bsl.public.proto import model_spec_pb2 -from tfx_bsl.public.tfxio import TFExampleRecord -from tfx_bsl.public.beam.run_inference import CreateModelHandler - -pipeline = beam.Pipeline() -tfexample_beam_record = TFExampleRecord(file_pattern='/path/to/examples') -saved_model_spec = model_spec_pb2.SavedModelSpec(model_path='/path/to/model') -inference_spec_type = model_spec_pb2.InferenceSpecType(saved_model_spec=saved_model_spec) -model_handler = CreateModelHandler(inference_spec_type) -with pipeline as p: - _ = (p | tfexample_beam_record.RawRecordBeamSource() - | RunInference(model_handler) - | beam.Map(print) - ) -``` - -Note: A model handler that is created with `CreateModelHander()` is always unkeyed. - -### Keyed Model Handlers -To make a keyed model handler, wrap any unkeyed model handler in the keyed model handler. For example: - -``` -from apache_beam.ml.inference.base import RunInference -from apache_beam.ml.inference.base import KeyedModelHandler -model_handler = -keyed_model_handler = KeyedModelHandler(model_handler) - -with pipeline as p: - p | ( - RunInference(keyed_model_handler) - ) -``` - -If you are unsure if your data is keyed, you can also use `MaybeKeyedModelHandler`. - -For more information, see [`KeyedModelHander`](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.KeyedModelHandler). +See [this notebook](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_tensorflow.ipynb) +that illustrates running TensorFlow models with Apache Beam and tfx-bsl. 
### Use custom models @@ -209,6 +175,10 @@ with pipeline as p: predictions = data | RunInference(keyed_model_handler) ``` +If you are unsure if your data is keyed, you can also use `MaybeKeyedModelHandler`. + +For more information, see [`KeyedModelHandler`](https://beam.apache.org/releases/pydoc/current/apache_beam.ml.inference.base.html#apache_beam.ml.inference.base.KeyedModelHandler). + ### Use the PredictionResults object When doing a prediction in Apache Beam, the output `PCollection` includes both the keys of the input examples and the inferences. Including both these items in the output allows you to find the input that determined the predictions. diff --git a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md index bf2e44e55866..330a8af8e449 100644 --- a/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md +++ b/website/www/site/content/en/documentation/sdks/python-pipeline-dependencies.md @@ -36,7 +36,7 @@ If your pipeline uses public packages from the [Python Package Index](https://py This command creates a `requirements.txt` file that lists all packages that are installed on your machine, regardless of where they were installed from. -2. Edit the `requirements.txt` file and leave only the packages that were installed from PyPI and are used in the workflow source. Delete all packages that are not relevant to your code. +2. Edit the `requirements.txt` file and delete all packages that are not relevant to your code. 3. Run your pipeline with the following command-line option: @@ -44,7 +44,6 @@ If your pipeline uses public packages from the [Python Package Index](https://py The runner will use the `requirements.txt` file to install your additional dependencies onto the remote workers. -**Important:** Remote workers will install all packages listed in the `requirements.txt` file. 
Because of this, it's very important that you delete non-PyPI packages from the `requirements.txt` file, as stated in step 2. If you don't remove non-PyPI packages, the remote workers will fail when attempting to install packages from sources that are unknown to them. > **NOTE**: An alternative to `pip freeze` is to use a library like [pip-tools](https://github.com/jazzband/pip-tools) to compile all the dependencies required for the pipeline from a `--requirements_file`, where only top-level dependencies are mentioned. ## Custom Containers {#custom-containers} diff --git a/website/www/site/content/en/get-started/downloads.md b/website/www/site/content/en/get-started/downloads.md index 522ad829a07f..20eb6d6d1b03 100644 --- a/website/www/site/content/en/get-started/downloads.md +++ b/website/www/site/content/en/get-started/downloads.md @@ -53,6 +53,14 @@ Additionally, you may want to depend on additional SDK modules, such as IO connectors or other extensions, and additional runners to execute your pipeline at scale. +The Go SDK is accessible via Go Modules and calling `go get` from a module subdirectory: + + go get github.com/apache/beam/sdks/v2/go/pkg/beam + +Specific versions can be depended on similarly: + + go get github.com/apache/beam/sdks/v2/go/pkg/beam@v{{< param release_latest >}} + ## Downloading source code You can download the source code package for a release from the links in the diff --git a/website/www/site/layouts/case-studies/list.html b/website/www/site/layouts/case-studies/list.html index c1957847a3ca..a67079bd06ad 100644 --- a/website/www/site/layouts/case-studies/list.html +++ b/website/www/site/layouts/case-studies/list.html @@ -54,14 +54,22 @@

    {{ .Params.cardTitle }}

    Also used by

    {{ range where $pages "Params.category" "ne" "study" }} -
    -
    - -
    -
    - {{ .Params.cardDescription | safeHTML }} + {{ if .Params.hasLink }} + +
    + +
    +
    + {{ else }} +
    +
    + +
    +
    + {{ .Params.cardDescription | safeHTML }} +
    -
    + {{ end }} {{ end }}
    @@ -71,4 +79,8 @@

    Also used by

    + +{{ $shuffle := resources.Get "js/shuffle-elements.js" | minify | fingerprint }} + + {{ end }} diff --git a/website/www/site/layouts/index.html b/website/www/site/layouts/index.html index 396b0dbac839..6ada1652bdfd 100644 --- a/website/www/site/layouts/index.html +++ b/website/www/site/layouts/index.html @@ -104,7 +104,7 @@

    - @@ -148,8 +148,8 @@

    -
    - +
    +
    diff --git a/website/www/site/layouts/partials/header.html b/website/www/site/layouts/partials/header.html index 328396b35750..76735af60da9 100644 --- a/website/www/site/layouts/partials/header.html +++ b/website/www/site/layouts/partials/header.html @@ -16,7 +16,7 @@ Brand - {{ T "nav-get-started" }} + {{ T "nav-get-started" }} {{ T "nav-documentation" }}

  • - {{ T "nav-get-started" }} + {{ T "nav-get-started" }}
  • Documentation @@ -125,7 +125,7 @@
  • Runner Support
  • +
  • Batched DoFns
  • @@ -214,9 +215,11 @@
  • diff --git a/website/www/site/static/images/logos/powered-by/Amazon.png b/website/www/site/static/images/logos/powered-by/Amazon.png new file mode 100644 index 000000000000..7ff122bf2f5e Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Amazon.png differ diff --git a/website/www/site/static/images/logos/powered-by/ML6.jpg b/website/www/site/static/images/logos/powered-by/ML6.jpg new file mode 100644 index 000000000000..49062000537e Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/ML6.jpg differ diff --git a/website/www/site/static/images/logos/powered-by/Strivr.png b/website/www/site/static/images/logos/powered-by/Strivr.png new file mode 100644 index 000000000000..cd9c99347564 Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Strivr.png differ diff --git a/website/www/site/static/images/logos/powered-by/Trustpilot.png b/website/www/site/static/images/logos/powered-by/Trustpilot.png new file mode 100644 index 000000000000..62703ac96202 Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Trustpilot.png differ diff --git a/website/www/site/static/images/logos/powered-by/Twitter.png b/website/www/site/static/images/logos/powered-by/Twitter.png new file mode 100644 index 000000000000..4ca58962cfeb Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Twitter.png differ diff --git a/website/www/site/static/images/logos/powered-by/Wayfair.png b/website/www/site/static/images/logos/powered-by/Wayfair.png new file mode 100644 index 000000000000..62745a08b75c Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Wayfair.png differ diff --git a/website/www/site/static/images/logos/powered-by/Wizeline.png b/website/www/site/static/images/logos/powered-by/Wizeline.png new file mode 100644 index 000000000000..16045d207da4 Binary files /dev/null and b/website/www/site/static/images/logos/powered-by/Wizeline.png differ diff 
--git a/website/www/site/static/images/orchestrated-beam-pipeline.svg b/website/www/site/static/images/orchestrated-beam-pipeline.svg new file mode 100644 index 000000000000..7270c6df081b --- /dev/null +++ b/website/www/site/static/images/orchestrated-beam-pipeline.svg @@ -0,0 +1,35 @@ + + + + + + + + + Beam DAGLoad DataTransformTransformTransformTransformTransformTransformData sourcesTrain/Val DatasetTest DatasetScrape DataTrain ML ModelEvaluate ML ModelOrchestrating DAG \ No newline at end of file diff --git a/website/www/site/static/images/runinference_metrics_snapshot.svg b/website/www/site/static/images/runinference_metrics_snapshot.svg new file mode 100644 index 000000000000..a1b41b2c2084 --- /dev/null +++ b/website/www/site/static/images/runinference_metrics_snapshot.svg @@ -0,0 +1,4751 @@ + + + + + diff --git a/website/www/site/static/images/standalone-beam-pipeline.svg b/website/www/site/static/images/standalone-beam-pipeline.svg new file mode 100644 index 000000000000..325b5be3d3e0 --- /dev/null +++ b/website/www/site/static/images/standalone-beam-pipeline.svg @@ -0,0 +1,35 @@ + + + + + + + + + Beam DAGLoad DataTransformTransformTransformTransformTransformTransformData sourcesOutputOutput \ No newline at end of file