Skip to content

Commit

Permalink
Merge branch 'akvelon:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
ruslan-ikhsan authored Nov 30, 2022
2 parents 84a73af + 37fb90c commit de51e1e
Show file tree
Hide file tree
Showing 227 changed files with 27,438 additions and 2,208 deletions.
104 changes: 104 additions & 0 deletions .github/workflows/dask_runner_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# To learn more about GitHub Actions in Apache Beam check the CI.md

# Runs the Dask runner unit-test suite nightly and on pull requests that
# touch the Dask runner sources.
name: Dask Runner Tests

on:
  schedule:
    - cron: '3 7 * * *'
  pull_request:
    branches: ['master', 'release-*']
    # NOTE(review): the original workflow declared `tags: 'v*'` here, but
    # `tags`/`tags-ignore` filters are only valid for `push` events and are
    # ignored under `pull_request`, so the filter has been removed.
    paths: ['sdks/python/apache_beam/runners/dask/**']

# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
  group: '${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
  cancel-in-progress: true

jobs:

  build_python_sdk_source:
    name: 'Build python source distribution'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Install python
        uses: actions/setup-python@v4
        with:
          # Quoted so the version is read as the string "3.7", not a float.
          python-version: '3.7'
      - name: Get build dependencies
        working-directory: ./sdks/python
        run: pip install pip setuptools --upgrade && pip install -r build-requirements.txt
      - name: Build source
        working-directory: ./sdks/python
        run: python setup.py sdist
      - name: Rename source file
        working-directory: ./sdks/python/dist
        run: mv $(ls | grep "apache-beam.*tar\.gz") apache-beam-source.tar.gz
      - name: Upload compressed sources as artifacts
        uses: actions/upload-artifact@v3
        with:
          name: python_sdk_source
          path: sdks/python/dist/apache-beam-source.tar.gz

  python_unit_tests:
    name: 'Python Unit Tests'
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        # Python versions are quoted strings: an unquoted 3.10 would be
        # parsed as the float 3.1.
        params: [
          {"py_ver": "3.7", "tox_env": "py37"},
          {"py_ver": "3.8", "tox_env": "py38"},
          {"py_ver": "3.9", "tox_env": "py39"},
          {"py_ver": "3.10", "tox_env": "py310"},
        ]
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Install python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.params.py_ver }}
      - name: Get build dependencies
        working-directory: ./sdks/python
        run: pip install -r build-requirements.txt
      - name: Install tox
        run: pip install tox
      - name: Install SDK with dask
        working-directory: ./sdks/python
        run: pip install setuptools --upgrade && pip install -e .[gcp,dask,test]
      - name: Run tests basic unix
        if: startsWith(matrix.os, 'ubuntu') || startsWith(matrix.os, 'macos')
        working-directory: ./sdks/python
        run: tox -c tox.ini -e ${{ matrix.params.tox_env }}-dask
      - name: Run tests basic windows
        if: startsWith(matrix.os, 'windows')
        working-directory: ./sdks/python
        run: tox -c tox.ini -e ${{ matrix.params.tox_env }}-win-dask
      - name: Upload test logs
        uses: actions/upload-artifact@v3
        if: always()
        with:
          name: pytest-${{matrix.os}}-${{matrix.params.py_ver}}
          path: sdks/python/pytest**.xml

6 changes: 3 additions & 3 deletions .github/workflows/tour_of_beam_examples_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
name: Tour Of Beam Examples CI

on:
push:
pull_request:
paths:
- ./.github/workflows/playground_examples_ci_reusable.yml
- ./.github/workflows/tour_of_beam_examples_ci.yml
- .github/workflows/playground_examples_ci_reusable.yml
- .github/workflows/tour_of_beam_examples_ci.yml
- playground/backend/**
- playground/infrastructure/**
- learning/tour-of-beam/learning-content/**
Expand Down
7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,12 @@ website/www/yarn-error.log
**/node_modules

# Dart/Flutter
**/.dart_tool
**/.flutter-plugins
**/.flutter-plugins-dependencies
**/.dart_tool
**/generated_plugin_registrant.dart
**/*.g.dart
**/*.mocks.dart
**/.packages
**/generated_plugin_registrant.dart
playground/frontend/playground_components/pubspec.lock

# Ignore Beam Playground Terraform
**/.terraform
Expand Down
30 changes: 20 additions & 10 deletions .test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,13 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-imagenet-python' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 75,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_inference_imagenet_results_resnet101',
Expand All @@ -47,7 +50,7 @@ def loadTestConfigurations = {
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
pretrained_model_name : 'resnet101',
input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet101.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_101' + now + '.txt'
]
Expand All @@ -60,10 +63,13 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-imagenet-python' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 75,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_inference_imagenet_results_resnet152',
Expand All @@ -72,7 +78,7 @@ def loadTestConfigurations = {
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
pretrained_model_name : 'resnet152',
input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152' + now + '.txt'
]
Expand All @@ -86,19 +92,21 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-language-modeling-bert-base-uncased' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 250,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
pickle_library : 'cloudpickle',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_language_modeling_bert_base_uncased',
input_options : '{}', // this option is not required for RunInference tests.
influx_measurement : 'torch_language_modeling_bert_base_uncased',
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
bert_tokenizer : 'bert-base-uncased',
model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_base_uncased' + now + '.txt',
Expand All @@ -112,19 +120,21 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-language-modeling-bert-large-cased' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 250,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
pickle_library : 'cloudpickle',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_language_modeling_bert_large_uncased',
input_options : '{}', // this option is not required for RunInference tests.
influx_measurement : 'torch_language_modeling_bert_large_uncased',
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
bert_tokenizer : 'bert-large-uncased',
model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-large-uncased.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased' + now + '.txt'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ PostcommitJobBuilder.postCommitJob('beam_PostCommit_Python_Examples_Flink',
gradle {
rootBuildScriptDir(commonJobProperties.checkoutDir)
tasks(":sdks:python:test-suites:portable:flinkExamplesPostCommit")
switches("-PflinkConfDir=$WORKSPACE/src/runners/flink/src/test/resources")
commonJobProperties.setGradleSwitches(delegate)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1610,7 +1610,9 @@
"refresh": "",
"schemaVersion": 30,
"style": "dark",
"tags": [],
"tags": [
"performance tests"
],
"templating": {
"list": []
},
Expand All @@ -1636,5 +1638,5 @@
"timezone": "",
"title": "Java JMH benchmarks",
"uid": "kllfR2vVk",
"version": 10
"version": 11
}
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_resnet101",
"alias": "mean_inference_batch_latency_resnet101",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -462,7 +462,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_resnet_152",
"alias": "mean_inference_batch_latency_resnet_152",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -593,7 +593,7 @@
"steppedLine": false,
"targets": [
{
"alias": "$mean_inference_batch_latency_bert_base_uncased",
"alias": "mean_inference_batch_latency_bert_base_uncased",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -762,7 +762,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_resnet101",
"alias": "mean_load_model_latency_resnet101",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -800,7 +800,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_resnet_152",
"alias": "mean_load_model_latency_resnet_152",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -931,7 +931,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_bert_base_uncased",
"alias": "mean_load_model_latency_bert_base_uncased",
"groupBy": [
{
"params": [
Expand Down Expand Up @@ -969,7 +969,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_bert_large_uncased",
"alias": "mean_load_model_latency_bert_large_uncased",
"groupBy": [
{
"params": [
Expand Down
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,12 @@
* Support for Bigtable sink (Write and WriteBatch) added (Go) ([#23324](https://github.com/apache/beam/issues/23324)).
* S3 implementation of the Beam filesystem (Go) ([#23991](https://github.com/apache/beam/issues/23991)).
* Support for SingleStoreDB source and sink added (Java) ([#22617](https://github.com/apache/beam/issues/22617)).
* Added support for DefaultAzureCredential authentication in Azure Filesystem (Python) ([#24210](https://github.com/apache/beam/issues/24210)).

## New Features / Improvements

* Beam now provides a portable "runner" that can render pipeline graphs with
graphviz. See `python -m apache_beam.runners.render --help` for more details.
* Local packages can now be used as dependencies in the requirements.txt file, rather
than requiring them to be passed separately via the `--extra_package` option
(Python) ([#23684](https://github.com/apache/beam/pull/23684)).
Expand All @@ -78,6 +81,8 @@
dependency of the Java SDK Harness. Some users of a portable runner (such as Dataflow Runner v2)
may have an undeclared dependency on this package (for example using GCS with
TextIO) and will now need to declare the dependency.
* `beam-sdks-java-core` is no longer a dependency of the Java SDK Harness. Users of a portable
runner (such as Dataflow Runner v2) will need to provide this package and its dependencies.

## Deprecations

Expand Down
12 changes: 12 additions & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,25 @@ tasks.rat {
"sdks/java/maven-archetypes/examples/sample.txt",

// Ignore Flutter autogenerated files for Playground
"playground/frontend/**/*.g.dart",
"playground/frontend/**/*.g.yaml",
"playground/frontend/**/*.gen.dart",
"playground/frontend/**/*.golden.yaml",
"playground/frontend/**/*.mocks.dart",
"playground/frontend/.metadata",
"playground/frontend/pubspec.lock",

// Ignore Flutter autogenerated files for Playground Components
"playground/frontend/**/*.pb.dart",
"playground/frontend/**/*.pbenum.dart",
"playground/frontend/**/*.pbgrpc.dart",
"playground/frontend/**/*.pbjson.dart",
"playground/frontend/playground_components/.metadata",
"playground/frontend/playground_components/pubspec.lock",

// Ignore Flutter autogenerated files for Tour of Beam
"learning/tour-of-beam/frontend/**/*.g.dart",
"learning/tour-of-beam/frontend/**/*.gen.dart",
"learning/tour-of-beam/frontend/.metadata",
"learning/tour-of-beam/frontend/pubspec.lock",

Expand Down Expand Up @@ -340,6 +351,7 @@ tasks.register("python37PostCommit") {
dependsOn(":sdks:python:test-suites:direct:py37:postCommitIT")
dependsOn(":sdks:python:test-suites:direct:py37:directRunnerIT")
dependsOn(":sdks:python:test-suites:direct:py37:hdfsIntegrationTest")
dependsOn(":sdks:python:test-suites:direct:py37:azureIntegrationTest")
dependsOn(":sdks:python:test-suites:direct:py37:mongodbioIT")
dependsOn(":sdks:python:test-suites:portable:py37:postCommitPy37")
dependsOn(":sdks:python:test-suites:dataflow:py37:spannerioIT")
Expand Down
2 changes: 1 addition & 1 deletion buildSrc/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ dependencies {
runtimeOnly("com.avast.gradle:gradle-docker-compose-plugin:0.14.12") // Enable docker compose tasks
runtimeOnly("ca.cutterslade.gradle:gradle-dependency-analyze:1.8.3") // Enable dep analysis
runtimeOnly("gradle.plugin.net.ossindex:ossindex-gradle-plugin:0.4.11") // Enable dep vulnerability analysis
runtimeOnly("org.checkerframework:checkerframework-gradle-plugin:0.5.16") // Enable enhanced static checking plugin
runtimeOnly("org.checkerframework:checkerframework-gradle-plugin:0.6.19") // Enable enhanced static checking plugin
}

// Because buildSrc is built and tested automatically _before_ gradle
Expand Down
Loading

0 comments on commit de51e1e

Please sign in to comment.