From 69df89421164ce5015f96c8605587bdcb761d024 Mon Sep 17 00:00:00 2001 From: Chris Neffshade Date: Wed, 13 Sep 2023 10:56:25 +0000 Subject: [PATCH] strings --- .../arc/environments/beam.env | 4 +- .github/workflows/README.md | 12 ++ .../workflows/beam_PreCommit_SQL_Java11.yml | 8 +- .github/workflows/beam_PreCommit_Spotless.yml | 9 +- .../workflows/beam_PreCommit_Whitespace.yml | 4 +- .../jenkins/job_PreCommit_PythonDocs.groovy | 33 ----- .../jenkins/job_PreCommit_PythonLint.groovy | 30 ---- .test-infra/jenkins/job_PreCommit_RAT.groovy | 26 ---- .../jenkins/job_PreCommit_Spotless.groovy | 45 ------ .../jenkins/job_PreCommit_Website.groovy | 27 ---- .../job_PreCommit_Website_Stage_GCS.groovy | 31 ---- .../jenkins/job_PreCommit_Whitespace.groovy | 31 ---- CHANGES.md | 5 +- examples/notebooks/beam-ml/README.md | 1 + it/google-cloud-platform/build.gradle | 6 +- .../worker/build.gradle | 135 +++++++++--------- .../windmill/AbstractWindmillStream.java | 17 +-- .../ForwardingClientResponseObserver.java | 14 +- .../windmill/StreamObserverFactory.java | 18 +-- .../worker/windmill/WindmillStream.java | 4 + .../grpcclient/GrpcCommitWorkStream.java | 31 ++-- .../grpcclient/GrpcGetDataStream.java | 41 +++--- .../grpcclient/GrpcGetWorkStream.java | 25 ++-- .../grpcclient/GrpcWindmillServer.java | 40 ++++-- .../windmill/src/main/proto/windmill.proto | 11 +- .../src/main/proto/windmill_service.proto | 2 +- sdks/go.mod | 22 +-- sdks/go.sum | 44 +++--- .../beam/checkstyle/suppressions.xml | 1 + .../apache_beam/dataframe/frame_base.py | 84 +++++++++-- .../apache_beam/dataframe/frame_base_test.py | 67 ++++++++- sdks/python/apache_beam/dataframe/frames.py | 113 ++++++++------- .../apache_beam/dataframe/frames_test.py | 27 +++- .../runners/worker/data_sampler.py | 10 +- .../apache_beam/transforms/ptransform.py | 52 +++++++ .../typehints/trivial_inference.py | 15 +- .../typehints/trivial_inference_test.py | 19 +++ sdks/python/apache_beam/yaml/yaml_provider.py | 76 ++++++++-- .../apache_beam/yaml/yaml_transform_test.py | 30 ++++ .../base_image_requirements_manual.txt | 1 - sdks/python/container/boot.go | 8 +- .../py310/base_image_requirements.txt | 80 ++++------- .../py311/base_image_requirements.txt | 74 ++++------ .../py38/base_image_requirements.txt | 80 ++++------- .../py39/base_image_requirements.txt | 82 ++++------- sdks/python/setup.py | 13 +- .../www/site/content/en/blog/beam-2.50.0.md | 1 + .../content/en/contribute/release-guide.md | 8 +- 48 files changed, 789 insertions(+), 728 deletions(-) delete mode 100644 .test-infra/jenkins/job_PreCommit_PythonDocs.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_PythonLint.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_RAT.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_Spotless.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_Website.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_Website_Stage_GCS.groovy delete mode 100644 .test-infra/jenkins/job_PreCommit_Whitespace.groovy diff --git a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env index bfabf721796c..62a15edf5d6c 100644 --- a/.github/gh-actions-self-hosted-runners/arc/environments/beam.env +++ b/.github/gh-actions-self-hosted-runners/arc/environments/beam.env @@ -34,9 +34,9 @@ main_runner = { runner_image = "us-central1-docker.pkg.dev/apache-beam-testing/beam-github-actions/beam-arc-runner:60d397ecfbd2b10a1929615c70d500eb71a2c053" machine_type = "e2-standard-16" 
min_node_count = "1" - max_node_count = "16" + max_node_count = "24" min_replicas = "1" - max_replicas = "128" + max_replicas = "200" webhook_scaling = true disk_size_gb = 200 requests = { diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 4c894d6ca00a..07332a18e8bd 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -172,12 +172,24 @@ Please note that jobs with matrix need to have matrix element in the comment. Ex | Workflow name | Matrix | Trigger Phrase | Cron Status | |:-------------:|:------:|:--------------:|:-----------:| | [ PostCommit BeamMetrics Publish ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml) | N/A |`Run Beam Metrics Deployment`| [![.github/workflows/beam_PostCommit_BeamMetrics_Publish](https://github.com/apache/beam/actions/workflows/beam_PostCommit_BeamMetrics_Publish.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) +| [ PostCommit TransformService Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml) | N/A |`Run TransformService_Direct PostCommit`| [![.github/workflows/beam_PostCommit_TransformService_Direct](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_TransformService_Direct.yml) | [ PostCommit Go Dataflow ARM](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) | N/A |`Run Go PostCommit Dataflow ARM`| [![.github/workflows/beam_PostCommit_Go_Dataflow_ARM](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_Dataflow_ARM.yml) | +| [ PostCommit Go VR Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml) | N/A |`Run Go Flink ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Flink.yml) | | [ PostCommit Go VR Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml) | N/A |`Run Go Samza ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Samza](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Samza.yml) | +| [ PostCommit Go VR Spark](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml) | N/A |`Run Go Spark ValidatesRunner`| [![.github/workflows/beam_PostCommit_Go_VR_Spark](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Go_VR_Spark.yml) | +| [ PostCommit Java Examples Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml) | N/A |`Run Java examples on Dataflow Java 11`| [![PostCommit Java Examples Dataflow 
Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml) | | [ PostCommit Java Avro Versions ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml) | N/A |`Run Java Avro Versions PostCommit`| [![PostCommit Java Avro Versions](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Avro_Versions.yml) | +| [ PostCommit Java Dataflow V1 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml) | N/A |`Run PostCommit_Java_Dataflow`| [![PostCommit Java Dataflow V1](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV1.yml) | +| [ PostCommit Java Dataflow V2 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml) | N/A |`Run PostCommit_Java_DataflowV2`| [![PostCommit Java Dataflow V2](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_DataflowV2.yml) | | [ PostCommit Java Examples Dataflow ARM ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml) | N/A |`Run Java_Examples_Dataflow_ARM PostCommit`| [![PostCommit Java Examples Dataflow ARM](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_ARM.yml) | | [ PostCommit Java Examples Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml) | N/A |`Run Java examples on Dataflow Java 11`| [![PostCommit Java Examples Dataflow Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java11.yml) | | [ PostCommit Java Examples Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java17.yml) | N/A |`Run Java examples on Dataflow Java 17`| [![PostCommit Java Examples Dataflow Java17](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Examples_Dataflow_Java17.yml) | +| [ PostCommit Java Jpms Dataflow Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml) | N/A |`Run Jpms Dataflow Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java11.yml) | +| [ PostCommit Java Jpms Dataflow Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml) | N/A |`Run Jpms 
Dataflow Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Dataflow_Java17.yml) | +| [ PostCommit Java Jpms Direct Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml) | N/A |`Run Jpms Direct Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java11.yml) | +| [ PostCommit Java Jpms Direct Java17 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml) | N/A |`Run Jpms Direct Java 17 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Direct_Java17](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Direct_Java17.yml) | +| [ PostCommit Java Jpms Flink Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml) | N/A |`Run Jpms Flink Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Flink_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Flink_Java11.yml) | +| [ PostCommit Java Jpms Spark Java11 ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml) | N/A |`Run Jpms Spark Java 11 PostCommit`| [![.github/workflows/beam_PostCommit_Java_Jpms_Spark_Java11](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Java_Jpms_Spark_Java11.yml) | | [ PostCommit Python Examples Dataflow ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml) | N/A |`Run Python Examples_Dataflow`| [![PostCommit Python Examples Dataflow](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Dataflow.yml) | | [ PostCommit Python Examples Direct ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml) | N/A |`Run Python Examples_Direct`| [![PostCommit Python Examples Direct](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Direct.yml) | | [ PostCommit Python Examples Flink ](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml) | N/A |`Run Python Examples_Flink`| [![PostCommit Python Examples Flink](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml/badge.svg?event=schedule)](https://github.com/apache/beam/actions/workflows/beam_PostCommit_Python_Examples_Flink.yml) | diff --git 
a/.github/workflows/beam_PreCommit_SQL_Java11.yml b/.github/workflows/beam_PreCommit_SQL_Java11.yml index fdbb68f940a4..e2c666dafa81 100644 --- a/.github/workflows/beam_PreCommit_SQL_Java11.yml +++ b/.github/workflows/beam_PreCommit_SQL_Java11.yml @@ -43,7 +43,7 @@ env: permissions: actions: write pull-requests: read - checks: read + checks: write contents: read deployments: read id-token: none @@ -115,4 +115,10 @@ jobs: uses: actions/upload-artifact@v3 with: name: SpotBugs Results + path: '**/build/reports/spotbugs/*.html' + - name: Publish SpotBugs Results + uses: jwgmeligmeyling/spotbugs-github-action@v1.2 + if: always() + with: + name: SpotBugs path: '**/build/reports/spotbugs/*.html' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Spotless.yml b/.github/workflows/beam_PreCommit_Spotless.yml index 0300ac07b4c6..a703454a74fe 100644 --- a/.github/workflows/beam_PreCommit_Spotless.yml +++ b/.github/workflows/beam_PreCommit_Spotless.yml @@ -50,7 +50,7 @@ concurrency: permissions: actions: write pull-requests: read - checks: read + checks: write contents: read deployments: read id-token: none @@ -94,4 +94,9 @@ jobs: uses: actions/upload-artifact@v3 with: name: java-code-coverage-report - path: "**/build/reports/checkstyle/*.xml" \ No newline at end of file + path: "**/build/reports/checkstyle/*.xml" + - name: Publish checkstyle check + uses: jwgmeligmeyling/checkstyle-github-action@v1 + if: always() + with: + path: '**/build/reports/checkstyle/*.xml' \ No newline at end of file diff --git a/.github/workflows/beam_PreCommit_Whitespace.yml b/.github/workflows/beam_PreCommit_Whitespace.yml index 4443c1d0d1a0..04705d49928c 100644 --- a/.github/workflows/beam_PreCommit_Whitespace.yml +++ b/.github/workflows/beam_PreCommit_Whitespace.yml @@ -19,10 +19,10 @@ on: push: tags: ['v*'] branches: ['master', 'release-*'] - paths: ['*.md', '*.build.gradle','.github/workflows/beam_PreCommit_Whitespace.yml'] + paths: ['**.md', '**.build.gradle', 'build.gradle.kts', '.github/workflows/beam_PreCommit_Whitespace.yml'] pull_request_target: branches: ['master', 'release-*'] - paths: ['*.md', '*.build.gradle'] + paths: ['**.md', '**.build.gradle', 'build.gradle.kts'] issue_comment: types: [created] schedule: diff --git a/.test-infra/jenkins/job_PreCommit_PythonDocs.groovy b/.test-infra/jenkins/job_PreCommit_PythonDocs.groovy deleted file mode 100644 index 17202263493c..000000000000 --- a/.test-infra/jenkins/job_PreCommit_PythonDocs.groovy +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import PrecommitJobBuilder -import CommonJobProperties as common - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'PythonDocs', - gradleTask: ':pythonDocsPreCommit', - timeoutMins: 30, - triggerPathPatterns: [ - '^sdks/python/.*$', - ] - ) -builder.build { - publishers {} -} diff --git a/.test-infra/jenkins/job_PreCommit_PythonLint.groovy b/.test-infra/jenkins/job_PreCommit_PythonLint.groovy deleted file mode 100644 index 118ca7b412b7..000000000000 --- a/.test-infra/jenkins/job_PreCommit_PythonLint.groovy +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'PythonLint', - gradleTask: ':pythonLintPreCommit', - triggerPathPatterns: [ - '^sdks/python/.*$', - '^release/.*$', - ] - ) -builder.build() diff --git a/.test-infra/jenkins/job_PreCommit_RAT.groovy b/.test-infra/jenkins/job_PreCommit_RAT.groovy deleted file mode 100644 index 613caa9af0de..000000000000 --- a/.test-infra/jenkins/job_PreCommit_RAT.groovy +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'RAT', - gradleTask: ':rat' - ) -builder.build() diff --git a/.test-infra/jenkins/job_PreCommit_Spotless.groovy b/.test-infra/jenkins/job_PreCommit_Spotless.groovy deleted file mode 100644 index a9da1ad5491a..000000000000 --- a/.test-infra/jenkins/job_PreCommit_Spotless.groovy +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Spotless', - gradleTask: 'spotlessCheck checkStyleMain checkStyleTest', - triggerPathPatterns: [ - '^buildSrc/.*$', - '^sdks/java/.*$', - '^runners/.*$', - '^examples/java/.*$', - '^examples/kotlin/.*$', - '^.test-infra/jenkins/.*$', - ] - ) -builder.build { - publishers { - recordIssues { - tools { - checkStyle { - pattern('**/build/reports/checkstyle/*.xml') - } - } - enabledForFailure(true) - } - } -} diff --git a/.test-infra/jenkins/job_PreCommit_Website.groovy b/.test-infra/jenkins/job_PreCommit_Website.groovy deleted file mode 100644 index 73014819ed00..000000000000 --- a/.test-infra/jenkins/job_PreCommit_Website.groovy +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Website', - gradleTask: ':websitePreCommit', - triggerPathPatterns: ['^website/.*$']) -builder.build() - diff --git a/.test-infra/jenkins/job_PreCommit_Website_Stage_GCS.groovy b/.test-infra/jenkins/job_PreCommit_Website_Stage_GCS.groovy deleted file mode 100644 index e2f7202d14eb..000000000000 --- a/.test-infra/jenkins/job_PreCommit_Website_Stage_GCS.groovy +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Website_Stage_GCS', - gradleTask: ':website:stageWebsite', - triggerPathPatterns: ['^website/.*$']) -builder.build { - publishers { - buildDescription(/Website published to (http:\/\/.+\/index.html)/) - } -} - diff --git a/.test-infra/jenkins/job_PreCommit_Whitespace.groovy b/.test-infra/jenkins/job_PreCommit_Whitespace.groovy deleted file mode 100644 index 0221cf72917d..000000000000 --- a/.test-infra/jenkins/job_PreCommit_Whitespace.groovy +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import PrecommitJobBuilder - -PrecommitJobBuilder builder = new PrecommitJobBuilder( - scope: this, - nameBase: 'Whitespace', - gradleTask: ':whitespacePreCommit', - triggerPathPatterns: [ - '.*\\.md$', - '.*build\\.gradle$', - '.*build\\.gradle.kts$', - ] - ) -builder.build() diff --git a/CHANGES.md b/CHANGES.md index e9a3044b6ea9..fb1a14fb1287 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -71,8 +71,8 @@ ## Breaking Changes -* X behavior was changed ([#X](https://github.com/apache/beam/issues/X)). * Removed fastjson library dependency for Beam SQL. Table property is changed to be based on jackson ObjectNode (Java) ([#24154](https://github.com/apache/beam/issues/24154)). +* Removed TensorFlow from Beam Python container images [PR](https://github.com/apache/beam/pull/28424). If you have been negatively affected by this change, please comment on [#20605](https://github.com/apache/beam/issues/20605). ## Deprecations @@ -83,7 +83,7 @@ * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Security Fixes -* Fixed (CVE-YYYY-NNNN)[https://www.cve.org/CVERecord?id=CVE-YYYY-NNNN] (Java/Python/Go) ([#X](https://github.com/apache/beam/issues/X)). +* Python containers updated, fixing [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135) ## Known Issues @@ -146,6 +146,7 @@ * Long-running Python pipelines might experience a memory leak: [#28246](https://github.com/apache/beam/issues/28246). * Python Pipelines using BigQuery IO or `orjson` dependency might experience segmentation faults or get stuck: [#28318](https://github.com/apache/beam/issues/28318). 
+* Beam Python containers rely on a version of Debian/aom that has several security vulnerabilities: [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135) # [2.49.0] - 2023-07-17 diff --git a/examples/notebooks/beam-ml/README.md b/examples/notebooks/beam-ml/README.md index 3a1ff935eb51..77bf3fc99f15 100644 --- a/examples/notebooks/beam-ml/README.md +++ b/examples/notebooks/beam-ml/README.md @@ -54,6 +54,7 @@ This section contains the following example notebooks. * [Apache Beam RunInference for scikit-learn](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_sklearn.ipynb) * [Apache Beam RunInference with TensorFlow](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_tensorflow.ipynb) * [Use RunInference with a model from TensorFlow Hub](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_with_tensorflow_hub.ipynb) +* [Apache Beam RunInference with Hugging Face](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_huggingface.ipynb) * [Apache Beam RunInference with XGBoost](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_xgboost.ipynb) * [Use RunInference with TFX](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_tensorflow_with_tfx.ipynb) * [Use RunInference in Apache Beam](https://github.com/apache/beam/blob/master/examples/notebooks/beam-ml/run_inference_pytorch_tensorflow_sklearn.ipynb) diff --git a/it/google-cloud-platform/build.gradle b/it/google-cloud-platform/build.gradle index f43b3f25720b..0917ddd3e21a 100644 --- a/it/google-cloud-platform/build.gradle +++ b/it/google-cloud-platform/build.gradle @@ -15,6 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +import org.apache.beam.gradle.IoPerformanceTestUtilities plugins { id 'org.apache.beam.module' } applyJavaNature( @@ -74,4 +75,7 @@ dependencies { testImplementation library.java.mockito_inline testRuntimeOnly project(path: ":runners:direct-java", configuration: "shadowTest") testRuntimeOnly library.java.slf4j_simple -} \ No newline at end of file +} + +tasks.register("GCSPerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'FileBasedIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) +tasks.register("BigTablePerformanceTest", IoPerformanceTestUtilities.IoPerformanceTest, project, 'google-cloud-platform', 'BigTableIOLT', ['configuration':'large','project':'apache-beam-testing', 'artifactBucket':'io-performance-temp']) \ No newline at end of file diff --git a/runners/google-cloud-dataflow-java/worker/build.gradle b/runners/google-cloud-dataflow-java/worker/build.gradle index e1448e313c60..ce06063c9b52 100644 --- a/runners/google-cloud-dataflow-java/worker/build.gradle +++ b/runners/google-cloud-dataflow-java/worker/build.gradle @@ -67,90 +67,91 @@ def excluded_dependencies = [ library.java.error_prone_annotations, // Provided scope added in worker library.java.hamcrest, // Test only library.java.junit, // Test only - library.java.jsonassert // Test only + library.java.jsonassert, // Test only + library.java.truth // Test only ] applyJavaNature( automaticModuleName: 'org.apache.beam.runners.dataflow.worker', archivesBaseName: 'beam-runners-google-cloud-dataflow-java-legacy-worker', classesTriggerCheckerBugs: [ - 'BatchGroupAlsoByWindowAndCombineFn': 'TODO: file a bug report', - 'AssignWindowsParDoFnFactory': 'TODO: file a bug report', - 'FetchAndFilterStreamingSideInputsOperation': 'https://github.com/typetools/checker-framework/issues/5436', + 'BatchGroupAlsoByWindowAndCombineFn' : 'TODO: file a bug report', + 'AssignWindowsParDoFnFactory' : 'TODO: file a bug report', + 'FetchAndFilterStreamingSideInputsOperation': 'https://github.com/typetools/checker-framework/issues/5436', ], exportJavadoc: false, enableSpotbugs: false /* TODO(BEAM-5658): enable spotbugs */, shadowJarValidationExcludes: [ - "org/apache/beam/runners/dataflow/worker/**", - "org/apache/beam/repackaged/beam_runners_google_cloud_dataflow_java_legacy_worker/**", - // TODO(https://github.com/apache/beam/issues/19114): Move DataflowRunnerHarness class under org.apache.beam.runners.dataflow.worker namespace - "com/google/cloud/dataflow/worker/DataflowRunnerHarness.class", - // Allow slf4j implementation worker for logging during pipeline execution - "org/slf4j/impl/**" + "org/apache/beam/runners/dataflow/worker/**", + "org/apache/beam/repackaged/beam_runners_google_cloud_dataflow_java_legacy_worker/**", + // TODO(https://github.com/apache/beam/issues/19114): Move DataflowRunnerHarness class under org.apache.beam.runners.dataflow.worker namespace + "com/google/cloud/dataflow/worker/DataflowRunnerHarness.class", + // Allow slf4j implementation worker for logging during pipeline execution + "org/slf4j/impl/**" ], shadowClosure: { - // Each included dependency must also include all of its necessary transitive dependencies - // or have them provided by the users pipeline during job submission. Typically a users - // pipeline includes :runners:google-cloud-dataflow-java and its transitive dependencies - // so those dependencies don't need to be shaded (bundled and relocated) away. 
All other - // dependencies needed to run the worker must be shaded (bundled and relocated) to prevent - // ClassNotFound and/or MethodNotFound errors during pipeline execution. - // - // Each included dependency should have a matching relocation rule below that ensures - // that the shaded jar is correctly built. + // Each included dependency must also include all of its necessary transitive dependencies + // or have them provided by the users pipeline during job submission. Typically a users + // pipeline includes :runners:google-cloud-dataflow-java and its transitive dependencies + // so those dependencies don't need to be shaded (bundled and relocated) away. All other + // dependencies needed to run the worker must be shaded (bundled and relocated) to prevent + // ClassNotFound and/or MethodNotFound errors during pipeline execution. + // + // Each included dependency should have a matching relocation rule below that ensures + // that the shaded jar is correctly built. - dependencies { - include(dependency(library.java.slf4j_jdk14)) - } + dependencies { + include(dependency(library.java.slf4j_jdk14)) + } - dependencies { - include(project(path: ":model:fn-execution", configuration: "shadow")) - } - relocate("org.apache.beam.model.fnexecution.v1", getWorkerRelocatedPath("org.apache.beam.model.fnexecution.v1")) + dependencies { + include(project(path: ":model:fn-execution", configuration: "shadow")) + } + relocate("org.apache.beam.model.fnexecution.v1", getWorkerRelocatedPath("org.apache.beam.model.fnexecution.v1")) - dependencies { - include(project(":runners:core-construction-java")) - include(project(":runners:core-java")) - } - relocate("org.apache.beam.runners.core", getWorkerRelocatedPath("org.apache.beam.runners.core")) - relocate("org.apache.beam.repackaged.beam_runners_core_construction_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_construction_java")) - relocate("org.apache.beam.repackaged.beam_runners_core_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_java")) + dependencies { + include(project(":runners:core-construction-java")) + include(project(":runners:core-java")) + } + relocate("org.apache.beam.runners.core", getWorkerRelocatedPath("org.apache.beam.runners.core")) + relocate("org.apache.beam.repackaged.beam_runners_core_construction_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_construction_java")) + relocate("org.apache.beam.repackaged.beam_runners_core_java", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_core_java")) - dependencies { - include(project(":runners:java-fn-execution")) - } - relocate("org.apache.beam.runners.fnexecution", getWorkerRelocatedPath("org.apache.beam.runners.fnexecution")) - relocate("org.apache.beam.repackaged.beam_runners_java_fn_execution", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_java_fn_execution")) + dependencies { + include(project(":runners:java-fn-execution")) + } + relocate("org.apache.beam.runners.fnexecution", getWorkerRelocatedPath("org.apache.beam.runners.fnexecution")) + relocate("org.apache.beam.repackaged.beam_runners_java_fn_execution", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_runners_java_fn_execution")) - dependencies { - include(project(":sdks:java:fn-execution")) - } - relocate("org.apache.beam.sdk.fn", getWorkerRelocatedPath("org.apache.beam.sdk.fn")) - relocate("org.apache.beam.repackaged.beam_sdks_java_fn_execution", 
getWorkerRelocatedPath("org.apache.beam.repackaged.beam_sdks_java_fn_execution")) + dependencies { + include(project(":sdks:java:fn-execution")) + } + relocate("org.apache.beam.sdk.fn", getWorkerRelocatedPath("org.apache.beam.sdk.fn")) + relocate("org.apache.beam.repackaged.beam_sdks_java_fn_execution", getWorkerRelocatedPath("org.apache.beam.repackaged.beam_sdks_java_fn_execution")) - dependencies { - // We have to include jetty-server/jetty-servlet and all of its transitive dependencies - // which includes several org.eclipse.jetty artifacts + servlet-api - include(dependency("org.eclipse.jetty:.*:9.2.10.v20150310")) - include(dependency("javax.servlet:javax.servlet-api:3.1.0")) - } - relocate("org.eclipse.jetty", getWorkerRelocatedPath("org.eclipse.jetty")) - relocate("javax.servlet", getWorkerRelocatedPath("javax.servlet")) + dependencies { + // We have to include jetty-server/jetty-servlet and all of its transitive dependencies + // which includes several org.eclipse.jetty artifacts + servlet-api + include(dependency("org.eclipse.jetty:.*:9.2.10.v20150310")) + include(dependency("javax.servlet:javax.servlet-api:3.1.0")) + } + relocate("org.eclipse.jetty", getWorkerRelocatedPath("org.eclipse.jetty")) + relocate("javax.servlet", getWorkerRelocatedPath("javax.servlet")) - // We don't relocate windmill since it is already underneath the org.apache.beam.runners.dataflow.worker namespace and never - // expect a user pipeline to include it. There is also a JNI component that windmill server relies on which makes - // arbitrary relocation more difficult. - dependencies { - include(project(path: ":runners:google-cloud-dataflow-java:worker:windmill", configuration: "shadow")) - } + // We don't relocate windmill since it is already underneath the org.apache.beam.runners.dataflow.worker namespace and never + // expect a user pipeline to include it. There is also a JNI component that windmill server relies on which makes + // arbitrary relocation more difficult. + dependencies { + include(project(path: ":runners:google-cloud-dataflow-java:worker:windmill", configuration: "shadow")) + } - // Include original source files extracted under - // '$buildDir/original_sources_to_package' to jar - from "$buildDir/original_sources_to_package" + // Include original source files extracted under + // '$buildDir/original_sources_to_package' to jar + from "$buildDir/original_sources_to_package" - exclude "META-INF/LICENSE.txt" - exclude "about.html" -}) + exclude "META-INF/LICENSE.txt" + exclude "about.html" + }) /******************************************************************************/ // Configure the worker root project @@ -219,6 +220,10 @@ dependencies { // as well and placed within the testImplementation configuration. Otherwise we can place it within // the shadowTest configuration. 
testImplementation project(path: ":runners:core-java", configuration: "testRuntimeMigration") + // TODO: excluding Guava until Truth updates it to >32.1.x + testImplementation(library.java.truth) { + exclude group: 'com.google.guava', module: 'guava' + } shadowTest project(path: ":sdks:java:extensions:google-cloud-platform-core", configuration: "testRuntimeMigration") shadowTest project(path: ":runners:direct-java", configuration: "shadow") shadowTest project(path: ":sdks:java:harness", configuration: "shadowTest") @@ -232,8 +237,8 @@ dependencies { project.task('validateShadedJarContainsSlf4jJdk14', dependsOn: 'shadowJar') { ext.outFile = project.file("${project.reportsDir}/${name}.out") inputs.files(project.configurations.shadow.artifacts.files) - .withPropertyName("shadowArtifactsFiles") - .withPathSensitivity(PathSensitivity.RELATIVE) + .withPropertyName("shadowArtifactsFiles") + .withPathSensitivity(PathSensitivity.RELATIVE) outputs.files outFile doLast { project.configurations.shadow.artifacts.files.each { diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java index d3e7de58931f..ea7efff7a06d 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/AbstractWindmillStream.java @@ -60,21 +60,16 @@ * synchronizing on this. */ public abstract class AbstractWindmillStream implements WindmillStream { - protected static final long DEFAULT_STREAM_RPC_DEADLINE_SECONDS = 300; + public static final long DEFAULT_STREAM_RPC_DEADLINE_SECONDS = 300; // Default gRPC streams to 2MB chunks, which has shown to be a large enough chunk size to reduce // per-chunk overhead, and small enough that we can still perform granular flow-control. protected static final int RPC_STREAM_CHUNK_SIZE = 2 << 20; - private static final Logger LOG = LoggerFactory.getLogger(AbstractWindmillStream.class); - protected final AtomicBoolean clientClosed; - + private final AtomicLong lastSendTimeMs; private final Executor executor; private final BackOff backoff; - // Indicates if the current stream in requestObserver is closed by calling close() method - private final AtomicBoolean streamClosed; private final AtomicLong startTimeMs; - private final AtomicLong lastSendTimeMs; private final AtomicLong lastResponseTimeMs; private final AtomicInteger errorCount; private final AtomicReference lastError; @@ -83,6 +78,8 @@ public abstract class AbstractWindmillStream implements Win private final Set> streamRegistry; private final int logEveryNStreamFailures; private final Supplier> requestObserverSupplier; + // Indicates if the current stream in requestObserver is closed by calling close() method + private final AtomicBoolean streamClosed; private @Nullable StreamObserver requestObserver; protected AbstractWindmillStream( @@ -132,9 +129,9 @@ private static long debugDuration(long nowMs, long startMs) { protected abstract boolean hasPendingRequests(); /** - * Called when the stream is throttled due to resource exhausted errors. Will be called for each - * resource exhausted error not just the first. onResponse() must stop throttling on receipt of - * the first good message. 
+ * Called when the client side stream is throttled due to resource exhausted errors. Will be + * called for each resource exhausted error not just the first. onResponse() must stop throttling + * on receipt of the first good message. */ protected abstract void startThrottleTimer(); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java index 3737e29efb13..a1f80598d89a 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/ForwardingClientResponseObserver.java @@ -27,23 +27,23 @@ *
<p>
Used to wrap existing {@link StreamObserver}s to be able to install an {@link * ClientCallStreamObserver#setOnReadyHandler(Runnable) onReadyHandler}. * - *
<p>
This is as thread-safe as the undering stream observer that is being wrapped. + *
<p>
This is as thread-safe as the underlying stream observer that is being wrapped. */ -final class ForwardingClientResponseObserver - implements ClientResponseObserver { +final class ForwardingClientResponseObserver + implements ClientResponseObserver { private final Runnable onReadyHandler; private final Runnable onDoneHandler; - private final StreamObserver inboundObserver; + private final StreamObserver inboundObserver; ForwardingClientResponseObserver( - StreamObserver inboundObserver, Runnable onReadyHandler, Runnable onDoneHandler) { + StreamObserver inboundObserver, Runnable onReadyHandler, Runnable onDoneHandler) { this.inboundObserver = inboundObserver; this.onReadyHandler = onReadyHandler; this.onDoneHandler = onDoneHandler; } @Override - public void onNext(ReqT value) { + public void onNext(ResponseT value) { inboundObserver.onNext(value); } @@ -60,7 +60,7 @@ public void onCompleted() { } @Override - public void beforeStart(ClientCallStreamObserver stream) { + public void beforeStart(ClientCallStreamObserver stream) { stream.setOnReadyHandler(onReadyHandler); } } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java index a046f2fd46ac..e0878b7b0b91 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/StreamObserverFactory.java @@ -33,9 +33,9 @@ public static StreamObserverFactory direct( return new Direct(deadlineSeconds, messagesBetweenIsReadyChecks); } - public abstract StreamObserver from( - Function, StreamObserver> clientFactory, - StreamObserver responseObserver); + public abstract StreamObserver from( + Function, StreamObserver> clientFactory, + StreamObserver responseObserver); private static class Direct extends StreamObserverFactory { private final long deadlineSeconds; @@ -47,14 +47,14 @@ private static class Direct extends StreamObserverFactory { } @Override - public StreamObserver from( - Function, StreamObserver> clientFactory, - StreamObserver inboundObserver) { + public StreamObserver from( + Function, StreamObserver> clientFactory, + StreamObserver inboundObserver) { AdvancingPhaser phaser = new AdvancingPhaser(1); - CallStreamObserver outboundObserver = - (CallStreamObserver) + CallStreamObserver outboundObserver = + (CallStreamObserver) clientFactory.apply( - new ForwardingClientResponseObserver( + new ForwardingClientResponseObserver( inboundObserver, phaser::arrive, phaser::forceTermination)); return new DirectStreamObserver<>( phaser, outboundObserver, deadlineSeconds, messagesBetweenIsReadyChecks); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java index 70c7cc36ba31..4dd4164fc4ef 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/WindmillStream.java @@ -86,4 +86,8 @@ boolean commitWorkItem( /** Flushes any pending work items to the 
wire. */ void flush(); } + + /** Interface for streaming GetWorkerMetadata requests to Windmill. */ + @ThreadSafe + interface GetWorkerMetadataStream extends WindmillStream {} } diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java index 74bd93a5474f..1bba40805dec 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcCommitWorkStream.java @@ -17,16 +17,17 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull; + import java.io.PrintWriter; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; +import java.util.function.Function; import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.CommitStatus; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.JobHeader; @@ -37,7 +38,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.CommitWorkStream; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,7 +57,8 @@ final class GrpcCommitWorkStream private final int streamingRpcBatchLimit; private GrpcCommitWorkStream( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function, StreamObserver> + startCommitWorkRpcFn, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, @@ -66,10 +68,7 @@ private GrpcCommitWorkStream( AtomicLong idGenerator, int streamingRpcBatchLimit) { super( - responseObserver -> - stub.withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .commitWorkStream(responseObserver), + startCommitWorkRpcFn, backoff, streamObserverFactory, streamRegistry, @@ -83,7 +82,8 @@ private GrpcCommitWorkStream( } static GrpcCommitWorkStream create( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function, StreamObserver> + startCommitWorkRpcFn, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, @@ -94,7 +94,7 @@ static GrpcCommitWorkStream create( int streamingRpcBatchLimit) { GrpcCommitWorkStream commitWorkStream = new GrpcCommitWorkStream( - stub, + startCommitWorkRpcFn, backoff, streamObserverFactory, streamRegistry, @@ -252,7 +252,7 @@ private void issueBatchedRequest(Map requests) { } private void issueMultiChunkRequest(final long id, PendingRequest pendingRequest) { - 
Preconditions.checkNotNull(pendingRequest.computation); + checkNotNull(pendingRequest.computation); final ByteString serializedCommit = pendingRequest.request.toByteString(); synchronized (this) { @@ -306,8 +306,13 @@ long getBytes() { private class Batcher { - final Map queue = new HashMap<>(); - long queuedBytes = 0; + private final Map queue; + private long queuedBytes; + + private Batcher() { + this.queuedBytes = 0; + this.queue = new HashMap<>(); + } boolean canAccept(PendingRequest request) { return queue.isEmpty() diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java index b51daabb1a2b..238cc771dce8 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetDataStream.java @@ -17,6 +17,9 @@ */ package org.apache.beam.runners.dataflow.worker.windmill.grpcclient; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument; +import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Verify.verify; + import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; @@ -28,10 +31,9 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.concurrent.CountDownLatch; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.ComputationGetDataRequest; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GlobalData; @@ -45,8 +47,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcGetDataStreamRequests.QueuedBatch; import org.apache.beam.runners.dataflow.worker.windmill.grpcclient.GrpcGetDataStreamRequests.QueuedRequest; import org.apache.beam.sdk.util.BackOff; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions; -import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Verify; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,7 +65,8 @@ final class GrpcGetDataStream private final int streamingRpcBatchLimit; private GrpcGetDataStream( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function, StreamObserver> + startGetDataRpcFn, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, @@ -74,14 +76,7 @@ private GrpcGetDataStream( AtomicLong idGenerator, int streamingRpcBatchLimit) { super( - responseObserver -> - stub.withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .getDataStream(responseObserver), - backoff, - streamObserverFactory, - streamRegistry, - logEveryNStreamFailures); + startGetDataRpcFn, backoff, 
streamObserverFactory, streamRegistry, logEveryNStreamFailures); this.idGenerator = idGenerator; this.getDataThrottleTimer = getDataThrottleTimer; this.jobHeader = jobHeader; @@ -91,7 +86,8 @@ private GrpcGetDataStream( } static GrpcGetDataStream create( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function, StreamObserver> + startGetDataRpcFn, BackOff backoff, StreamObserverFactory streamObserverFactory, Set> streamRegistry, @@ -102,7 +98,7 @@ static GrpcGetDataStream create( int streamingRpcBatchLimit) { GrpcGetDataStream getDataStream = new GrpcGetDataStream( - stub, + startGetDataRpcFn, backoff, streamObserverFactory, streamRegistry, @@ -122,7 +118,7 @@ protected synchronized void onNewStream() { // We rely on close only occurring after all methods on the stream have returned. // Since the requestKeyedData and requestGlobalData methods are blocking this // means there should be no pending requests. - Verify.verify(!hasPendingRequests()); + verify(!hasPendingRequests()); } else { for (AppendableInputStream responseStream : pending.values()) { responseStream.cancel(); @@ -138,14 +134,13 @@ protected boolean hasPendingRequests() { @Override @SuppressWarnings("dereference.of.nullable") protected void onResponse(StreamingGetDataResponse chunk) { - Preconditions.checkArgument(chunk.getRequestIdCount() == chunk.getSerializedResponseCount()); - Preconditions.checkArgument( - chunk.getRemainingBytesForResponse() == 0 || chunk.getRequestIdCount() == 1); + checkArgument(chunk.getRequestIdCount() == chunk.getSerializedResponseCount()); + checkArgument(chunk.getRemainingBytesForResponse() == 0 || chunk.getRequestIdCount() == 1); getDataThrottleTimer.stop(); for (int i = 0; i < chunk.getRequestIdCount(); ++i) { AppendableInputStream responseStream = pending.get(chunk.getRequestId(i)); - Verify.verify(responseStream != null, "No pending response stream"); + verify(responseStream != null, "No pending response stream"); responseStream.append(chunk.getSerializedResponse(i).newInput()); if (chunk.getRemainingBytesForResponse() == 0) { responseStream.complete(); @@ -283,12 +278,12 @@ private void queueRequestAndWait(QueuedRequest request) throws InterruptedExcept // Finalize the batch so that no additional requests will be added. Leave the batch in the // queue so that a subsequent batch will wait for it's completion. synchronized (batches) { - Verify.verify(batch == batches.peekFirst()); + verify(batch == batches.peekFirst()); batch.markFinalized(); } sendBatch(batch.requests()); synchronized (batches) { - Verify.verify(batch == batches.pollFirst()); + verify(batch == batches.pollFirst()); } // Notify all waiters with requests in this batch as well as the sender // of the next batch (if one exists). @@ -308,7 +303,7 @@ private void sendBatch(List requests) { for (QueuedRequest request : requests) { // Map#put returns null if there was no previous mapping for the key, meaning we have not // seen it before. 
- Verify.verify(pending.put(request.id(), request.getResponseStream()) == null); + verify(pending.put(request.id(), request.getResponseStream()) == null); } try { send(batchedRequest); diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java index 6e35beccdb6a..4660fe25b13b 100644 --- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java +++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcGetWorkStream.java @@ -23,12 +23,11 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Function; import javax.annotation.Nullable; import org.apache.beam.runners.dataflow.worker.WindmillTimeUtils; import org.apache.beam.runners.dataflow.worker.windmill.AbstractWindmillStream; -import org.apache.beam.runners.dataflow.worker.windmill.CloudWindmillServiceV1Alpha1Grpc; import org.apache.beam.runners.dataflow.worker.windmill.StreamObserverFactory; import org.apache.beam.runners.dataflow.worker.windmill.Windmill; import org.apache.beam.runners.dataflow.worker.windmill.Windmill.GetWorkRequest; @@ -40,6 +39,7 @@ import org.apache.beam.runners.dataflow.worker.windmill.WindmillStream.GetWorkStream.WorkItemReceiver; import org.apache.beam.sdk.util.BackOff; import org.apache.beam.vendor.grpc.v1p54p0.com.google.protobuf.ByteString; +import org.apache.beam.vendor.grpc.v1p54p0.io.grpc.stub.StreamObserver; import org.joda.time.Instant; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -58,7 +58,10 @@ final class GrpcGetWorkStream private final AtomicLong inflightBytes; private GrpcGetWorkStream( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function< + StreamObserver, + StreamObserver> + startGetWorkRpcFn, GetWorkRequest request, BackOff backoff, StreamObserverFactory streamObserverFactory, @@ -67,14 +70,7 @@ private GrpcGetWorkStream( ThrottleTimer getWorkThrottleTimer, WorkItemReceiver receiver) { super( - responseObserver -> - stub.withDeadlineAfter( - AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) - .getWorkStream(responseObserver), - backoff, - streamObserverFactory, - streamRegistry, - logEveryNStreamFailures); + startGetWorkRpcFn, backoff, streamObserverFactory, streamRegistry, logEveryNStreamFailures); this.request = request; this.getWorkThrottleTimer = getWorkThrottleTimer; this.receiver = receiver; @@ -84,7 +80,10 @@ private GrpcGetWorkStream( } static GrpcGetWorkStream create( - CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub, + Function< + StreamObserver, + StreamObserver> + startGetWorkRpcFn, GetWorkRequest request, BackOff backoff, StreamObserverFactory streamObserverFactory, @@ -94,7 +93,7 @@ static GrpcGetWorkStream create( WorkItemReceiver receiver) { GrpcGetWorkStream getWorkStream = new GrpcGetWorkStream( - stub, + startGetWorkRpcFn, request, backoff, streamObserverFactory, diff --git a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java 
b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java
index e8745e265eea..19cb90297df5 100644
--- a/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java
+++ b/runners/google-cloud-dataflow-java/worker/src/main/java/org/apache/beam/runners/dataflow/worker/windmill/grpcclient/GrpcWindmillServer.java
@@ -107,7 +107,6 @@ public final class GrpcWindmillServer extends WindmillServerStub {
   private final ThrottleTimer commitWorkThrottleTimer;
   private final Random rand;
   private final Set<AbstractWindmillStream<?, ?>> streamRegistry;
-  private ImmutableSet<HostAndPort> endpoints;
   private int logEveryNStreamFailures;
   private Duration maxBackoff = MAX_BACKOFF;
@@ -301,14 +300,21 @@ private Channel remoteChannel(HostAndPort endpoint) throws IOException {
         .build();
   }
 
+  /**
+   * Stubs returned from this method do not (and should not) have {@link
+   * org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Deadline}(s) set, since a deadline represents an
+   * absolute point in time. A {@link org.apache.beam.vendor.grpc.v1p54p0.io.grpc.Deadline} should
+   * not be treated as a timeout, which represents a relative duration.
+   *
+   * @see Official gRPC deadline documentation for more
+   *     details.
+   */
   private synchronized CloudWindmillServiceV1Alpha1Grpc.CloudWindmillServiceV1Alpha1Stub stub() {
     if (stubList.isEmpty()) {
       throw new RuntimeException("windmillServiceEndpoint has not been set");
     }
-    if (stubList.size() == 1) {
-      return stubList.get(0);
-    }
-    return stubList.get(rand.nextInt(stubList.size()));
+
+    return stubList.size() == 1 ? stubList.get(0) : stubList.get(rand.nextInt(stubList.size()));
   }
 
   @Override
@@ -398,7 +404,13 @@ public GetWorkStream getWorkStream(GetWorkRequest request, WorkItemReceiver rece
             .build();
 
     return GrpcGetWorkStream.create(
-        stub(),
+        responseObserver ->
+            stub()
+                // Deadlines are absolute points in time, so generate a new one every time this
+                // function is called.
+                .withDeadlineAfter(
+                    AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS)
+                .getWorkStream(responseObserver),
         getWorkRequest,
         grpcBackoff(),
         newStreamObserverFactory(),
@@ -411,7 +423,13 @@ public GetWorkStream getWorkStream(GetWorkRequest request, WorkItemReceiver rece
   @Override
   public GetDataStream getDataStream() {
     return GrpcGetDataStream.create(
-        stub(),
+        responseObserver ->
+            stub()
+                // Deadlines are absolute points in time, so generate a new one every time this
+                // function is called.
+                .withDeadlineAfter(
+                    AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS)
+                .getDataStream(responseObserver),
         grpcBackoff(),
         newStreamObserverFactory(),
         streamRegistry,
@@ -425,7 +443,13 @@ public GetDataStream getDataStream() {
   @Override
   public CommitWorkStream commitWorkStream() {
     return GrpcCommitWorkStream.create(
-        stub(),
+        responseObserver ->
+            stub()
+                // Deadlines are absolute points in time, so generate a new one every time this
+                // function is called.
+ .withDeadlineAfter( + AbstractWindmillStream.DEFAULT_STREAM_RPC_DEADLINE_SECONDS, TimeUnit.SECONDS) + .commitWorkStream(responseObserver), grpcBackoff(), newStreamObserverFactory(), streamRegistry, diff --git a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto index f66b2bed48c6..1759185911d4 100644 --- a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto +++ b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill.proto @@ -746,6 +746,8 @@ message WorkerMetadataRequest { optional JobHeader header = 1; } +// Converted into org.apache.beam.runners.dataflow.worker.windmill.WindmillEndpoints +// used to connect to Streaming Engine. message WorkerMetadataResponse { // The metadata version increases with every modification. Within a single // stream it will always be increasing. The version may be used across streams @@ -758,7 +760,9 @@ message WorkerMetadataResponse { // CommitWorkStream. Each response on this stream replaces the previous, and // connections to endpoints that are no longer present should be closed. message Endpoint { - optional string endpoint = 1; + // IPv6 address of a streaming engine windmill worker. + optional string direct_endpoint = 1; + optional string worker_token = 2; } repeated Endpoint work_endpoints = 2; @@ -766,10 +770,7 @@ message WorkerMetadataResponse { // calls to retrieve that global data. map global_data_endpoints = 3; - // DirectPath endpoints to be used by user workers for streaming engine jobs. - // DirectPath endpoints here are virtual IPv6 addresses of the windmill - // workers. - repeated Endpoint direct_path_endpoints = 4; + reserved 4; } service WindmillAppliance { diff --git a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill_service.proto b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill_service.proto index 803766d1a464..d9183e54e0dd 100644 --- a/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill_service.proto +++ b/runners/google-cloud-dataflow-java/worker/windmill/src/main/proto/windmill_service.proto @@ -34,7 +34,7 @@ service CloudWindmillServiceV1Alpha1 { returns (stream .windmill.StreamingGetWorkResponseChunk); // Gets worker metadata. Response is a stream. - rpc GetWorkerMetadataStream(.windmill.WorkerMetadataRequest) + rpc GetWorkerMetadataStream(stream .windmill.WorkerMetadataRequest) returns (stream .windmill.WorkerMetadataResponse); // Gets data from Windmill. 
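The deadline comments above rest on one distinction: a timeout is a relative duration that can safely live in a constant, while a deadline is an absolute instant that must be derived from the current clock immediately before each call. A minimal Python sketch of that distinction (illustrative names only, not code from this patch):

import time

STREAM_RPC_TIMEOUT_SECS = 300  # a relative duration; fine to store once


def fresh_deadline() -> float:
    # A deadline is absolute: "now + timeout" must be evaluated per attempt.
    return time.monotonic() + STREAM_RPC_TIMEOUT_SECS


def run_with_retries(start_stream):
    while True:
        # Recompute the deadline on every retry. Reusing a cached value would
        # hand each new stream an already partially spent time budget.
        if start_stream(deadline=fresh_deadline()):
            return

This is why the factories above call withDeadlineAfter inside the lambda, once per stream start, instead of configuring a deadline on a long-lived stub.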
diff --git a/sdks/go.mod b/sdks/go.mod index 2bfb346a5e34..5e91aea021f8 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,13 +23,13 @@ module github.com/apache/beam/sdks/v2 go 1.20 require ( - cloud.google.com/go/bigquery v1.54.0 + cloud.google.com/go/bigquery v1.55.0 cloud.google.com/go/bigtable v1.19.0 - cloud.google.com/go/datastore v1.13.0 + cloud.google.com/go/datastore v1.14.0 cloud.google.com/go/profiler v0.3.1 cloud.google.com/go/pubsub v1.33.0 cloud.google.com/go/spanner v1.49.0 - cloud.google.com/go/storage v1.32.0 + cloud.google.com/go/storage v1.33.0 github.com/aws/aws-sdk-go-v2 v1.21.0 github.com/aws/aws-sdk-go-v2/config v1.18.39 github.com/aws/aws-sdk-go-v2/credentials v1.13.37 @@ -57,9 +57,9 @@ require ( golang.org/x/sync v0.3.0 golang.org/x/sys v0.12.0 golang.org/x/text v0.13.0 - google.golang.org/api v0.139.0 - google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 - google.golang.org/grpc v1.57.0 + google.golang.org/api v0.140.0 + google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93 + google.golang.org/grpc v1.58.0 google.golang.org/protobuf v1.31.0 gopkg.in/retry.v1 v1.0.3 gopkg.in/yaml.v2 v2.4.0 @@ -74,7 +74,7 @@ require ( require dario.cat/mergo v1.0.0 // indirect require ( - cloud.google.com/go v0.110.6 // indirect + cloud.google.com/go v0.110.7 // indirect cloud.google.com/go/compute v1.23.0 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect cloud.google.com/go/iam v1.1.1 // indirect @@ -109,8 +109,8 @@ require ( github.com/docker/distribution v2.8.2+incompatible // indirect github.com/docker/docker v24.0.5+incompatible // indirect; but required to resolve issue docker has with go1.20 github.com/docker/go-units v0.5.0 // indirect - github.com/envoyproxy/go-control-plane v0.11.1-0.20230524094728-9239064ad72f // indirect - github.com/envoyproxy/protoc-gen-validate v0.10.1 // indirect + github.com/envoyproxy/go-control-plane v0.11.1 // indirect + github.com/envoyproxy/protoc-gen-validate v1.0.2 // indirect github.com/felixge/httpsnoop v1.0.2 // indirect github.com/goccy/go-json v0.9.11 // indirect github.com/gogo/protobuf v1.3.2 // indirect @@ -156,9 +156,9 @@ require ( go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.13.0 // indirect golang.org/x/mod v0.11.0 // indirect - golang.org/x/tools v0.9.1 // indirect + golang.org/x/tools v0.10.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832 // indirect ) diff --git a/sdks/go.sum b/sdks/go.sum index 638b327fe87e..c30891294dbd 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -8,13 +8,13 @@ cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4= cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M= -cloud.google.com/go v0.110.6 h1:8uYAkj3YHTP/1iwReuHPxLSbdcyc+dSBbzFMrVwDR6Q= -cloud.google.com/go v0.110.6/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= +cloud.google.com/go v0.110.7 h1:rJyC7nWRg2jWGZ4wSJ5nY65GTdYJkg0cd/uXb+ACI6o= +cloud.google.com/go v0.110.7/go.mod 
h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= -cloud.google.com/go/bigquery v1.54.0 h1:ify6s7sy+kQuAimRnVTrPUzaeY0+X5GEsKt2C5CiA8w= -cloud.google.com/go/bigquery v1.54.0/go.mod h1:9Y5I3PN9kQWuid6183JFhOGOW3GcirA5LpsKCUn+2ec= +cloud.google.com/go/bigquery v1.55.0 h1:hs44Xxov3XLWQiCx2J8lK5U/ihLqnpm4RVVl5fdtLLI= +cloud.google.com/go/bigquery v1.55.0/go.mod h1:9Y5I3PN9kQWuid6183JFhOGOW3GcirA5LpsKCUn+2ec= cloud.google.com/go/bigtable v1.19.0 h1:wiq9LT0kukfInzvy1joMDijCw/OD1UChpSbORXYn0LI= cloud.google.com/go/bigtable v1.19.0/go.mod h1:xl5kPa8PTkJjdBxg6qdGH88464nNqmbISHSRU+D2yFE= cloud.google.com/go/compute v1.23.0 h1:tP41Zoavr8ptEqaW6j+LQOnyBBhO7OkOMAGrgLopTwY= @@ -24,8 +24,8 @@ cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2Aawl cloud.google.com/go/datacatalog v1.16.0 h1:qVeQcw1Cz93/cGu2E7TYUPh8Lz5dn5Ws2siIuQ17Vng= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= -cloud.google.com/go/datastore v1.13.0 h1:ktbC66bOQB3HJPQe8qNI1/aiQ77PMu7hD4mzE6uxe3w= -cloud.google.com/go/datastore v1.13.0/go.mod h1:KjdB88W897MRITkvWWJrg2OUtrR5XVj1EoLgSp6/N70= +cloud.google.com/go/datastore v1.14.0 h1:Mq0ApTRdLW3/dyiw+DkjTk0+iGIUvkbzaC8sfPwWTH4= +cloud.google.com/go/datastore v1.14.0/go.mod h1:GAeStMBIt9bPS7jMJA85kgkpsMkvseWWXiaHya9Jes8= cloud.google.com/go/iam v1.1.1 h1:lW7fzj15aVIXYHREOqjRBV9PsH0Z6u8Y46a1YGvQP4Y= cloud.google.com/go/iam v1.1.1/go.mod h1:A5avdyVL2tCppe4unb0951eI9jreack+RJ0/d+KUZOU= cloud.google.com/go/kms v1.15.0 h1:xYl5WEaSekKYN5gGRyhjvZKM22GVBBCzegGNVPy+aIs= @@ -43,8 +43,8 @@ cloud.google.com/go/spanner v1.49.0/go.mod h1:eGj9mQGK8+hkgSVbHNQ06pQ4oS+cyc4tXX cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= -cloud.google.com/go/storage v1.32.0 h1:5w6DxEGOnktmJHarxAOUywxVW9lbNWIzlzzUltG/3+o= -cloud.google.com/go/storage v1.32.0/go.mod h1:Hhh/dogNRGca7IWv1RC2YqEn0c0G77ctA/OxflYkiD8= +cloud.google.com/go/storage v1.33.0 h1:PVrDOkIC8qQVa1P3SXGpQvfuJhN2LHOoyZvWs8D2X5M= +cloud.google.com/go/storage v1.33.0/go.mod h1:Hhh/dogNRGca7IWv1RC2YqEn0c0G77ctA/OxflYkiD8= dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= @@ -179,11 +179,11 @@ github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+m github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= -github.com/envoyproxy/go-control-plane v0.11.1-0.20230524094728-9239064ad72f h1:7T++XKzy4xg7PKy+bM+Sa9/oe1OC88yz2hXQUISoXfA= -github.com/envoyproxy/go-control-plane 
v0.11.1-0.20230524094728-9239064ad72f/go.mod h1:sfYdkwUW4BA3PbKjySwjJy+O4Pu0h62rlqCMHNk+K+Q= +github.com/envoyproxy/go-control-plane v0.11.1 h1:wSUXTlLfiAQRWs2F+p+EKOY9rUyis1MyGqJ2DIk5HpM= +github.com/envoyproxy/go-control-plane v0.11.1/go.mod h1:uhMcXKCQMEJHiAb0w+YGefQLaTEw+YhGluxZkrTmD0g= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/envoyproxy/protoc-gen-validate v0.10.1 h1:c0g45+xCJhdgFGw7a5QAfdS4byAbud7miNWJ1WwEVf8= -github.com/envoyproxy/protoc-gen-validate v0.10.1/go.mod h1:DRjgyB0I43LtJapqN6NiRwroiAU2PaFuvk/vjgh61ss= +github.com/envoyproxy/protoc-gen-validate v1.0.2 h1:QkIBuU5k+x7/QXPvPPnWXWlCdaBFApVqftFV6k087DA= +github.com/envoyproxy/protoc-gen-validate v1.0.2/go.mod h1:GpiZQP3dDbg4JouG/NNS7QWXpgx6x8QiMKdmN72jogE= github.com/felixge/httpsnoop v1.0.1/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/felixge/httpsnoop v1.0.2 h1:+nS9g82KMXccJ/wp0zyRW9ZBHFETmMGtkk+2CTTrW4o= github.com/felixge/httpsnoop v1.0.2/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= @@ -629,8 +629,8 @@ golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapK golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/tools v0.9.1 h1:8WMNJAz3zrtPmnYC7ISf5dEn3MT0gY7jBJfw27yrrLo= -golang.org/x/tools v0.9.1/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= +golang.org/x/tools v0.10.0 h1:tvDr/iQoUqNdohiYm0LmmKcBk+q86lb9EprIUFhHHGg= +golang.org/x/tools v0.10.0/go.mod h1:UJwyiVBsOA2uwvK/e5OY3GTpDUJriEd+/YlqAwLPmyM= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -647,8 +647,8 @@ google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsb google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= -google.golang.org/api v0.139.0 h1:A1TrCPgMmOiYu0AiNkvQIpIx+D8blHTDcJ5EogkP7LI= -google.golang.org/api v0.139.0/go.mod h1:CVagp6Eekz9CjGZ718Z+sloknzkDJE7Vc1Ckj9+viBk= +google.golang.org/api v0.140.0 h1:CaXNdYOH5oQQI7l6iKTHHiMTdxZca4/02hRg2U8c2hM= +google.golang.org/api v0.140.0/go.mod h1:aGbCiFgtwb2P6badchFbSBUurV6oR5d50Af4iNJtDdI= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -674,12 +674,12 @@ google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4 google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod 
h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= -google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 h1:L6iMMGrtzgHsWofoFcihmDEMYeDR9KN/ThbPWGrh++g= -google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5/go.mod h1:oH/ZOT02u4kWEp7oYBGYFFkCdKS/uYR9Z7+0/xuuFp8= +google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93 h1:zv6ieVm8jNcN33At1+APsRISkRgynuWUxUhv6G123jY= +google.golang.org/genproto v0.0.0-20230821184602-ccc8af3d0e93/go.mod h1:yZTlhN0tQnXo3h00fuXNCxJdLdIdnVFVBaRJ5LWBbw4= google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5 h1:nIgk/EEq3/YlnmVVXVnm14rC2oxgs1o0ong4sD/rd44= google.golang.org/genproto/googleapis/api v0.0.0-20230803162519-f966b187b2e5/go.mod h1:5DZzOUPCLYL3mNkQ0ms0F3EuUNZ7py1Bqeq6sxzI7/Q= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d h1:uvYuEyMHKNt+lT4K3bN6fGswmK8qSvcreM3BwjDh+y4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832 h1:o4LtQxebKIJ4vkzyhtD2rfUNZ20Zf0ik5YVP5E7G7VE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230911183012-2d3300fd4832/go.mod h1:+Bk1OCOj40wS2hwAMA+aCW9ypzm63QTBBHp6lQ3p+9M= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -689,8 +689,8 @@ google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8 google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.57.0 h1:kfzNeI/klCGD2YPMUlaGNT3pxvYfga7smW3Vth8Zsiw= -google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= +google.golang.org/grpc v1.58.0 h1:32JY8YpPMSR45K+c3o6b8VL73V+rR8k+DeMIr4vRH8o= +google.golang.org/grpc v1.58.0/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= diff --git a/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml b/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml index bb8954839d50..7037f0543f4f 100644 --- a/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml +++ b/sdks/java/build-tools/src/main/resources/beam/checkstyle/suppressions.xml @@ -20,6 +20,7 @@ + diff --git a/sdks/python/apache_beam/dataframe/frame_base.py b/sdks/python/apache_beam/dataframe/frame_base.py index 24497f1de069..b64e8e2d53c2 100644 --- a/sdks/python/apache_beam/dataframe/frame_base.py +++ b/sdks/python/apache_beam/dataframe/frame_base.py @@ -475,7 +475,7 @@ def wrapper(self, inplace=False, **kwargs): return wrapper -def args_to_kwargs(base_type): +def args_to_kwargs(base_type, removed_method=False, removed_args=None): """Convert all args to kwargs before calling the decorated function. 
When applied to a function, this decorator creates a new function
@@ -484,18 +484,52 @@ def args_to_kwargs(base_type):
   determine the name to use for arguments that are converted to keyword
   arguments.
 
-  For internal use only. No backwards compatibility guarantees."""
+  For internal use only. No backwards compatibility guarantees.
+
+  Args:
+    base_type: The pandas type of the method that this is trying to replicate.
+    removed_method: Whether this method has been removed in the running
+      Pandas version.
+    removed_args: If not empty, which arguments have been dropped in the
+      running Pandas version.
+  """
   def wrap(func):
-    arg_names = getfullargspec(unwrap(getattr(base_type, func.__name__))).args
+    if removed_method:
+      # Do no processing; let the Beam function itself raise the error if called.
+      return func
+
+    removed_arg_names = removed_args if removed_args is not None else []
+
+    # TODO: Better handle positional-only arguments if they ever become a thing
+    # in Pandas (as of 2.1 they aren't).
+    base_arg_spec = getfullargspec(unwrap(getattr(base_type, func.__name__)))
+    base_arg_names = base_arg_spec.args
+    # Some arguments are keyword-only and we still want to check against those.
+    all_possible_base_arg_names = base_arg_names + base_arg_spec.kwonlyargs
+    beam_arg_names = getfullargspec(func).args
+
+    if not_found := (set(beam_arg_names) - set(all_possible_base_arg_names) -
+                     set(removed_arg_names)):
+      raise TypeError(
+          f"Beam definition of {func.__name__} has arguments that are not found"
+          f" in the base version of the function: {not_found}")
 
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
-      for name, value in zip(arg_names, args):
+      if len(args) > len(base_arg_names):
+        raise TypeError(f"{func.__name__} got too many positional arguments.")
+
+      for name, value in zip(base_arg_names, args):
         if name in kwargs:
           raise TypeError(
               "%s() got multiple values for argument '%s'" %
               (func.__name__, name))
         kwargs[name] = value
+      # Still have to populate these for the Beam function signature.
+      if removed_args:
+        for name in removed_args:
+          if name not in kwargs:
+            kwargs[name] = None
       return func(**kwargs)
 
     return wrapper
@@ -524,14 +558,22 @@ def wrapper(*args, **kwargs):
           f"**{BEAM_SPECIFIC!r}** for details.")
 
 
-def with_docs_from(base_type, name=None):
+def with_docs_from(base_type, name=None, removed_method=False):
   """Decorator that updates the documentation from the wrapped function to
   duplicate the documentation from the identically-named method in
   `base_type`.
 
   Any docstring on the original function will be included in the new function
   under a "Differences from pandas" heading.
+
+  removed_method is used in cases where a method has been removed in a later
+  version of Pandas.
   """
   def wrap(func):
+    if removed_method:
+      func.__doc__ = (
+          "This method has been removed in the current version of Pandas.")
+      return func
+
     fn_name = name or func.__name__
     orig_doc = getattr(base_type, fn_name).__doc__
     if orig_doc is None:
@@ -588,23 +630,39 @@ def format_section(header):
   return wrap
 
 
-def populate_defaults(base_type):
+def populate_defaults(base_type, removed_method=False, removed_args=None):
   """Populate default values for keyword arguments in decorated function.
 
   When applied to a function, this decorator creates a new function
   with default values for all keyword arguments, based on the default values
   for the identically-named method on `base_type`.
 
-  For internal use only. No backwards compatibility guarantees."""
+  For internal use only. No backwards compatibility guarantees.
+ + Args: + base_type: The pandas type of the method that this is trying to replicate. + removed_method: Whether this method has been removed in the running + Pandas version. + removed_args: If not empty, which arguments have been dropped in the + running Pandas version. + """ def wrap(func): + if removed_method: + return func + base_argspec = getfullargspec(unwrap(getattr(base_type, func.__name__))) - if not base_argspec.defaults: + if not base_argspec.defaults and not base_argspec.kwonlydefaults: return func - arg_to_default = dict( - zip( - base_argspec.args[-len(base_argspec.defaults):], - base_argspec.defaults)) + arg_to_default = {} + if base_argspec.defaults: + arg_to_default.update( + zip( + base_argspec.args[-len(base_argspec.defaults):], + base_argspec.defaults)) + + if base_argspec.kwonlydefaults: + arg_to_default.update(base_argspec.kwonlydefaults) unwrapped_func = unwrap(func) # args that do not have defaults in func, but do have defaults in base @@ -613,6 +671,8 @@ def wrap(func): defaults_to_populate = set( func_argspec.args[:num_non_defaults]).intersection( arg_to_default.keys()) + if removed_args: + defaults_to_populate -= set(removed_args) @functools.wraps(func) def wrapper(**kwargs): diff --git a/sdks/python/apache_beam/dataframe/frame_base_test.py b/sdks/python/apache_beam/dataframe/frame_base_test.py index 82d5b65e1a49..5962bb2fb72b 100644 --- a/sdks/python/apache_beam/dataframe/frame_base_test.py +++ b/sdks/python/apache_beam/dataframe/frame_base_test.py @@ -72,7 +72,7 @@ def add_one(frame): def test_args_to_kwargs(self): class Base(object): - def func(self, a=1, b=2, c=3): + def func(self, a=1, b=2, c=3, *, kw_only=4): pass class Proxy(object): @@ -87,20 +87,36 @@ def func(self, **kwargs): self.assertEqual(proxy.func(2, 4, 6), {'a': 2, 'b': 4, 'c': 6}) self.assertEqual(proxy.func(2, c=6), {'a': 2, 'c': 6}) self.assertEqual(proxy.func(c=6, a=2), {'a': 2, 'c': 6}) + self.assertEqual(proxy.func(2, kw_only=20), {'a': 2, 'kw_only': 20}) + with self.assertRaises(TypeError): # got too many positioned arguments + proxy.func(2, 4, 6, 8) def test_args_to_kwargs_populates_defaults(self): class Base(object): def func(self, a=1, b=2, c=3): pass + def func_removed_args(self, a): + pass + class Proxy(object): @frame_base.args_to_kwargs(Base) @frame_base.populate_defaults(Base) def func(self, a, c=1000, **kwargs): return dict(kwargs, a=a, c=c) + @frame_base.args_to_kwargs(Base, removed_method=True) + @frame_base.populate_defaults(Base, removed_method=True) + def func_removed_method(self, a, **kwargs): + return dict(kwargs, a=a) + + @frame_base.args_to_kwargs(Base, removed_args=['c']) + @frame_base.populate_defaults(Base, removed_args=['c']) + def func_removed_args(self, a, c, **kwargs): + return dict(kwargs, a=a) + proxy = Proxy() - # pylint: disable=too-many-function-args + # pylint: disable=too-many-function-args,no-value-for-parameter self.assertEqual(proxy.func(), {'a': 1, 'c': 1000}) self.assertEqual(proxy.func(100), {'a': 100, 'c': 1000}) self.assertEqual(proxy.func(2, 4, 6), {'a': 2, 'b': 4, 'c': 6}) @@ -108,6 +124,53 @@ def func(self, a, c=1000, **kwargs): self.assertEqual(proxy.func(c=6, a=2), {'a': 2, 'c': 6}) self.assertEqual(proxy.func(c=6), {'a': 1, 'c': 6}) + with self.assertRaises(TypeError): # missing 1 required positional argument + proxy.func_removed_method() + self.assertEqual(proxy.func_removed_method(12, c=100), {'a': 12, 'c': 100}) + + with self.assertRaises(TypeError): # missing 1 required positional argument + proxy.func_removed_args() + 
self.assertEqual(proxy.func_removed_args(12, d=100), {'a': 12, 'd': 100}) + + def test_args_to_kwargs_populates_default_handles_kw_only(self): + class Base(object): + def func(self, a=1, b=2, c=3, *, kw_only=4): + pass + + class ProxyUsesKwOnly(object): + @frame_base.args_to_kwargs(Base) + @frame_base.populate_defaults(Base) + def func(self, a, kw_only, **kwargs): + return dict(kwargs, a=a, kw_only=kw_only) + + proxy = ProxyUsesKwOnly() + + # pylint: disable=too-many-function-args,no-value-for-parameter + self.assertEqual(proxy.func(), {'a': 1, 'kw_only': 4}) + self.assertEqual(proxy.func(100), {'a': 100, 'kw_only': 4}) + self.assertEqual( + proxy.func(2, 4, 6, kw_only=8), { + 'a': 2, 'b': 4, 'c': 6, 'kw_only': 8 + }) + with self.assertRaises(TypeError): + proxy.func(2, 4, 6, 8) # got too many positioned arguments + + class ProxyDoesntUseKwOnly(object): + @frame_base.args_to_kwargs(Base) + @frame_base.populate_defaults(Base) + def func(self, a, **kwargs): + return dict(kwargs, a=a) + + proxy = ProxyDoesntUseKwOnly() + + # pylint: disable=too-many-function-args,no-value-for-parameter + self.assertEqual(proxy.func(), {'a': 1}) + self.assertEqual(proxy.func(100), {'a': 100}) + self.assertEqual( + proxy.func(2, 4, 6, kw_only=8), { + 'a': 2, 'b': 4, 'c': 6, 'kw_only': 8 + }) + if __name__ == '__main__': unittest.main() diff --git a/sdks/python/apache_beam/dataframe/frames.py b/sdks/python/apache_beam/dataframe/frames.py index 0d9b22ae3f9e..f85a7c61640b 100644 --- a/sdks/python/apache_beam/dataframe/frames.py +++ b/sdks/python/apache_beam/dataframe/frames.py @@ -907,15 +907,17 @@ def sort_index(self, axis, **kwargs): return frame_base.DeferredFrame.wrap( expressions.ComputedExpression( 'sort_index', - lambda df: df.sort_index(axis, **kwargs), + lambda df: df.sort_index(axis=axis, **kwargs), [self._expr], requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Arbitrary(), )) @frame_base.with_docs_from(pd.DataFrame) - @frame_base.args_to_kwargs(pd.DataFrame) - @frame_base.populate_defaults(pd.DataFrame) + @frame_base.args_to_kwargs( + pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None) + @frame_base.populate_defaults( + pd.DataFrame, removed_args=["errors"] if PD_VERSION >= (2, 0) else None) @frame_base.maybe_inplace def where(self, cond, other, errors, **kwargs): """where is not parallelizable when ``errors="ignore"`` is specified.""" @@ -937,16 +939,19 @@ def where(self, cond, other, errors, **kwargs): else: actual_args['other'] = other - if errors == "ignore": - # We need all data in order to ignore errors and propagate the original - # data. - requires = partitionings.Singleton( - reason=( - f"where(errors={errors!r}) is currently not parallelizable, " - "because all data must be collected on one node to determine if " - "the original data should be propagated instead.")) + # For Pandas 2.0, errors was removed as an argument. + if PD_VERSION < (2, 0): + if "errors" in kwargs and kwargs['errors'] == "ignore": + # We need all data in order to ignore errors and propagate the original + # data. 
+ requires = partitionings.Singleton( + reason=( + f"where(errors={kwargs['errors']!r}) is currently not " + "parallelizable, because all data must be collected on one " + "node to determine if the original data should be propagated " + "instead.")) - actual_args['errors'] = errors + actual_args['errors'] = kwargs['errors'] if 'errors' in kwargs else None def where_execution(df, *args): runtime_values = { @@ -1336,12 +1341,14 @@ def keys(self): frame_base.wont_implement_method( pd.Series, 'shape', reason="non-deferred-result")) - @frame_base.with_docs_from(pd.Series) - @frame_base.args_to_kwargs(pd.Series) - @frame_base.populate_defaults(pd.Series) + @frame_base.with_docs_from(pd.Series, removed_method=PD_VERSION >= (2, 0)) + @frame_base.args_to_kwargs(pd.Series, removed_method=PD_VERSION >= (2, 0)) + @frame_base.populate_defaults(pd.Series, removed_method=PD_VERSION >= (2, 0)) def append(self, to_append, ignore_index, verify_integrity, **kwargs): """``ignore_index=True`` is not supported, because it requires generating an order-sensitive index.""" + if PD_VERSION >= (2, 0): + raise frame_base.WontImplementError('append() was removed in Pandas 2.0.') if not isinstance(to_append, DeferredSeries): raise frame_base.WontImplementError( "append() only accepts DeferredSeries instances, received " + @@ -1603,14 +1610,11 @@ def mean(self, skipna, **kwargs): return self.sum(skipna=skipna, **kwargs) / size @frame_base.with_docs_from(pd.Series) - @frame_base.args_to_kwargs(pd.Series) - @frame_base.populate_defaults(pd.Series) + @frame_base.args_to_kwargs( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) + @frame_base.populate_defaults( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) def var(self, axis, skipna, level, ddof, **kwargs): - """Per-level aggregation is not yet supported - (https://github.com/apache/beam/issues/21829). 
Only the default, - ``level=None``, is allowed.""" - if level is not None: - raise NotImplementedError("per-level aggregation") if skipna is None or skipna: self = self.dropna() # pylint: disable=self-cls-assignment @@ -1678,11 +1682,11 @@ def corr(self, other, method, min_periods): requires_partition_by=partitionings.Singleton(reason=reason))) @frame_base.with_docs_from(pd.Series) - @frame_base.args_to_kwargs(pd.Series) - @frame_base.populate_defaults(pd.Series) + @frame_base.args_to_kwargs( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) + @frame_base.populate_defaults( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) def skew(self, axis, skipna, level, numeric_only, **kwargs): - if level is not None: - raise NotImplementedError("per-level aggregation") if skipna is None or skipna: self = self.dropna() # pylint: disable=self-cls-assignment # See the online, numerically stable formulae at @@ -1742,11 +1746,11 @@ def combine_moments(data): requires_partition_by=partitionings.Singleton())) @frame_base.with_docs_from(pd.Series) - @frame_base.args_to_kwargs(pd.Series) - @frame_base.populate_defaults(pd.Series) + @frame_base.args_to_kwargs( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) + @frame_base.populate_defaults( + pd.Series, removed_args=["level"] if PD_VERSION >= (2, 0) else None) def kurtosis(self, axis, skipna, level, numeric_only, **kwargs): - if level is not None: - raise NotImplementedError("per-level aggregation") if skipna is None or skipna: self = self.dropna() # pylint: disable=self-cls-assignment @@ -2576,7 +2580,8 @@ def align(self, other, join, axis, copy, level, method, **kwargs): if kwargs: raise NotImplementedError('align(%s)' % ', '.join(kwargs.keys())) - if level is not None: + # In Pandas 2.0, all aggregations lost the level keyword. + if PD_VERSION < (2, 0) and level is not None: # Could probably get by partitioning on the used levels. requires_partition_by = partitionings.Singleton(reason=( f"align(level={level}) is not currently parallelizable. 
Only " @@ -2593,12 +2598,15 @@ def align(self, other, join, axis, copy, level, method, **kwargs): requires_partition_by=requires_partition_by, preserves_partition_by=partitionings.Arbitrary())) - @frame_base.with_docs_from(pd.DataFrame) - @frame_base.args_to_kwargs(pd.DataFrame) - @frame_base.populate_defaults(pd.DataFrame) + @frame_base.with_docs_from(pd.DataFrame, removed_method=PD_VERSION >= (2, 0)) + @frame_base.args_to_kwargs(pd.DataFrame, removed_method=PD_VERSION >= (2, 0)) + @frame_base.populate_defaults(pd.DataFrame, + removed_method=PD_VERSION >= (2, 0)) def append(self, other, ignore_index, verify_integrity, sort, **kwargs): """``ignore_index=True`` is not supported, because it requires generating an order-sensitive index.""" + if PD_VERSION >= (2, 0): + raise frame_base.WontImplementError('append() was removed in Pandas 2.0.') if not isinstance(other, DeferredDataFrame): raise frame_base.WontImplementError( "append() only accepts DeferredDataFrame instances, received " + @@ -2679,7 +2687,7 @@ def set_axis(self, labels, axis, **kwargs): return frame_base.DeferredFrame.wrap( expressions.ComputedExpression( 'set_axis', - lambda df: df.set_axis(labels, axis, **kwargs), + lambda df: df.set_axis(labels, axis=axis, **kwargs), [self._expr], requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Arbitrary())) @@ -4907,9 +4915,9 @@ def __setitem__(self, index, value): class _DeferredStringMethods(frame_base.DeferredBase): - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def cat(self, others, join, **kwargs): """If defined, ``others`` must be a :class:`DeferredSeries` or a ``list`` of ``DeferredSeries``.""" @@ -4949,8 +4957,8 @@ def func(*args): requires_partition_by=requires, preserves_partition_by=partitionings.Arbitrary())) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) def repeat(self, repeats): """``repeats`` must be an ``int`` or a :class:`DeferredSeries`. Lists are not supported because they make this operation order-sensitive.""" @@ -4987,8 +4995,8 @@ def repeat(self, repeats): raise TypeError("str.repeat(repeats=) value must be an int or a " f"DeferredSeries (encountered {type(repeats)}).") - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) def get_dummies(self, **kwargs): """ Series must be categorical dtype. Please cast to ``CategoricalDtype`` @@ -5070,9 +5078,9 @@ def func(s): requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Arbitrary())) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def split(self, **kwargs): """ Like other non-deferred methods, dtype must be CategoricalDtype. 
@@ -5081,9 +5089,9 @@ def split(self, **kwargs): """ return self._split_helper(rsplit=False, **kwargs) - @frame_base.with_docs_from(pd.core.strings.StringMethods) - @frame_base.args_to_kwargs(pd.core.strings.StringMethods) - @frame_base.populate_defaults(pd.core.strings.StringMethods) + @frame_base.with_docs_from(pd.Series.str) + @frame_base.args_to_kwargs(pd.Series.str) + @frame_base.populate_defaults(pd.Series.str) def rsplit(self, **kwargs): """ Like other non-deferred methods, dtype must be CategoricalDtype. @@ -5161,17 +5169,17 @@ def func(df, *args, **kwargs): return func for method in ELEMENTWISE_STRING_METHODS: - if not hasattr(pd.core.strings.StringMethods, method): + if not hasattr(pd.Series.str, method): # older versions (1.0.x) don't support some of these methods continue setattr(_DeferredStringMethods, method, frame_base._elementwise_method(make_str_func(method), name=method, - base=pd.core.strings.StringMethods)) + base=pd.Series.str)) for method in NON_ELEMENTWISE_STRING_METHODS: - if not hasattr(pd.core.strings.StringMethods, method): + if not hasattr(pd.Series.str, method): # older versions (1.0.x) don't support some of these methods continue setattr(_DeferredStringMethods, @@ -5179,7 +5187,7 @@ def func(df, *args, **kwargs): frame_base._proxy_method( make_str_func(method), name=method, - base=pd.core.strings.StringMethods, + base=pd.Series.str, requires_partition_by=partitionings.Arbitrary(), preserves_partition_by=partitionings.Singleton())) @@ -5388,6 +5396,7 @@ def func(df, *args, **kwargs): name, frame_base._elementwise_method(name, restrictions={'level': None}, base=pd.Series)) + if hasattr(pd.DataFrame, name): setattr( DeferredDataFrame, diff --git a/sdks/python/apache_beam/dataframe/frames_test.py b/sdks/python/apache_beam/dataframe/frames_test.py index fa121aa85c30..959f09e29b26 100644 --- a/sdks/python/apache_beam/dataframe/frames_test.py +++ b/sdks/python/apache_beam/dataframe/frames_test.py @@ -797,6 +797,7 @@ def test_loc(self): self._run_test(lambda df: df.C.loc[df.A > 10], df) self._run_test(lambda df, s: df.loc[s.loc[1:3]], df, pd.Series(dates)) + @unittest.skipIf(PD_VERSION >= (2, 0), 'append removed in Pandas 2.0') def test_append_sort(self): # yapf: disable df1 = pd.DataFrame({'int': [1, 2, 3], 'str': ['a', 'b', 'c']}, @@ -985,6 +986,7 @@ def test_series_fillna_series_as_value(self): self._run_test(lambda df, df2: df.A.fillna(df2.A), df, df2) + @unittest.skipIf(PD_VERSION >= (2, 0), 'append removed in Pandas 2.0') def test_append_verify_integrity(self): df1 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(10)) df2 = pd.DataFrame({'A': range(10), 'B': range(10)}, index=range(9, 19)) @@ -1683,8 +1685,15 @@ def test_groupby_series(self, agg_type): "https://github.com/apache/beam/issues/20967: proxy generation of " "DataFrameGroupBy.describe fails in pandas < 1.2") + kwargs = {} + # Behavior for numeric_only in these methods changed in Pandas 2 to default + # to False instead of True, so explicitly make it True in Pandas 2. 
+ if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'): + kwargs["numeric_only"] = True + self._run_test( - lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type)(), + lambda df: getattr(df[df.foo > 40].groupby(df.group), agg_type) + (**kwargs), GROUPBY_DF, check_proxy=False) @@ -1898,12 +1907,19 @@ def test_dataframe_groupby_series(self, agg_type): self.skipTest( "https://github.com/apache/beam/issues/20967: proxy generation of " "DataFrameGroupBy.describe fails in pandas < 1.2") + + kwargs = {} + # Behavior for numeric_only in these methods changed in Pandas 2 to default + # to False instead of True, so explicitly make it True in Pandas 2. + if PD_VERSION >= (2, 0) and agg_type in ('corr', 'cov', 'quantile'): + kwargs["numeric_only"] = True + self._run_test( - lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type), + lambda df: df[df.foo > 40].groupby(df.group).agg(agg_type, **kwargs), GROUPBY_DF, check_proxy=False) self._run_test( - lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type), + lambda df: df[df.foo > 40].groupby(df.foo % 3).agg(agg_type, **kwargs), GROUPBY_DF, check_proxy=False) @@ -2042,6 +2058,7 @@ def test_dataframe_agg_modes(self): self._run_test(lambda df: df.agg({'A': ['sum', 'mean']}), df) self._run_test(lambda df: df.agg({'A': ['sum', 'mean'], 'B': 'min'}), df) + @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2") def test_series_agg_level(self): self._run_test( lambda df: df.set_index(['group', 'foo']).bar.count(level=0), @@ -2065,6 +2082,7 @@ def test_series_agg_level(self): lambda df: df.set_index(['group', 'foo']).bar.median(level=1), GROUPBY_DF) + @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2") def test_dataframe_agg_level(self): self._run_test( lambda df: df.set_index(['group', 'foo']).count(level=0), GROUPBY_DF) @@ -2232,6 +2250,7 @@ def test_df_agg_method_invalid_kwarg_raises(self): self._run_error_test( lambda df: df.median(min_count=3, numeric_only=True), GROUPBY_DF) + @unittest.skipIf(PD_VERSION >= (2, 0), "level argument removed in Pandas 2") def test_agg_min_count(self): df = pd.DataFrame({ 'good': [1, 2, 3, np.nan], @@ -2936,7 +2955,7 @@ class DocstringTest(unittest.TestCase): (frames.DeferredDataFrame, pd.DataFrame), (frames.DeferredSeries, pd.Series), #(frames._DeferredIndex, pd.Index), - (frames._DeferredStringMethods, pd.core.strings.StringMethods), + (frames._DeferredStringMethods, pd.Series.str), ( frames._DeferredCategoricalMethods, pd.core.arrays.categorical.CategoricalAccessor), diff --git a/sdks/python/apache_beam/runners/worker/data_sampler.py b/sdks/python/apache_beam/runners/worker/data_sampler.py index a5992b9cebac..303648738f3d 100644 --- a/sdks/python/apache_beam/runners/worker/data_sampler.py +++ b/sdks/python/apache_beam/runners/worker/data_sampler.py @@ -49,11 +49,19 @@ class SampleTimer: """Periodic timer for sampling elements.""" def __init__(self, timeout_secs: float, sampler: OutputSampler) -> None: - self._timeout_secs = timeout_secs + self._target_timeout_secs = timeout_secs + self._timeout_secs = min(timeout_secs, 0.5) if timeout_secs > 0 else 0.0 self._timer = Timer(self._timeout_secs, self.sample) self._sampler = sampler + self._sample_duration_secs = 0.0 def reset(self) -> None: + # For the first 30 seconds, sample every 0.5 seconds. After that, sample at + # the normal rate. 
+ if self._sample_duration_secs >= 30.0: + self._timeout_secs = self._target_timeout_secs + self._sample_duration_secs += self._timeout_secs + self._timer.cancel() self._timer = Timer(self._timeout_secs, self.sample) self._timer.start() diff --git a/sdks/python/apache_beam/transforms/ptransform.py b/sdks/python/apache_beam/transforms/ptransform.py index c7eaa152ae06..28614c6561c7 100644 --- a/sdks/python/apache_beam/transforms/ptransform.py +++ b/sdks/python/apache_beam/transforms/ptransform.py @@ -38,11 +38,13 @@ class and wrapper class that allows lambda functions to be used as import copy import itertools +import json import logging import operator import os import sys import threading +import warnings from functools import reduce from functools import wraps from typing import TYPE_CHECKING @@ -83,6 +85,7 @@ class and wrapper class that allows lambda functions to be used as from apache_beam.typehints.trivial_inference import instance_to_type from apache_beam.typehints.typehints import validate_composite_type_param from apache_beam.utils import proto_utils +from apache_beam.utils import python_callable if TYPE_CHECKING: from apache_beam import coders @@ -95,6 +98,7 @@ class and wrapper class that allows lambda functions to be used as 'PTransform', 'ptransform_fn', 'label_from_callable', + 'annotate_yaml', ] _LOGGER = logging.getLogger(__name__) @@ -1096,3 +1100,51 @@ def __ror__(self, pvalueish, _unused=None): def expand(self, pvalue): raise RuntimeError("Should never be expanded directly.") + + +# Defined here to avoid circular import issues for Beam library transforms. +def annotate_yaml(constructor): + """Causes instances of this transform to be annotated with their yaml syntax. + + Should only be used for transforms that are fully defined by their constructor + arguments. + """ + @wraps(constructor) + def wrapper(*args, **kwargs): + transform = constructor(*args, **kwargs) + + fully_qualified_name = ( + f'{constructor.__module__}.{constructor.__qualname__}') + try: + imported_constructor = ( + python_callable.PythonCallableWithSource. + load_from_fully_qualified_name(fully_qualified_name)) + if imported_constructor != wrapper: + raise ImportError('Different object.') + except ImportError: + warnings.warn(f'Cannot import {constructor} as {fully_qualified_name}.') + return transform + + try: + config = json.dumps({ + 'constructor': fully_qualified_name, + 'args': args, + 'kwargs': kwargs, + }) + except TypeError as exn: + warnings.warn( + f'Cannot serialize arguments for {constructor} as json: {exn}') + return transform + + original_annotations = transform.annotations + transform.annotations = lambda: { + **original_annotations(), + # These override whatever may have been provided earlier. + # The outermost call is expected to be the most specific. 
+ 'yaml_provider': 'python', + 'yaml_type': 'PyTransform', + 'yaml_args': config, + } + return transform + + return wrapper diff --git a/sdks/python/apache_beam/typehints/trivial_inference.py b/sdks/python/apache_beam/typehints/trivial_inference.py index f4b350e8f052..a880b5c70ea1 100644 --- a/sdks/python/apache_beam/typehints/trivial_inference.py +++ b/sdks/python/apache_beam/typehints/trivial_inference.py @@ -399,7 +399,10 @@ def infer_return_type_func(f, input_types, debug=False, depth=0): jump_multiplier = 1 last_pc = -1 + last_real_opname = opname = None while pc < end: # pylint: disable=too-many-nested-blocks + if opname not in ('PRECALL', 'CACHE'): + last_real_opname = opname start = pc instruction = ofs_table[pc] op = instruction.opcode @@ -534,13 +537,13 @@ def infer_return_type_func(f, input_types, debug=False, depth=0): return_type = Any state.kw_names = None else: - # Handle lambdas always having an arg of 0 for CALL + # Handle comprehensions always having an arg of 0 for CALL # See https://github.com/python/cpython/issues/102403 for context. - if pop_count == 1: - while pop_count <= len(state.stack): - if isinstance(state.stack[-pop_count], Const): - break - pop_count += 1 + if (pop_count == 1 and last_real_opname == 'GET_ITER' and + len(state.stack) > 1 and isinstance(state.stack[-2], Const) and + getattr(state.stack[-2].value, '__name__', None) in ( + '', '', '', '')): + pop_count += 1 if depth <= 0 or pop_count > len(state.stack): return_type = Any elif isinstance(state.stack[-pop_count], Const): diff --git a/sdks/python/apache_beam/typehints/trivial_inference_test.py b/sdks/python/apache_beam/typehints/trivial_inference_test.py index d8cc2ab19a03..4341d11d3604 100644 --- a/sdks/python/apache_beam/typehints/trivial_inference_test.py +++ b/sdks/python/apache_beam/typehints/trivial_inference_test.py @@ -251,11 +251,30 @@ def testCall(self): self.assertReturnType( typehints.Tuple[int, typehints.Any], lambda: (1, f(x=1.0))) + def testCallNullaryMethod(self): + class Foo: + pass + + self.assertReturnType( + typehints.Tuple[Foo, typehints.Any], lambda x: (x, x.unknown()), [Foo]) + + def testCallNestedLambda(self): + class Foo: + pass + + self.assertReturnType( + typehints.Tuple[Foo, int], lambda x: (x, (lambda: 3)()), [Foo]) + def testClosure(self): x = 1 y = 1.0 self.assertReturnType(typehints.Tuple[int, float], lambda: (x, y)) + @unittest.skip("https://github.com/apache/beam/issues/28420") + def testLocalClosure(self): + self.assertReturnType( + typehints.Tuple[int, int], lambda x: (x, (lambda: x)()), [int]) + def testGlobals(self): self.assertReturnType(int, lambda: global_int) diff --git a/sdks/python/apache_beam/yaml/yaml_provider.py b/sdks/python/apache_beam/yaml/yaml_provider.py index d42d7aaffeee..6e035811d4b9 100644 --- a/sdks/python/apache_beam/yaml/yaml_provider.py +++ b/sdks/python/apache_beam/yaml/yaml_provider.py @@ -32,6 +32,7 @@ from typing import Dict from typing import Iterable from typing import Mapping +from typing import Optional import yaml from yaml.loader import SafeLoader @@ -57,6 +58,9 @@ def available(self) -> bool: """Returns whether this provider is available to use in this environment.""" raise NotImplementedError(type(self)) + def cache_artifacts(self) -> Optional[Iterable[str]]: + raise NotImplementedError(type(self)) + def provided_transforms(self) -> Iterable[str]: """Returns a list of transform type names this provider can handle.""" raise NotImplementedError(type(self)) @@ -256,17 +260,24 @@ def available(self): self._is_available = 
False return self._is_available + def cache_artifacts(self): + pass + class ExternalJavaProvider(ExternalProvider): def __init__(self, urns, jar_provider): super().__init__( urns, lambda: external.JavaJarExpansionService(jar_provider())) + self._jar_provider = jar_provider def available(self): # pylint: disable=subprocess-run-check return subprocess.run(['which', 'java'], capture_output=True).returncode == 0 + def cache_artifacts(self): + return [self._jar_provider()] + @ExternalProvider.register_provider_type('python') def python(urns, packages=()): @@ -289,6 +300,9 @@ def __init__(self, urns, packages): def available(self): return True # If we're running this script, we have Python installed. + def cache_artifacts(self): + return [self._service._venv()] + def create_external_transform(self, urn, args): # Python transforms are "registered" by fully qualified name. return external.ExternalTransform( @@ -351,6 +365,9 @@ def __init__(self, transform_factories): def available(self): return True + def cache_artifacts(self): + pass + def provided_transforms(self): return self._transform_factories.keys() @@ -527,23 +544,60 @@ def __init__(self, packages, base_python=sys.executable): self._packages = packages self._base_python = base_python - def _key(self): - return json.dumps({'binary': self._base_python, 'packages': self._packages}) + @classmethod + def _key(cls, base_python, packages): + return json.dumps({ + 'binary': base_python, 'packages': sorted(packages) + }, + sort_keys=True) - def _venv(self): - venv = os.path.join( - self.VENV_CACHE, - hashlib.sha256(self._key().encode('utf-8')).hexdigest()) + @classmethod + def _path(cls, base_python, packages): + return os.path.join( + cls.VENV_CACHE, + hashlib.sha256(cls._key(base_python, + packages).encode('utf-8')).hexdigest()) + + @classmethod + def _create_venv_from_scratch(cls, base_python, packages): + venv = cls._path(base_python, packages) if not os.path.exists(venv): - python_binary = os.path.join(venv, 'bin', 'python') - subprocess.run([self._base_python, '-m', 'venv', venv], check=True) - subprocess.run([python_binary, '-m', 'ensurepip'], check=True) - subprocess.run([python_binary, '-m', 'pip', 'install'] + self._packages, + subprocess.run([base_python, '-m', 'venv', venv], check=True) + venv_python = os.path.join(venv, 'bin', 'python') + subprocess.run([venv_python, '-m', 'ensurepip'], check=True) + subprocess.run([venv_python, '-m', 'pip', 'install'] + packages, check=True) with open(venv + '-requirements.txt', 'w') as fout: - fout.write('\n'.join(self._packages)) + fout.write('\n'.join(packages)) return venv + @classmethod + def _create_venv_from_clone(cls, base_python, packages): + venv = cls._path(base_python, packages) + if not os.path.exists(venv): + clonable_venv = cls._create_venv_to_clone(base_python) + clonable_python = os.path.join(clonable_venv, 'bin', 'python') + subprocess.run( + [clonable_python, '-m', 'clonevirtualenv', clonable_venv, venv], + check=True) + venv_binary = os.path.join(venv, 'bin', 'python') + subprocess.run([venv_binary, '-m', 'pip', 'install'] + packages, + check=True) + with open(venv + '-requirements.txt', 'w') as fout: + fout.write('\n'.join(packages)) + return venv + + @classmethod + def _create_venv_to_clone(cls, base_python): + return cls._create_venv_from_scratch( + base_python, [ + 'apache_beam[dataframe,gcp,test]==' + beam_version, + 'virtualenv-clone' + ]) + + def _venv(self): + return self._create_venv_from_clone(self._base_python, self._packages) + def __enter__(self): venv = self._venv() 
self._service_provider = subprocess_server.SubprocessServer( diff --git a/sdks/python/apache_beam/yaml/yaml_transform_test.py b/sdks/python/apache_beam/yaml/yaml_transform_test.py index f969761092e0..26baebec86e4 100644 --- a/sdks/python/apache_beam/yaml/yaml_transform_test.py +++ b/sdks/python/apache_beam/yaml/yaml_transform_test.py @@ -250,6 +250,23 @@ def test_name_is_ambiguous(self): output: AnotherFilter ''') + def test_annotations(self): + t = LinearTransform(5, b=100) + annotations = t.annotations() + with beam.Pipeline(options=beam.options.pipeline_options.PipelineOptions( + pickle_library='cloudpickle')) as p: + result = p | YamlTransform( + ''' + type: chain + transforms: + - type: Create + config: + elements: [0, 1, 2, 3] + - type: %r + config: %s + ''' % (annotations['yaml_type'], annotations['yaml_args'])) + assert_that(result, equal_to([100, 105, 110, 115])) + class CreateTimestamped(beam.PTransform): def __init__(self, elements): @@ -631,6 +648,19 @@ def test_prefers_same_provider_class(self): label='StartWith3') +@beam.transforms.ptransform.annotate_yaml +class LinearTransform(beam.PTransform): + """A transform used for testing annotate_yaml.""" + def __init__(self, a, b): + self._a = a + self._b = b + + def expand(self, pcoll): + a = self._a + b = self._b + return pcoll | beam.Map(lambda x: a * x + b) + + if __name__ == '__main__': logging.getLogger().setLevel(logging.INFO) unittest.main() diff --git a/sdks/python/container/base_image_requirements_manual.txt b/sdks/python/container/base_image_requirements_manual.txt index a1d80320d42d..e952b2126604 100644 --- a/sdks/python/container/base_image_requirements_manual.txt +++ b/sdks/python/container/base_image_requirements_manual.txt @@ -43,4 +43,3 @@ nose==1.3.7 # For Dataflow internal testing. TODO: remove this. python-snappy;python_version<"3.11" # Optimizes execution of some Beam codepaths. scipy scikit-learn -tensorflow>=2.12.0 diff --git a/sdks/python/container/boot.go b/sdks/python/container/boot.go index 73a2f8324401..da3b4a1536ec 100644 --- a/sdks/python/container/boot.go +++ b/sdks/python/container/boot.go @@ -371,13 +371,7 @@ func setupAcceptableWheelSpecs() error { return fmt.Errorf("cannot get parse Python version from %s", stdoutStderr) } pyVersion := fmt.Sprintf("%s%s", pyVersions[1], pyVersions[2]) - var wheelName string - switch pyVersion { - case "36", "37": - wheelName = fmt.Sprintf("cp%s-cp%sm-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion) - default: - wheelName = fmt.Sprintf("cp%s-cp%s-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion) - } + wheelName := fmt.Sprintf("cp%s-cp%s-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", pyVersion, pyVersion) acceptableWhlSpecs = append(acceptableWhlSpecs, wheelName) return nil } diff --git a/sdks/python/container/py310/base_image_requirements.txt b/sdks/python/container/py310/base_image_requirements.txt index 0cae8ca6ef08..58aca4a4aea7 100644 --- a/sdks/python/container/py310/base_image_requirements.txt +++ b/sdks/python/container/py310/base_image_requirements.txt @@ -21,8 +21,6 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. 
-absl-py==1.4.0 -astunparse==1.6.3 attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 @@ -30,7 +28,7 @@ cachetools==5.3.1 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==3.2.0 -click==8.1.6 +click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 cryptography==41.0.3 @@ -40,74 +38,63 @@ dill==0.3.1.1 dnspython==2.4.2 docker==6.1.3 docopt==0.6.2 -exceptiongroup==1.1.2 +exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.2 +fastavro==1.8.3 fasteners==0.18 -flatbuffers==23.5.26 freezegun==1.2.2 future==0.18.3 -gast==0.4.0 google-api-core==2.11.1 -google-api-python-client==2.96.0 +google-api-python-client==2.99.0 google-apitools==0.5.31 -google-auth==2.22.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==1.0.0 -google-cloud-aiplatform==1.29.0 +google-auth==2.23.0 +google-auth-httplib2==0.1.1 +google-cloud-aiplatform==1.32.0 google-cloud-bigquery==3.11.4 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 -google-cloud-datastore==2.17.0 +google-cloud-datastore==2.18.0 google-cloud-dlp==3.12.2 -google-cloud-language==2.10.1 -google-cloud-profiler==4.0.0 -google-cloud-pubsub==2.18.2 +google-cloud-language==2.11.0 +google-cloud-profiler==4.1.0 +google-cloud-pubsub==2.18.4 google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.4 google-cloud-resource-manager==1.10.3 -google-cloud-spanner==3.40.0 +google-cloud-spanner==3.40.1 google-cloud-storage==2.10.0 google-cloud-videointelligence==2.11.3 google-cloud-vision==3.4.4 google-crc32c==1.5.0 -google-pasta==0.2.0 -google-resumable-media==2.5.0 +google-resumable-media==2.6.0 googleapis-common-protos==1.60.0 greenlet==2.0.2 grpc-google-iam-v1==0.12.6 -grpcio==1.56.2 -grpcio-status==1.56.2 +grpcio==1.58.0 +grpcio-status==1.58.0 guppy3==3.1.3 -h5py==3.9.0 hdfs==2.7.2 httplib2==0.22.0 -hypothesis==6.82.3 +hypothesis==6.84.3 idna==3.4 iniconfig==2.0.0 joblib==1.3.2 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 nose==1.3.7 -numpy==1.24.3 +numpy==1.24.4 oauth2client==4.1.3 -oauthlib==3.2.2 objsize==0.6.1 -opt-einsum==3.3.0 -orjson==3.9.2 +orjson==3.9.7 overrides==6.5.0 packaging==23.1 pandas==1.5.3 parameterized==0.9.0 -pluggy==1.2.0 +pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.23.4 +protobuf==4.24.3 psycopg2-binary==2.9.7 pyarrow==11.0.0 pyasn1==0.5.0 @@ -115,45 +102,36 @@ pyasn1-modules==0.3.0 pycparser==2.21 pydot==1.4.2 PyHamcrest==2.0.4 -pymongo==4.4.1 +pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 -pytest==7.4.0 +pytest==7.4.2 pytest-timeout==2.1.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 -pytz==2023.3 +pytz==2023.3.post1 PyYAML==6.0.1 regex==2023.8.8 requests==2.31.0 requests-mock==1.11.0 -requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.3.0 -scipy==1.11.1 +scipy==1.11.2 Shapely==1.8.5.post1 six==1.16.0 sortedcontainers==2.4.0 -soupsieve==2.4.1 +soupsieve==2.5 SQLAlchemy==1.4.49 sqlparse==0.4.4 -tenacity==8.2.2 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorflow==2.13.0 -tensorflow-cpu-aws==2.13.0;platform_machine=="aarch64" -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 -termcolor==2.3.0 +tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.2.0 tomli==2.0.1 -tqdm==4.66.0 -typing_extensions==4.5.0 +tqdm==4.66.1 +typing_extensions==4.7.1 uritemplate==4.1.1 urllib3==1.26.16 -websocket-client==1.6.1 -Werkzeug==2.3.6 +websocket-client==1.6.3 wrapt==1.15.0 zstandard==0.21.0 diff --git a/sdks/python/container/py311/base_image_requirements.txt 
b/sdks/python/container/py311/base_image_requirements.txt index 241a9ad581cb..5aaeba15c69e 100644 --- a/sdks/python/container/py311/base_image_requirements.txt +++ b/sdks/python/container/py311/base_image_requirements.txt @@ -21,8 +21,6 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -absl-py==1.4.0 -astunparse==1.6.3 attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 @@ -30,7 +28,7 @@ cachetools==5.3.1 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==3.2.0 -click==8.1.6 +click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 cryptography==41.0.3 @@ -41,70 +39,59 @@ dnspython==2.4.2 docker==6.1.3 docopt==0.6.2 execnet==2.0.2 -fastavro==1.8.2 +fastavro==1.8.3 fasteners==0.18 -flatbuffers==23.5.26 freezegun==1.2.2 future==0.18.3 -gast==0.4.0 google-api-core==2.11.1 google-apitools==0.5.31 -google-auth==2.22.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==1.0.0 -google-cloud-aiplatform==1.29.0 +google-auth==2.23.0 +google-auth-httplib2==0.1.1 +google-cloud-aiplatform==1.32.0 google-cloud-bigquery==3.11.4 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 -google-cloud-datastore==2.17.0 +google-cloud-datastore==2.18.0 google-cloud-dlp==3.12.2 -google-cloud-language==2.10.1 -google-cloud-pubsub==2.18.2 +google-cloud-language==2.11.0 +google-cloud-pubsub==2.18.4 google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.4 google-cloud-resource-manager==1.10.3 -google-cloud-spanner==3.40.0 +google-cloud-spanner==3.40.1 google-cloud-storage==2.10.0 google-cloud-videointelligence==2.11.3 google-cloud-vision==3.4.4 google-crc32c==1.5.0 -google-pasta==0.2.0 -google-resumable-media==2.5.0 +google-resumable-media==2.6.0 googleapis-common-protos==1.60.0 greenlet==2.0.2 grpc-google-iam-v1==0.12.6 -grpcio==1.56.2 -grpcio-status==1.56.2 +grpcio==1.58.0 +grpcio-status==1.58.0 guppy3==3.1.3 -h5py==3.9.0 hdfs==2.7.2 httplib2==0.22.0 -hypothesis==6.82.3 +hypothesis==6.84.3 idna==3.4 iniconfig==2.0.0 joblib==1.3.2 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 nose==1.3.7 -numpy==1.24.3 +numpy==1.24.4 oauth2client==4.1.3 -oauthlib==3.2.2 objsize==0.6.1 -opt-einsum==3.3.0 -orjson==3.9.2 +orjson==3.9.7 overrides==6.5.0 packaging==23.1 pandas==1.5.3 parameterized==0.9.0 -pluggy==1.2.0 +pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.23.4 +protobuf==4.24.3 psycopg2-binary==2.9.7 pyarrow==11.0.0 pyasn1==0.5.0 @@ -112,42 +99,33 @@ pyasn1-modules==0.3.0 pycparser==2.21 pydot==1.4.2 PyHamcrest==2.0.4 -pymongo==4.4.1 +pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 -pytest==7.4.0 +pytest==7.4.2 pytest-timeout==2.1.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 -pytz==2023.3 +pytz==2023.3.post1 PyYAML==6.0.1 regex==2023.8.8 requests==2.31.0 requests-mock==1.11.0 -requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.3.0 -scipy==1.11.1 +scipy==1.11.2 Shapely==1.8.5.post1 six==1.16.0 sortedcontainers==2.4.0 -soupsieve==2.4.1 +soupsieve==2.5 SQLAlchemy==1.4.49 sqlparse==0.4.4 -tenacity==8.2.2 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorflow==2.13.0 -tensorflow-cpu-aws==2.13.0;platform_machine=="aarch64" -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 -termcolor==2.3.0 +tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.2.0 -tqdm==4.66.0 -typing_extensions==4.5.0 +tqdm==4.66.1 +typing_extensions==4.7.1 urllib3==1.26.16 -websocket-client==1.6.1 -Werkzeug==2.3.6 +websocket-client==1.6.3 wrapt==1.15.0 zstandard==0.21.0 diff --git 
a/sdks/python/container/py38/base_image_requirements.txt b/sdks/python/container/py38/base_image_requirements.txt index 96caec61c09a..472ee0c0bf8d 100644 --- a/sdks/python/container/py38/base_image_requirements.txt +++ b/sdks/python/container/py38/base_image_requirements.txt @@ -21,8 +21,6 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -absl-py==1.4.0 -astunparse==1.6.3 attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 @@ -30,7 +28,7 @@ cachetools==5.3.1 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==3.2.0 -click==8.1.6 +click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 cryptography==41.0.3 @@ -40,75 +38,63 @@ dill==0.3.1.1 dnspython==2.4.2 docker==6.1.3 docopt==0.6.2 -exceptiongroup==1.1.2 +exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.2 +fastavro==1.8.3 fasteners==0.18 -flatbuffers==23.5.26 freezegun==1.2.2 future==0.18.3 -gast==0.4.0 google-api-core==2.11.1 -google-api-python-client==2.96.0 +google-api-python-client==2.99.0 google-apitools==0.5.31 -google-auth==2.22.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==1.0.0 -google-cloud-aiplatform==1.29.0 +google-auth==2.23.0 +google-auth-httplib2==0.1.1 +google-cloud-aiplatform==1.32.0 google-cloud-bigquery==3.11.4 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 -google-cloud-datastore==2.17.0 +google-cloud-datastore==2.18.0 google-cloud-dlp==3.12.2 -google-cloud-language==2.10.1 -google-cloud-profiler==4.0.0 -google-cloud-pubsub==2.18.2 +google-cloud-language==2.11.0 +google-cloud-profiler==4.1.0 +google-cloud-pubsub==2.18.4 google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.4 google-cloud-resource-manager==1.10.3 -google-cloud-spanner==3.40.0 +google-cloud-spanner==3.40.1 google-cloud-storage==2.10.0 google-cloud-videointelligence==2.11.3 google-cloud-vision==3.4.4 google-crc32c==1.5.0 -google-pasta==0.2.0 -google-resumable-media==2.5.0 +google-resumable-media==2.6.0 googleapis-common-protos==1.60.0 greenlet==2.0.2 grpc-google-iam-v1==0.12.6 -grpcio==1.56.2 -grpcio-status==1.56.2 +grpcio==1.58.0 +grpcio-status==1.58.0 guppy3==3.1.3 -h5py==3.9.0 hdfs==2.7.2 httplib2==0.22.0 -hypothesis==6.82.3 +hypothesis==6.84.3 idna==3.4 -importlib-metadata==6.8.0 iniconfig==2.0.0 joblib==1.3.2 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 nose==1.3.7 -numpy==1.24.3 +numpy==1.24.4 oauth2client==4.1.3 -oauthlib==3.2.2 objsize==0.6.1 -opt-einsum==3.3.0 -orjson==3.9.2 +orjson==3.9.7 overrides==6.5.0 packaging==23.1 pandas==1.5.3 parameterized==0.9.0 -pluggy==1.2.0 +pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.23.4 +protobuf==4.24.3 psycopg2-binary==2.9.7 pyarrow==11.0.0 pyasn1==0.5.0 @@ -116,46 +102,36 @@ pyasn1-modules==0.3.0 pycparser==2.21 pydot==1.4.2 PyHamcrest==2.0.4 -pymongo==4.4.1 +pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 -pytest==7.4.0 +pytest==7.4.2 pytest-timeout==2.1.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 -pytz==2023.3 +pytz==2023.3.post1 PyYAML==6.0.1 regex==2023.8.8 requests==2.31.0 requests-mock==1.11.0 -requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.3.0 scipy==1.10.1 Shapely==1.8.5.post1 six==1.16.0 sortedcontainers==2.4.0 -soupsieve==2.4.1 +soupsieve==2.5 SQLAlchemy==1.4.49 sqlparse==0.4.4 -tenacity==8.2.2 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorflow==2.13.0 -tensorflow-cpu-aws==2.13.0;platform_machine=="aarch64" -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 -termcolor==2.3.0 
+tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.2.0 tomli==2.0.1 -tqdm==4.66.0 -typing_extensions==4.5.0 +tqdm==4.66.1 +typing_extensions==4.7.1 uritemplate==4.1.1 urllib3==1.26.16 -websocket-client==1.6.1 -Werkzeug==2.3.6 +websocket-client==1.6.3 wrapt==1.15.0 -zipp==3.16.2 zstandard==0.21.0 diff --git a/sdks/python/container/py39/base_image_requirements.txt b/sdks/python/container/py39/base_image_requirements.txt index 417b82fbb29b..257bcf9869e2 100644 --- a/sdks/python/container/py39/base_image_requirements.txt +++ b/sdks/python/container/py39/base_image_requirements.txt @@ -21,8 +21,6 @@ # https://s.apache.org/beam-python-dev-wiki # Reach out to a committer if you need help. -absl-py==1.4.0 -astunparse==1.6.3 attrs==23.1.0 beautifulsoup4==4.12.2 bs4==0.0.1 @@ -30,7 +28,7 @@ cachetools==5.3.1 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==3.2.0 -click==8.1.6 +click==8.1.7 cloudpickle==2.2.1 crcmod==1.7 cryptography==41.0.3 @@ -40,75 +38,63 @@ dill==0.3.1.1 dnspython==2.4.2 docker==6.1.3 docopt==0.6.2 -exceptiongroup==1.1.2 +exceptiongroup==1.1.3 execnet==2.0.2 -fastavro==1.8.2 +fastavro==1.8.3 fasteners==0.18 -flatbuffers==23.5.26 freezegun==1.2.2 future==0.18.3 -gast==0.4.0 google-api-core==2.11.1 -google-api-python-client==2.96.0 +google-api-python-client==2.99.0 google-apitools==0.5.31 -google-auth==2.22.0 -google-auth-httplib2==0.1.0 -google-auth-oauthlib==1.0.0 -google-cloud-aiplatform==1.29.0 +google-auth==2.23.0 +google-auth-httplib2==0.1.1 +google-cloud-aiplatform==1.32.0 google-cloud-bigquery==3.11.4 google-cloud-bigquery-storage==2.22.0 google-cloud-bigtable==2.21.0 google-cloud-core==2.3.3 -google-cloud-datastore==2.17.0 +google-cloud-datastore==2.18.0 google-cloud-dlp==3.12.2 -google-cloud-language==2.10.1 -google-cloud-profiler==4.0.0 -google-cloud-pubsub==2.18.2 +google-cloud-language==2.11.0 +google-cloud-profiler==4.1.0 +google-cloud-pubsub==2.18.4 google-cloud-pubsublite==1.8.3 google-cloud-recommendations-ai==0.10.4 google-cloud-resource-manager==1.10.3 -google-cloud-spanner==3.40.0 +google-cloud-spanner==3.40.1 google-cloud-storage==2.10.0 google-cloud-videointelligence==2.11.3 google-cloud-vision==3.4.4 google-crc32c==1.5.0 -google-pasta==0.2.0 -google-resumable-media==2.5.0 +google-resumable-media==2.6.0 googleapis-common-protos==1.60.0 greenlet==2.0.2 grpc-google-iam-v1==0.12.6 -grpcio==1.56.2 -grpcio-status==1.56.2 +grpcio==1.58.0 +grpcio-status==1.58.0 guppy3==3.1.3 -h5py==3.9.0 hdfs==2.7.2 httplib2==0.22.0 -hypothesis==6.82.3 +hypothesis==6.84.3 idna==3.4 -importlib-metadata==6.8.0 iniconfig==2.0.0 joblib==1.3.2 -keras==2.13.1 -libclang==16.0.6 -Markdown==3.4.4 -MarkupSafe==2.1.3 mmh3==4.0.1 mock==5.1.0 nltk==3.8.1 nose==1.3.7 -numpy==1.24.3 +numpy==1.24.4 oauth2client==4.1.3 -oauthlib==3.2.2 objsize==0.6.1 -opt-einsum==3.3.0 -orjson==3.9.2 +orjson==3.9.7 overrides==6.5.0 packaging==23.1 pandas==1.5.3 parameterized==0.9.0 -pluggy==1.2.0 +pluggy==1.3.0 proto-plus==1.22.3 -protobuf==4.23.4 +protobuf==4.24.3 psycopg2-binary==2.9.7 pyarrow==11.0.0 pyasn1==0.5.0 @@ -116,46 +102,36 @@ pyasn1-modules==0.3.0 pycparser==2.21 pydot==1.4.2 PyHamcrest==2.0.4 -pymongo==4.4.1 +pymongo==4.5.0 PyMySQL==1.1.0 pyparsing==3.1.1 -pytest==7.4.0 +pytest==7.4.2 pytest-timeout==2.1.0 pytest-xdist==3.3.1 python-dateutil==2.8.2 python-snappy==0.6.1 -pytz==2023.3 +pytz==2023.3.post1 PyYAML==6.0.1 regex==2023.8.8 requests==2.31.0 requests-mock==1.11.0 -requests-oauthlib==1.3.1 rsa==4.9 scikit-learn==1.3.0 -scipy==1.11.1 +scipy==1.11.2 Shapely==1.8.5.post1 six==1.16.0 
sortedcontainers==2.4.0 -soupsieve==2.4.1 +soupsieve==2.5 SQLAlchemy==1.4.49 sqlparse==0.4.4 -tenacity==8.2.2 -tensorboard==2.13.0 -tensorboard-data-server==0.7.1 -tensorflow==2.13.0 -tensorflow-cpu-aws==2.13.0;platform_machine=="aarch64" -tensorflow-estimator==2.13.0 -tensorflow-io-gcs-filesystem==0.33.0 -termcolor==2.3.0 +tenacity==8.2.3 testcontainers==3.7.1 threadpoolctl==3.2.0 tomli==2.0.1 -tqdm==4.66.0 -typing_extensions==4.5.0 +tqdm==4.66.1 +typing_extensions==4.7.1 uritemplate==4.1.1 urllib3==1.26.16 -websocket-client==1.6.1 -Werkzeug==2.3.6 +websocket-client==1.6.3 wrapt==1.15.0 -zipp==3.16.2 zstandard==0.21.0 diff --git a/sdks/python/setup.py b/sdks/python/setup.py index 355b75ee9023..d5ca354fcfbe 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -230,7 +230,7 @@ def get_portability_package_data(): language_level=3), install_requires=[ 'crcmod>=1.7,<2.0', - 'orjson<3.9.3', # https://github.com/ijl/orjson/issues/415 + 'orjson>=3.9.7,<4', # Dill doesn't have forwards-compatibility guarantees within minor # version. Pickles created with a new version of dill may not unpickle # using older version of dill. It is best to use the same version of @@ -254,12 +254,17 @@ def get_portability_package_data(): 'packaging>=22.0', 'pymongo>=3.8.0,<5.0.0', 'proto-plus>=1.7.1,<2', - # use a tighter upper bound in protobuf dependency - # to make sure the minor version at job submission + # 1. Use a tighter upper bound in protobuf dependency to make sure + # the minor version at job submission # does not exceed the minor version at runtime. # To avoid depending on an old dependency, update the minor version on # every Beam release, see: https://github.com/apache/beam/issues/25590 - 'protobuf>=3.20.3,<4.24.0', + + # 2. Allow latest protobuf 3 version as a courtesy to some customers. + # + # 3. Exclude protobuf 4 versions that leak memory, see: + # https://github.com/apache/beam/issues/28246 + 'protobuf>=3.20.3,<4.25.0,!=4.0.*,!=4.21.*,!=4.22.0,!=4.23.*,!=4.24.0,!=4.24.1,!=4.24.2', # pylint: disable=line-too-long 'pydot>=1.2.0,<2', 'python-dateutil>=2.8.0,<3', 'pytz>=2018.3', diff --git a/website/www/site/content/en/blog/beam-2.50.0.md b/website/www/site/content/en/blog/beam-2.50.0.md index 7610459087c5..4cfddd6167a6 100644 --- a/website/www/site/content/en/blog/beam-2.50.0.md +++ b/website/www/site/content/en/blog/beam-2.50.0.md @@ -77,6 +77,7 @@ For more information on changes in 2.50.0, check out the [detailed release notes * Fixed DirectRunner bug in Python SDK where GroupByKey gets empty PCollection and fails when pipeline option `direct_num_workers!=1`.([#27373](https://github.com/apache/beam/pull/27373)) * Fixed BigQuery I/O bug when estimating size on queries that utilize row-level security ([#27474](https://github.com/apache/beam/pull/27474)) +* Beam Python containers rely on a version of Debian/aom that has several security vulnerabilities: [CVE-2021-30474](https://nvd.nist.gov/vuln/detail/CVE-2021-30474), [CVE-2021-30475](https://nvd.nist.gov/vuln/detail/CVE-2021-30475), [CVE-2021-30473](https://nvd.nist.gov/vuln/detail/CVE-2021-30473), [CVE-2020-36133](https://nvd.nist.gov/vuln/detail/CVE-2020-36133), [CVE-2020-36131](https://nvd.nist.gov/vuln/detail/CVE-2020-36131), [CVE-2020-36130](https://nvd.nist.gov/vuln/detail/CVE-2020-36130), and [CVE-2020-36135](https://nvd.nist.gov/vuln/detail/CVE-2020-36135). 
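As a quick sanity check, the combined protobuf specifier in setup.py above can be evaluated mechanically with the `packaging` library. A minimal sketch — the probe versions are illustrative, not from this patch:

```python
# Hedged sanity check for the protobuf pin in setup.py above, assuming the
# 'packaging' library is available. The specifier should admit patched
# 4.24.x releases while rejecting the 4.x versions excluded for leaking
# memory (https://github.com/apache/beam/issues/28246).
from packaging.specifiers import SpecifierSet

spec = SpecifierSet(
    '>=3.20.3,<4.25.0,!=4.0.*,!=4.21.*,!=4.22.0,!=4.23.*,'
    '!=4.24.0,!=4.24.1,!=4.24.2')

for version in ('3.20.3', '4.21.1', '4.24.2', '4.24.3'):
    print(version, version in spec)
# Expected: 3.20.3 True, 4.21.1 False, 4.24.2 False, 4.24.3 True
```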
## Known Issues

diff --git a/website/www/site/content/en/contribute/release-guide.md b/website/www/site/content/en/contribute/release-guide.md
index 19e022b65b7f..964d2bfa7050 100644
--- a/website/www/site/content/en/contribute/release-guide.md
+++ b/website/www/site/content/en/contribute/release-guide.md
@@ -1301,15 +1301,15 @@ After new Beam Release is published, Beam Playground can be updated following th
    1. Change the value for _SDK_TAG variable (Advanced -> Substitution Variables) to the actual version of Beam SDK (e.g. 2.47.0)
    1. Click the Save button. The settings window should close without any errors
    1. Click the RUN button next to the trigger name
-   1. Set the value for the _CONTAINER_TAG variable in format DD-MM-vXX (DD - day, MM - month, XX - version, e.g., 20-12-v01)
+   1. In the panel that opened, set the value for the _CONTAINER_TAG variable in the format DD-MM-vXX (DD - day, MM - month, XX - version, e.g., 20-12-v01)
    1. Click the Run Trigger button
    1. Open the [Trigger History](https://console.cloud.google.com/cloud-build/builds?project=apache-beam-testing) and wait for the job completion. Ensure that the job completed successfully (Status field shows a green tick)
-1. Find the trigger "Playground-CD-stable-manual-stg":
+1. Find the trigger "Playground-CD-stable-manual-stg". It will be run twice: once with the default variable values, and once with some of them overridden:
    1. Click the RUN button next to the trigger name
-   1. Click the Run Trigger button (with default varaible vaues)
+   1. In the panel that opened, click the Run Trigger button (with the default variable values)
    1. Open the [Trigger History](https://console.cloud.google.com/cloud-build/builds?project=apache-beam-testing) and wait for the job completion. Ensure that the job completed successfully (Status field shows a green tick)
    1. Click the RUN button next to the trigger name
-   1. Change values for the variables:
+   1. In the panel that opened, change the values for the variables:
     * _ORIGIN = PG_BEAMDOC
     * _SUBDIRS = ./learning/beamdoc
    1. Click the Run Trigger button
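For reference, the clone-based virtualenv caching added to yaml_provider.py above reduces to the standalone sketch below. The /tmp paths and the sample package are illustrative assumptions; the `clonevirtualenv` module entry point is the one the patch itself invokes via `virtualenv-clone`:

```python
# Standalone sketch of the venv-cloning strategy in yaml_provider.py above:
# build one "clonable" base venv with virtualenv-clone installed, then clone
# it per package set instead of rebuilding each venv from scratch.
# The /tmp paths and the 'pyyaml' package are illustrative, not from the patch.
import os
import subprocess
import sys


def create_base_venv(path):
    """Creates the base venv once, with virtualenv-clone available in it."""
    if not os.path.exists(path):
        subprocess.run([sys.executable, '-m', 'venv', path], check=True)
        python = os.path.join(path, 'bin', 'python')
        subprocess.run(
            [python, '-m', 'pip', 'install', 'virtualenv-clone'], check=True)
    return path


def clone_venv(base, target, packages):
    """Clones the base venv, then installs only the extra packages."""
    if not os.path.exists(target):
        base_python = os.path.join(base, 'bin', 'python')
        subprocess.run(
            [base_python, '-m', 'clonevirtualenv', base, target], check=True)
        target_python = os.path.join(target, 'bin', 'python')
        subprocess.run(
            [target_python, '-m', 'pip', 'install'] + packages, check=True)
    return target


base = create_base_venv('/tmp/beam-base-venv')
clone_venv(base, '/tmp/beam-demo-venv', ['pyyaml'])
```

In the patch, the base venv additionally pre-installs `apache_beam[dataframe,gcp,test]`, so the expensive install happens once and each per-provider clone only adds its own extra packages.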