Merge branch 'master' into issue24563_tobf_gradle

apache · Jan 9, 2023 · bb0b08e · bb0b08e
2 parents 6ebe89d + 95e5391
commit bb0b08e
Show file tree

Hide file tree

Showing 176 changed files with 17,902 additions and 1,351 deletions.
diff --git a/.github/workflows/playground_examples_ci_reusable.yml b/.github/workflows/playground_examples_ci_reusable.yml
@@ -91,6 +91,7 @@ jobs:
         working-directory: playground/infrastructure
         env:
           BEAM_ROOT_DIR: "../.."
+          BEAM_EXAMPLE_CATEGORIES: "../categories.yaml"
 
   ci_cd:
     name: ${{ inputs.step }} ${{ inputs.sdk }} ${{ inputs.origin }}

diff --git a/.gitignore b/.gitignore
@@ -127,6 +127,8 @@ website/www/yarn-error.log
 **/.packages
 **/generated_plugin_registrant.dart
 playground/frontend/playground_components/pubspec.lock
+playground/frontend/playground_components/test/tools/extract_symbols_java/dependencies
+playground/frontend/playground_components_dev/pubspec.lock
 
 # Ignore Beam Playground Terraform
 **/.terraform
@@ -136,4 +138,4 @@ playground/frontend/playground_components/pubspec.lock
 **/*.tfvars
 
 # Ignore Katas auto-generated files
-**/*-remote-info.yaml
+**/*-remote-info.yaml
diff --git a/.test-infra/jenkins/job_CloudMLBenchmarkTests_Python.groovy b/.test-infra/jenkins/job_CloudMLBenchmarkTests_Python.groovy
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import CommonJobProperties as commonJobProperties
+import PhraseTriggeringPostCommitBuilder
+import CronJobBuilder
+
+def cloudMLJob = { scope ->
+  scope.description('Runs the TFT Criteo Examples on the Dataflow runner.')
+
+  // Set common parameters.
+  commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 360)
+
+  // Gradle goals for this job.
+  scope.steps {
+    gradle {
+      rootBuildScriptDir(commonJobProperties.checkoutDir)
+      commonJobProperties.setGradleSwitches(delegate)
+      tasks(':sdks:python:test-suites:dataflow:tftTests')
+    }
+  }
+}
+
+PhraseTriggeringPostCommitBuilder.postCommitJob(
+    'beam_CloudML_Benchmarks_Dataflow',
+    'Run TFT Criteo Benchmarks',
+    'TFT Criteo benchmarks on Dataflow(\"Run TFT Criteo Benchmarks"\"")',
+    this
+    ) {
+      cloudMLJob(delegate)
+    }
+
+CronJobBuilder.cronJob(
+    'beam_CloudML_Benchmarks_Dataflow',
+    'H 14 * * *',
+    this
+    ) {
+      cloudMLJob(delegate)
+    }
diff --git a/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy b/.test-infra/jenkins/job_LoadTests_Combine_Python.groovy
@@ -100,8 +100,7 @@ def loadTestConfigurations = { datasetName, mode ->
 
 def addStreamingOptions(test){
   test.pipelineOptions << [streaming: null,
-    // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-    experiments: "use_runner_v2,shuffle_mode=appliance"
+    experiments: "use_runner_v2"
   ]
 }
 

diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Python.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Python.groovy
@@ -156,8 +156,7 @@ def addStreamingOptions(test) {
     // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs.
     // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2
     // for more details.
-    // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-    experiments: 'use_runner_v2,shuffle_mode=appliance',
+    experiments: 'use_runner_v2',
   ]
 }
 

diff --git a/.test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy b/.test-infra/jenkins/job_LoadTests_GBK_Python_reiterate.groovy
@@ -86,8 +86,7 @@ def addStreamingOptions(test) {
     // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs.
     // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2
     // for more details.
-    // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-    experiments: 'use_runner_v2,shuffle_mode=appliance',
+    experiments: 'use_runner_v2',
   ]
 }
 

diff --git a/.test-infra/jenkins/job_LoadTests_ParDo_Python.groovy b/.test-infra/jenkins/job_LoadTests_ParDo_Python.groovy
@@ -131,8 +131,7 @@ def addStreamingOptions(test) {
     // Use the new Dataflow runner, which offers improved efficiency of Dataflow jobs.
     // See https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-runner-v2
     // for more details.
-    // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-    experiments: 'use_runner_v2,shuffle_mode=appliance',
+    experiments: 'use_runner_v2',
   ]
 }
 

diff --git a/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy b/.test-infra/jenkins/job_LoadTests_SideInput_Python.groovy
@@ -39,8 +39,7 @@ def fromTemplate = { mode, name, id, datasetName, testSpecificOptions ->
       influx_measurement   : "python_${mode}_sideinput_${id}",
       num_workers          : 10,
       autoscaling_algorithm: 'NONE',
-      // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-      experiments          : 'use_runner_v2,shuffle_mode=appliance',
+      experiments          : 'use_runner_v2',
     ] << testSpecificOptions
   ]
 }

diff --git a/.test-infra/jenkins/job_PerformanceTests_KafkaIO_IT.groovy b/.test-infra/jenkins/job_PerformanceTests_KafkaIO_IT.groovy
@@ -106,8 +106,7 @@ job(jobName) {
     readTimeout                  : '1500',
     bigQueryTable                : 'kafkaioit_results_runner_v2',
     influxMeasurement            : 'kafkaioit_results_runner_v2',
-    // TODO(https://github.com/apache/beam/issues/20806) remove shuffle_mode=appliance with runner v2 once issue is resolved.
-    experiments                  : 'use_runner_v2,shuffle_mode=appliance,use_unified_worker',
+    experiments                  : 'use_runner_v2,use_unified_worker',
   ]
 
   steps {

diff --git a/.test-infra/metrics/sync/github/sync_workflows.py b/.test-infra/metrics/sync/github/sync_workflows.py
@@ -0,0 +1,187 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+'''
+This module queries GitHub to collect Beam-related workflows metrics and put them in
+PostgreSQL.
+This Script is running every 3 hours in a cloud function in apache-beam-testing project.
+This cloud function is triggered by a pubsub topic.
+You can find the cloud function in the next link 
+https://console.cloud.google.com/functions/details/us-central1/github_actions_workflows_dashboard_sync?env=gen1&project=apache-beam-testing
+Pub sub topic : https://console.cloud.google.com/cloudpubsub/topic/detail/github_actions_workflows_sync?project=apache-beam-testing
+Cron Job : https://console.cloud.google.com/cloudscheduler/jobs/edit/us-central1/github_actions_workflows_dashboard_sync?project=apache-beam-testing
+Writing the latest 10 runs of every postcommit workflow in master branch in a beammetrics database
+'''
+
+import os
+import sys
+import time
+import re
+import requests
+import psycopg2
+
+from datetime import datetime
+from github import GithubIntegration 
+
+DB_HOST = os.environ['DB_HOST']
+DB_PORT = os.environ['DB_PORT']
+DB_NAME = os.environ['DB_NAME']
+DB_USER_NAME = os.environ['DB_USER']
+DB_PASSWORD = os.environ['DB_PASS']
+GH_WORKFLOWS_TABLE_NAME = "github_workflows"
+# Number of workflows that fetch github API
+GH_NUMBER_OF_WORKFLOWS = 100  
+GH_WORKFLOWS_NUMBER_EXECUTIONS = 100
+WORKFLOWS_OBJECT_LIST = []
+
+
+class Workflow:
+    def __init__(self,id,name,filename):
+        self.id = id
+        self.name = name
+        self.filename = filename
+        self.listOfRuns = []
+        self.runUrl = []
+
+# The table will save the latest ten run of every workflow
+GH_WORKFLOWS_CREATE_TABLE_QUERY = f"""
+CREATE TABLE IF NOT EXISTS {GH_WORKFLOWS_TABLE_NAME} (
+    job_name text PRIMARY KEY,
+    job_yml_filename text"""
+for i in range(0,GH_WORKFLOWS_NUMBER_EXECUTIONS):
+    i = i + 1
+    GH_WORKFLOWS_CREATE_TABLE_QUERY += """,\n    run{} text,
+    run{}Id text""".format(str(i),str(i))
+GH_WORKFLOWS_CREATE_TABLE_QUERY += ")\n"
+
+def githubWorkflowsGrafanaSync(data,context):
+    print('Started')
+    print('Updating table with recent workflow runs')
+    databaseOperations(initDbConnection(),fetchWorkflowData())
+    print('Done')
+    return "Completed"
+
+def initDbConnection():
+    '''Init connection with the Database'''
+    connection = None
+    maxRetries = 3
+    i = 0 
+    while connection == None and i < maxRetries:
+        try:
+            connection = psycopg2.connect(
+                f"dbname='{DB_NAME}' user='{DB_USER_NAME}' host='{DB_HOST}'"
+                f" port='{DB_PORT}' password='{DB_PASSWORD}'")
+        except Exception as e:
+            print('Failed to connect to DB; retrying in 1 minute')
+            print(e)
+            time.sleep(60)
+            i = i + 1
+            if i >= maxRetries:
+                print("Number of retries exceded ")
+                sys.exit(1)
+    return connection
+
+def getToken():
+    git_integration = GithubIntegration(
+    os.environ["GH_APP_ID"],
+    os.environ["GH_PEM_KEY"])
+    token=git_integration.get_access_token(
+            os.environ["GH_APP_INSTALLATION_ID"]
+        ).token
+    return token
+
+def retriesRequest(request):
+    requestSucceeded = False
+    retryFactor = 1
+    while not requestSucceeded:
+        retryTime = 60 * retryFactor
+        if request.status_code != 200:
+            print('Failed to get the request with code {}'.format(request.status_code))
+            time.sleep(retryTime)
+            retryFactor = retryFactor + retryFactor
+            if retryFactor * 60 >= 3600:
+                print("Error: The request take more than an hour")
+                sys.exit(1)
+        else:
+            requestSucceeded = True
+def fetchWorkflowData():
+    '''Return a json with all the workflows and the latests
+    ten executions'''
+    completed = False
+    page = 1 
+    workflows = []
+    try:
+        while not completed:
+            url = "https://api.github.com/repos/apache/beam/actions/workflows"
+            queryOptions = { 'branch' : 'master', 'page': page, 'per_page' : GH_NUMBER_OF_WORKFLOWS }
+            response = requests.get(url = url, params = queryOptions)
+            retriesRequest(response)
+            jsonResponse = response.json()
+            if jsonResponse['total_count'] >= GH_NUMBER_OF_WORKFLOWS:
+                page = page + 1
+                workflowsPage = jsonResponse['workflows']
+                workflows.append(workflowsPage)
+            else:
+                completed = True
+                workflowsPage = jsonResponse['workflows']
+                workflows.append(workflowsPage)
+        for pageItem in workflows:
+            for item in pageItem:
+                path =item['path']
+                isPostCommit = re.search('(.*)postcommit(.*)',path)
+                if isPostCommit:
+                    result = re.search('/(.*).yml', path)
+                    path =(result.group(1)) + ".yml"
+                    workflowObject = Workflow(item['id'],item['name'],path)
+                    WORKFLOWS_OBJECT_LIST.append(workflowObject)
+            url = "https://api.github.com/repos/apache/beam/actions/workflows/"
+            queryOptions = { 'branch' : 'master', 'per_page' : GH_WORKFLOWS_NUMBER_EXECUTIONS,
+                        'page' :'1', 'exclude_pull_request':True }
+        for workflow in WORKFLOWS_OBJECT_LIST:        
+            response = requests.get(url = "{}{}/runs".format(url,workflow.id),
+                        params=queryOptions)
+            retriesRequest(response)
+            responseJson = response.json()
+            workflowsRuns = responseJson['workflow_runs']
+            for  item in workflowsRuns:
+                if item['status'] == 'completed':
+                    workflow.runUrl.append(item['html_url'])
+                    workflow.listOfRuns.append(item['conclusion'])
+                else:
+                    workflow.listOfRuns.append(item['status'])
+                    workflow.runUrl.append(item['html_url'])
+            for i in range(0,GH_WORKFLOWS_NUMBER_EXECUTIONS):   
+                if i >= len(workflow.listOfRuns):
+                    workflow.listOfRuns.append('None')
+                    workflow.runUrl.append('None')
+    except Exception as e:
+        print('Failed to get GHA workflows')
+        print(e)
+
+def databaseOperations(connection,fetchWorkflows):
+    '''Create the table if not exist and update the table with the latest runs
+    of the workflows '''
+    queryInsert = "INSERT INTO {} VALUES ".format(GH_WORKFLOWS_TABLE_NAME)
+    cursor = connection.cursor()
+    cursor.execute(GH_WORKFLOWS_CREATE_TABLE_QUERY)
+    cursor.execute("DELETE FROM {};".format(GH_WORKFLOWS_TABLE_NAME))
+    query = ""
+    for workflow in WORKFLOWS_OBJECT_LIST:
+        rowInsert = "(\'{}\',\'{}\'".format(workflow.name,workflow.filename)
+        for run, runUrl  in zip(workflow.listOfRuns,workflow.runUrl):
+            rowInsert += ",\'{}\',\'{}\'".format(run,runUrl)
+        query = query + rowInsert
+        query += "),"
+    query = query[:-1] + ";"  
+    query = queryInsert + query
+    cursor.execute(query)
+    cursor.close()
+    connection.commit()
+    connection.close()
diff --git a/CHANGES.md b/CHANGES.md
@@ -65,6 +65,9 @@
 
 * RunInference Wrapper with Sklearn Model Handler support added in Go SDK ([#24497](https://github.com/apache/beam/issues/23382)).
 * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)).
+* Adding override of allowed TLS algorithms (Java), now maintaining the disabled/legacy algorithms
+  present in 2.43.0 (up to 1.8.0_342, 11.0.16, 17.0.2 for respective Java versions). This is accompanied
+  by an explicit re-enabling of TLSv1 and TLSv1.1 for Java 8 and Java 11.
 
 ## Breaking Changes
 
@@ -81,6 +84,7 @@
 ## Bugfixes
 
 * Avoids Cassandra syntax error when user-defined query has no where clause in it (Java) ([#24829](https://github.com/apache/beam/issues/24829)).
+* Fixed JDBC connection failures (Java) during handshake due to deprecated TLSv1(.1) protocol for the JDK. ([#24623](https://github.com/apache/beam/issues/24623))
 
 ## Known Issues
 
@@ -147,7 +151,7 @@
 
 * Decreased TextSource CPU utilization by 2.3x (Java) ([#23193](https://github.com/apache/beam/issues/23193)).
 * Fixed bug when using SpannerIO with RuntimeValueProvider options (Java) ([#22146](https://github.com/apache/beam/issues/22146)).
-* Fixed issue for unicode rendering on WriteToBigQuery ([#10785](https://github.com/apache/beam/issues/10785))
+* Fixed issue for unicode rendering on WriteToBigQuery ([#22312](https://github.com/apache/beam/issues/22312))
 * Remove obsolete variants of BigQuery Read and Write, always using Beam-native variant
   ([#23564](https://github.com/apache/beam/issues/23564) and [#23559](https://github.com/apache/beam/issues/23559)).
 * Bumped google-cloud-spanner dependency version to 3.x for Python SDK ([#21198](https://github.com/apache/beam/issues/21198)).