From 7e6e13e5864a9c39fdd50d127bdd812bc73f5bc7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 14 Sep 2022 21:28:26 +0000 Subject: [PATCH 001/456] Update nbconvert requirement in /sdks/python Updates the requirements on [nbconvert](https://github.com/jupyter/nbconvert) to permit the latest version. - [Release notes](https://github.com/jupyter/nbconvert/releases) - [Commits](https://github.com/jupyter/nbconvert/compare/6.2.0...7.0.0) --- updated-dependencies: - dependency-name: nbconvert dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- sdks/python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdks/python/setup.py b/sdks/python/setup.py index a7da26fb9dd8..cb14cfea4adb 100644 --- a/sdks/python/setup.py +++ b/sdks/python/setup.py @@ -286,7 +286,7 @@ def get_portability_package_data(): 'google-cloud-pubsub>=2.1.0,<3', 'google-cloud-pubsublite>=1.2.0,<2', # GCP packages required by tests - 'google-cloud-bigquery>=1.6.0,<3', + 'google-cloud-bigquery>=1.6.0,<4', 'google-cloud-bigquery-storage>=2.6.3,<2.14', 'google-cloud-core>=0.28.1,<3', 'google-cloud-bigtable>=0.31.1,<2', @@ -315,7 +315,7 @@ def get_portability_package_data(): 'interactive_test': [ # notebok utils 'nbformat>=5.0.5,<6', - 'nbconvert>=6.2.0,<7', + 'nbconvert>=6.2.0,<8', # headless chrome based integration tests 'needle>=0.5.0,<1', 'chromedriver-binary>=100,<101', From ea6cf404b9f730278d49031580fbae30091e9377 Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Thu, 3 Nov 2022 14:51:40 -0700 Subject: [PATCH 002/456] Implement sibling protocol for Typescript. Though there are various worker thread libraries in Node.js, the lack of a shared memory model makes it difficult to share caches, negating most of the benefit of trying to run multiple threads in the same process. In addition, having multiple independent workers connect to the control service is simpler (and possibly more efficient) than adding a proxying service within the javascript process. --- sdks/typescript/boot.go | 11 ++++++++++- .../src/apache_beam/internal/environments.ts | 8 ++++++-- sdks/typescript/src/apache_beam/runners/dataflow.ts | 4 +++- sdks/typescript/test/io_test.ts | 5 ++++- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/sdks/typescript/boot.go b/sdks/typescript/boot.go index 7bace0b1b716..60a39b770d00 100644 --- a/sdks/typescript/boot.go +++ b/sdks/typescript/boot.go @@ -169,5 +169,14 @@ func main() { args = append(args, "--status_endpoint="+info.GetStatusEndpoint().GetUrl()) } - log.Fatalf("User program exited: %v", execx.Execute("npx", args...)) + workerIds := append([]string{*workerId}, info.GetSiblingWorkerIds()...) 
+ var wg sync.WaitGroup + wg.Add(len(workerIds)) + for _, workerId := range workerIds { + go func(workerId string) { + log.Printf("Executing: python %v", strings.Join(args, " ")) + log.Fatalf("User program exited: %v", execx.ExecuteEnv(map[string]string{"WORKER_ID": workerId}, "npx", args...)) + }(workerId) + } + wg.Wait() } diff --git a/sdks/typescript/src/apache_beam/internal/environments.ts b/sdks/typescript/src/apache_beam/internal/environments.ts index 4ffc45553972..4e45800839ea 100644 --- a/sdks/typescript/src/apache_beam/internal/environments.ts +++ b/sdks/typescript/src/apache_beam/internal/environments.ts @@ -21,8 +21,12 @@ import * as runnerApi from "../proto/beam_runner_api"; export const TYPESCRIPT_DEFAULT_ENVIRONMENT_URN = "js_default"; function javascriptCapabilities(): string[] { - // XXX This is needed for sessions to work... - return ["beam:coder:interval_window:v1"]; // TODO: Cleanup. Actually populate. + // TODO: Cleanup. Actually populate. + return [ + // This is needed for sessions to work... + "beam:coder:interval_window:v1", + "beam:protocol:sibling_workers:v1", + ]; } export function defaultJsEnvironment() { diff --git a/sdks/typescript/src/apache_beam/runners/dataflow.ts b/sdks/typescript/src/apache_beam/runners/dataflow.ts index 66ff4b46229f..6c9f459a1e26 100644 --- a/sdks/typescript/src/apache_beam/runners/dataflow.ts +++ b/sdks/typescript/src/apache_beam/runners/dataflow.ts @@ -32,13 +32,15 @@ export function dataflowRunner(runnerOptions: { pipeline: Pipeline, options: Object = {} ): Promise { + var augmentedOptions = { experiments: [] as string[], ...options }; + augmentedOptions.experiments.push("use_sibling_sdk_workers"); return new PortableRunner( runnerOptions as any, PythonService.forModule( "apache_beam.runners.dataflow.dataflow_job_service", ["--port", "{{PORT}}"] ) - ).runPipeline(pipeline, options); + ).runPipeline(pipeline, augmentedOptions); } })(); } diff --git a/sdks/typescript/test/io_test.ts b/sdks/typescript/test/io_test.ts index a6fd2743aaa4..d7bf8f2f96a7 100644 --- a/sdks/typescript/test/io_test.ts +++ b/sdks/typescript/test/io_test.ts @@ -60,7 +60,10 @@ before(() => { after(() => subprocessCache.stopAll()); function xlang_it(name, fn) { - return (process.env.BEAM_SERVICE_OVERRIDES ? it : it.skip)(name + ' @xlang', fn); + return (process.env.BEAM_SERVICE_OVERRIDES ? it : it.skip)( + name + " @xlang", + fn + ); } // These depends on fixes that will be released in 2.40. From 466db663a9b3f6140b85f2b5abb86161edfe5960 Mon Sep 17 00:00:00 2001 From: bulat safiullin Date: Mon, 28 Nov 2022 13:27:27 +0600 Subject: [PATCH 003/456] [Website] update copy-to-clipboard.js #24372 --- website/www/site/assets/js/copy-to-clipboard.js | 2 ++ website/www/site/assets/scss/_global.sass | 3 +++ 2 files changed, 5 insertions(+) diff --git a/website/www/site/assets/js/copy-to-clipboard.js b/website/www/site/assets/js/copy-to-clipboard.js index 8f58a7550094..baaf12dfb4dc 100644 --- a/website/www/site/assets/js/copy-to-clipboard.js +++ b/website/www/site/assets/js/copy-to-clipboard.js @@ -33,6 +33,8 @@ $(document).ready(function() { code.forEach((hl) => { if( !hl.parentElement.classList.contains('code-snippet') && !hl.parentElement.classList.contains('highlight')) { + const textNode = hl.innerHTML; + hl.innerHTML = `
<div class="pre-content-container">${textNode}</div>
` hl.prepend(copyIcon.cloneNode([true])); } }) diff --git a/website/www/site/assets/scss/_global.sass b/website/www/site/assets/scss/_global.sass index eeddf974dade..eac95aa8b39c 100644 --- a/website/www/site/assets/scss/_global.sass +++ b/website/www/site/assets/scss/_global.sass @@ -178,3 +178,6 @@ code .video--medium-size max-width: 560px + +.pre-content-container + overflow-x: auto From 5a60769700dea3c64731240166898dbff1b4e6fe Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Mon, 28 Nov 2022 17:57:27 +0100 Subject: [PATCH 004/456] [Spark RDD runner] Remove obsolete unusable AggregatorsAccumulator / NamedAggregators from the runner (closes #24379) --- .../runners/spark/SparkPipelineRunner.java | 5 - .../beam/runners/spark/SparkRunner.java | 20 +-- .../beam/runners/spark/TestSparkRunner.java | 2 - .../aggregators/AggregatorsAccumulator.java | 133 ------------------ .../spark/aggregators/NamedAggregators.java | 113 --------------- .../NamedAggregatorsAccumulator.java | 63 --------- .../spark/aggregators/package-info.java | 20 --- .../spark/metrics/AggregatorMetric.java | 74 ---------- .../spark/metrics/AggregatorMetricSource.java | 49 ------- .../spark/metrics/CompositeSource.java | 45 ------ .../runners/spark/metrics/sink/CsvSink.java | 3 +- .../spark/metrics/sink/GraphiteSink.java | 3 +- .../ResumeFromCheckpointStreamingTest.java | 2 - 13 files changed, 5 insertions(+), 527 deletions(-) delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/AggregatorsAccumulator.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregators.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregatorsAccumulator.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/package-info.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetric.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetricSource.java delete mode 100644 runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/CompositeSource.java diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkPipelineRunner.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkPipelineRunner.java index 4a7ffd13ee60..89738b8a53fc 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkPipelineRunner.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkPipelineRunner.java @@ -40,7 +40,6 @@ import org.apache.beam.runners.jobsubmission.PortablePipelineJarUtils; import org.apache.beam.runners.jobsubmission.PortablePipelineResult; import org.apache.beam.runners.jobsubmission.PortablePipelineRunner; -import org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator; import org.apache.beam.runners.spark.metrics.MetricsAccumulator; import org.apache.beam.runners.spark.translation.SparkBatchPortablePipelineTranslator; import org.apache.beam.runners.spark.translation.SparkContextFactory; @@ -114,7 +113,6 @@ public PortablePipelineResult run(RunnerApi.Pipeline pipeline, JobInfo jobInfo) final JavaSparkContext jsc = SparkContextFactory.getSparkContext(pipelineOptions); // Initialize accumulators. 
- AggregatorsAccumulator.init(pipelineOptions, jsc); MetricsEnvironment.setMetricsSupported(true); MetricsAccumulator.init(pipelineOptions, jsc); @@ -133,9 +131,6 @@ public PortablePipelineResult run(RunnerApi.Pipeline pipeline, JobInfo jobInfo) final JavaStreamingContext jssc = ((SparkStreamingTranslationContext) context).getStreamingContext(); - jssc.addStreamingListener( - new JavaStreamingListenerWrapper( - new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener())); jssc.addStreamingListener( new JavaStreamingListenerWrapper( new MetricsAccumulator.AccumulatorCheckpointingSparkListener())); diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkRunner.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkRunner.java index f3558f633829..d10209ba14ee 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkRunner.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/SparkRunner.java @@ -30,9 +30,6 @@ import org.apache.beam.runners.core.construction.TransformInputs; import org.apache.beam.runners.core.construction.graph.ProjectionPushdownOptimizer; import org.apache.beam.runners.core.metrics.MetricsPusher; -import org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator; -import org.apache.beam.runners.spark.metrics.AggregatorMetricSource; -import org.apache.beam.runners.spark.metrics.CompositeSource; import org.apache.beam.runners.spark.metrics.MetricsAccumulator; import org.apache.beam.runners.spark.metrics.SparkBeamMetricSource; import org.apache.beam.runners.spark.translation.EvaluationContext; @@ -178,9 +175,6 @@ public SparkPipelineResult run(final Pipeline pipeline) { JavaStreamingContext.getOrCreate( checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory); // Checkpoint aggregator/metrics values - jssc.addStreamingListener( - new JavaStreamingListenerWrapper( - new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener())); jssc.addStreamingListener( new JavaStreamingListenerWrapper( new MetricsAccumulator.AccumulatorCheckpointingSparkListener())); @@ -252,24 +246,16 @@ public SparkPipelineResult run(final Pipeline pipeline) { private void registerMetricsSource(String appName) { final MetricsSystem metricsSystem = SparkEnv$.MODULE$.get().metricsSystem(); - final AggregatorMetricSource aggregatorMetricSource = - new AggregatorMetricSource(null, AggregatorsAccumulator.getInstance().value()); - final SparkBeamMetricSource metricsSource = new SparkBeamMetricSource(null); - final CompositeSource compositeSource = - new CompositeSource( - appName + ".Beam", - metricsSource.metricRegistry(), - aggregatorMetricSource.metricRegistry()); + final SparkBeamMetricSource metricsSource = new SparkBeamMetricSource(appName + ".Beam"); // re-register the metrics in case of context re-use - metricsSystem.removeSource(compositeSource); - metricsSystem.registerSource(compositeSource); + metricsSystem.removeSource(metricsSource); + metricsSystem.registerSource(metricsSource); } /** Init Metrics/Aggregators accumulators. This method is idempotent. */ public static void initAccumulators(SparkPipelineOptions opts, JavaSparkContext jsc) { // Init metrics accumulators MetricsAccumulator.init(opts, jsc); - AggregatorsAccumulator.init(opts, jsc); } /** Visit the pipeline to determine the translation mode (batch/streaming). 
*/ diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java index bd9baeacfc49..8288dd49c398 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/TestSparkRunner.java @@ -25,7 +25,6 @@ import java.io.File; import java.io.IOException; import java.util.concurrent.TimeUnit; -import org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator; import org.apache.beam.runners.spark.metrics.MetricsAccumulator; import org.apache.beam.runners.spark.stateful.SparkTimerInternals; import org.apache.beam.runners.spark.util.GlobalWatermarkHolder; @@ -88,7 +87,6 @@ public SparkPipelineResult run(Pipeline pipeline) { SparkPipelineResult result = null; // clear state of Aggregators, Metrics and Watermarks if exists. - AggregatorsAccumulator.clear(); MetricsAccumulator.clear(); GlobalWatermarkHolder.clear(); diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/AggregatorsAccumulator.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/AggregatorsAccumulator.java deleted file mode 100644 index 007ca9cdc12a..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/AggregatorsAccumulator.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.aggregators; - -import java.io.IOException; -import org.apache.beam.runners.spark.SparkPipelineOptions; -import org.apache.beam.runners.spark.translation.streaming.Checkpoint; -import org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Optional; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.streaming.api.java.JavaStreamingListener; -import org.apache.spark.streaming.api.java.JavaStreamingListenerBatchCompleted; -import org.apache.spark.util.AccumulatorV2; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * For resilience, {@link AccumulatorV2 Accumulators} are required to be wrapped in a Singleton. 
- * - * @see accumulatorsV2 - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class AggregatorsAccumulator { - private static final Logger LOG = LoggerFactory.getLogger(AggregatorsAccumulator.class); - - private static final String ACCUMULATOR_NAME = "Beam.Aggregators"; - private static final String ACCUMULATOR_CHECKPOINT_FILENAME = "aggregators"; - - private static volatile NamedAggregatorsAccumulator instance = null; - private static volatile FileSystem fileSystem; - private static volatile Path checkpointFilePath; - - /** Init aggregators accumulator if it has not been initiated. This method is idempotent. */ - public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) { - if (instance == null) { - synchronized (AggregatorsAccumulator.class) { - if (instance == null) { - Optional maybeCheckpointDir = - opts.isStreaming() - ? Optional.of(new CheckpointDir(opts.getCheckpointDir())) - : Optional.absent(); - NamedAggregators namedAggregators = new NamedAggregators(); - NamedAggregatorsAccumulator accumulator = - new NamedAggregatorsAccumulator(namedAggregators); - - if (maybeCheckpointDir.isPresent()) { - Optional maybeRecoveredValue = - recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get()); - if (maybeRecoveredValue.isPresent()) { - accumulator = new NamedAggregatorsAccumulator(maybeRecoveredValue.get()); - } - } - jsc.sc().register(accumulator, ACCUMULATOR_NAME); - instance = accumulator; - } - } - LOG.info("Instantiated aggregators accumulator: {}", instance.value()); - } - } - - public static NamedAggregatorsAccumulator getInstance() { - if (instance == null) { - throw new IllegalStateException("Aggregators accumulator has not been instantiated"); - } else { - return instance; - } - } - - private static Optional recoverValueFromCheckpoint( - JavaSparkContext jsc, CheckpointDir checkpointDir) { - try { - Path beamCheckpointPath = checkpointDir.getBeamCheckpointDir(); - checkpointFilePath = new Path(beamCheckpointPath, ACCUMULATOR_CHECKPOINT_FILENAME); - fileSystem = checkpointFilePath.getFileSystem(jsc.hadoopConfiguration()); - NamedAggregators recoveredValue = Checkpoint.readObject(fileSystem, checkpointFilePath); - if (recoveredValue != null) { - LOG.info("Recovered aggregators from checkpoint"); - return Optional.of(recoveredValue); - } else { - LOG.info("No accumulator checkpoint found."); - } - } catch (Exception e) { - throw new RuntimeException("Failure while reading accumulator checkpoint.", e); - } - return Optional.absent(); - } - - private static void checkpoint() throws IOException { - if (checkpointFilePath != null) { - Checkpoint.writeObject(fileSystem, checkpointFilePath, instance.value()); - } - } - - @VisibleForTesting - public static void clear() { - synchronized (AggregatorsAccumulator.class) { - instance = null; - } - } - - /** Spark Listener which checkpoints {@link NamedAggregators} values for fault-tolerance. 
*/ - public static class AccumulatorCheckpointingSparkListener extends JavaStreamingListener { - @Override - public void onBatchCompleted(JavaStreamingListenerBatchCompleted batchCompleted) { - try { - checkpoint(); - } catch (IOException e) { - LOG.error("Failed to checkpoint accumulator singleton.", e); - } - } - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregators.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregators.java deleted file mode 100644 index da81bafeef44..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregators.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.aggregators; - -import java.io.Serializable; -import java.util.Map; -import java.util.TreeMap; -import org.apache.beam.sdk.transforms.Combine; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps; - -/** - * This class wraps a map of named aggregators. Spark expects that all accumulators be declared - * before a job is launched. Beam allows aggregators to be used and incremented on the fly. We - * create a map of named aggregators and instantiate in the spark context before the job is - * launched. We can then add aggregators on the fly in Spark. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class NamedAggregators implements Serializable { - /** Map from aggregator name to current state. */ - private final Map> mNamedAggregators = new TreeMap<>(); - - /** Constructs a new NamedAggregators instance. */ - public NamedAggregators() {} - - /** - * @param name Name of aggregator to retrieve. - * @param typeClass Type class to cast the value to. - * @param Type to be returned. - * @return the value of the aggregator associated with the specified name, or null if - * the specified aggregator could not be found. - */ - public T getValue(String name, Class typeClass) { - final State state = mNamedAggregators.get(name); - return state != null ? typeClass.cast(state.render()) : null; - } - - /** @return a map of all the aggregator names and their rendered values */ - public Map renderAll() { - return ImmutableMap.copyOf(Maps.transformValues(mNamedAggregators, State::render)); - } - - /** - * Merges another NamedAggregators instance with this instance. - * - * @param other The other instance of named aggregators ot merge. - * @return This instance of Named aggregators with associated states updated to reflect the other - * instance's aggregators. 
- */ - public NamedAggregators merge(NamedAggregators other) { - for (Map.Entry> e : other.mNamedAggregators.entrySet()) { - String key = e.getKey(); - State otherValue = e.getValue(); - mNamedAggregators.merge(key, otherValue, NamedAggregators::merge); - } - return this; - } - - /** - * Helper method to merge States whose generic types aren't provably the same, so require some - * casting. - */ - @SuppressWarnings("unchecked") - private static State merge( - State s1, State s2) { - return ((State) s1).merge((State) s2); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Map.Entry> e : mNamedAggregators.entrySet()) { - sb.append(e.getKey()).append(": ").append(e.getValue().render()).append(" "); - } - return sb.toString(); - } - - /** - * @param Input data type - * @param Intermediate data type (useful for averages) - * @param Output data type - */ - public interface State extends Serializable { - - /** @param element new element to update state */ - void update(InputT element); - - State merge(State other); - - InterT current(); - - OutputT render(); - - Combine.CombineFn getCombineFn(); - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregatorsAccumulator.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregatorsAccumulator.java deleted file mode 100644 index 5775814ec702..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/NamedAggregatorsAccumulator.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.aggregators; - -import org.apache.spark.util.AccumulatorV2; - -/** {@link AccumulatorV2} implementation for {@link NamedAggregators}. 
*/ -public class NamedAggregatorsAccumulator extends AccumulatorV2 { - private static final NamedAggregators empty = new NamedAggregators(); - - private NamedAggregators value; - - public NamedAggregatorsAccumulator(NamedAggregators value) { - this.value = value; - } - - @Override - public boolean isZero() { - return value.equals(empty); - } - - @Override - public NamedAggregatorsAccumulator copy() { - NamedAggregators newContainer = new NamedAggregators(); - newContainer.merge(value); - return new NamedAggregatorsAccumulator(newContainer); - } - - @Override - public void reset() { - this.value = new NamedAggregators(); - } - - @Override - public void add(NamedAggregators other) { - this.value.merge(other); - } - - @Override - public void merge(AccumulatorV2 other) { - this.value.merge(other.value()); - } - - @Override - public NamedAggregators value() { - return this.value; - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/package-info.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/package-info.java deleted file mode 100644 index 14264f1ea3c7..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/aggregators/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Provides internal utilities for implementing Beam aggregators using Spark accumulators. */ -package org.apache.beam.runners.spark.aggregators; diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetric.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetric.java deleted file mode 100644 index 41db37c92af8..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetric.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.spark.metrics; - -import com.codahale.metrics.Gauge; -import com.codahale.metrics.Metric; -import com.codahale.metrics.MetricFilter; -import java.util.HashMap; -import java.util.Map; -import org.apache.beam.runners.spark.aggregators.NamedAggregators; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** An adapter between the {@link NamedAggregators} and the Dropwizard {@link Metric} interface. */ -public class AggregatorMetric extends BeamMetricSet { - - private static final Logger LOG = LoggerFactory.getLogger(AggregatorMetric.class); - - private final NamedAggregators namedAggregators; - - private AggregatorMetric(NamedAggregators namedAggregators) { - this.namedAggregators = namedAggregators; - } - - public static AggregatorMetric of(NamedAggregators namedAggregators) { - return new AggregatorMetric(namedAggregators); - } - - @Override - public Map> getValue(String prefix, MetricFilter filter) { - Map> metrics = new HashMap<>(); - for (Map.Entry entry : namedAggregators.renderAll().entrySet()) { - String name = prefix + "." + entry.getKey(); - Object rawValue = entry.getValue(); - if (rawValue != null) { - try { - Gauge gauge = staticGauge(rawValue); - if (filter.matches(name, gauge)) { - metrics.put(name, gauge); - } - } catch (NumberFormatException e) { - LOG.warn( - "Metric `{}` of type {} can't be reported, conversion to double failed.", - name, - rawValue.getClass().getSimpleName(), - e); - } - } - } - return metrics; - } - - // Metric type is assumed to be compatible with Double - protected Gauge staticGauge(Object rawValue) throws NumberFormatException { - return rawValue instanceof Number - ? super.staticGauge((Number) rawValue) - : super.staticGauge(Double.parseDouble(rawValue.toString())); - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetricSource.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetricSource.java deleted file mode 100644 index a52b5e663382..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/AggregatorMetricSource.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.metrics; - -import com.codahale.metrics.MetricRegistry; -import org.apache.beam.runners.spark.aggregators.NamedAggregators; -import org.apache.spark.metrics.source.Source; - -/** - * A Spark {@link Source} that is tailored to expose an {@link AggregatorMetric}, wrapping an - * underlying {@link NamedAggregators} instance. 
- */ -public class AggregatorMetricSource implements Source { - private static final String METRIC_NAME = "Aggregators"; - - private final String name; - - private final MetricRegistry metricRegistry = new MetricRegistry(); - - public AggregatorMetricSource(final String name, final NamedAggregators aggregators) { - this.name = name; - metricRegistry.register(METRIC_NAME, AggregatorMetric.of(aggregators)); - } - - @Override - public String sourceName() { - return name; - } - - @Override - public MetricRegistry metricRegistry() { - return metricRegistry; - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/CompositeSource.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/CompositeSource.java deleted file mode 100644 index c9efa456b9d1..000000000000 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/CompositeSource.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.metrics; - -import com.codahale.metrics.MetricRegistry; -import org.apache.spark.metrics.source.Source; - -/** Composite source made up of several {@link MetricRegistry} instances. */ -public class CompositeSource implements Source { - private final String name; - private final MetricRegistry metricRegistry; - - public CompositeSource(final String name, MetricRegistry... metricRegistries) { - this.name = name; - this.metricRegistry = new MetricRegistry(); - for (MetricRegistry metricRegistry : metricRegistries) { - this.metricRegistry.registerAll(metricRegistry); - } - } - - @Override - public String sourceName() { - return name; - } - - @Override - public MetricRegistry metricRegistry() { - return metricRegistry; - } -} diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/CsvSink.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/CsvSink.java index d880cd3cf9e7..994c87535b7f 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/CsvSink.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/CsvSink.java @@ -19,14 +19,13 @@ import com.codahale.metrics.MetricRegistry; import java.util.Properties; -import org.apache.beam.runners.spark.metrics.AggregatorMetric; import org.apache.beam.runners.spark.metrics.WithMetricsSupport; import org.apache.spark.SecurityManager; import org.apache.spark.metrics.sink.Sink; /** * A {@link Sink} for Spark's - * metric system that is tailored to report {@link AggregatorMetric}s to a CSV file. + * metric system reporting metrics (including Beam step metrics) to a CSV file. * *
* <p>
The sink is configured using Spark configuration parameters, for example: * diff --git a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/GraphiteSink.java b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/GraphiteSink.java index 0b21554069dd..43781b6b05a7 100644 --- a/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/GraphiteSink.java +++ b/runners/spark/src/main/java/org/apache/beam/runners/spark/metrics/sink/GraphiteSink.java @@ -19,14 +19,13 @@ import com.codahale.metrics.MetricRegistry; import java.util.Properties; -import org.apache.beam.runners.spark.metrics.AggregatorMetric; import org.apache.beam.runners.spark.metrics.WithMetricsSupport; import org.apache.spark.SecurityManager; import org.apache.spark.metrics.sink.Sink; /** * A {@link Sink} for Spark's - * metric system that is tailored to report {@link AggregatorMetric}s to Graphite. + * metric system reporting metrics (including Beam step metrics) to Graphite. * *
* <p>
The sink is configured using Spark configuration parameters, for example: * diff --git a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/ResumeFromCheckpointStreamingTest.java b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/ResumeFromCheckpointStreamingTest.java index a7618aad61b4..46c106e75727 100644 --- a/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/ResumeFromCheckpointStreamingTest.java +++ b/runners/spark/src/test/java/org/apache/beam/runners/spark/translation/streaming/ResumeFromCheckpointStreamingTest.java @@ -34,7 +34,6 @@ import org.apache.beam.runners.spark.TestSparkPipelineOptions; import org.apache.beam.runners.spark.TestSparkRunner; import org.apache.beam.runners.spark.UsesCheckpointRecovery; -import org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator; import org.apache.beam.runners.spark.io.MicrobatchSource; import org.apache.beam.runners.spark.metrics.MetricsAccumulator; import org.apache.beam.runners.spark.translation.streaming.utils.EmbeddedKafkaCluster; @@ -314,7 +313,6 @@ private SparkPipelineResult run(Optional stopWatermarkOption, int expec @After public void clean() { - AggregatorsAccumulator.clear(); MetricsAccumulator.clear(); GlobalWatermarkHolder.clear(); MicrobatchSource.clearCache(); From f0c7f86c990a2cfc321cd36adaeddb55cf11fa7e Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Mon, 28 Nov 2022 18:11:06 +0100 Subject: [PATCH 005/456] [Spark Dataset runner] Remove obsolete unusable AggregatorsAccumulator / NamedAggregators from the runner (closes #24379) --- .../SparkStructuredStreamingRunner.java | 28 ++--- .../aggregators/AggregatorsAccumulator.java | 73 ----------- .../aggregators/NamedAggregators.java | 113 ------------------ .../NamedAggregatorsAccumulator.java | 63 ---------- .../aggregators/package-info.java | 20 ---- .../metrics/AggregatorMetric.java | 74 ------------ .../metrics/AggregatorMetricSource.java | 49 -------- .../metrics/CompositeSource.java | 45 ------- .../metrics/SparkBeamMetricSource.java | 4 +- .../metrics/sink/CodahaleCsvSink.java | 3 +- .../metrics/sink/CodahaleGraphiteSink.java | 3 +- 11 files changed, 10 insertions(+), 465 deletions(-) delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/AggregatorsAccumulator.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregators.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregatorsAccumulator.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/package-info.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetric.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetricSource.java delete mode 100644 runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/CompositeSource.java diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/SparkStructuredStreamingRunner.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/SparkStructuredStreamingRunner.java index 7fc96bad755d..a271ff6375c2 100644 --- 
a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/SparkStructuredStreamingRunner.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/SparkStructuredStreamingRunner.java @@ -28,9 +28,6 @@ import org.apache.beam.runners.core.construction.SplittableParDo; import org.apache.beam.runners.core.construction.graph.ProjectionPushdownOptimizer; import org.apache.beam.runners.core.metrics.MetricsPusher; -import org.apache.beam.runners.spark.structuredstreaming.aggregators.AggregatorsAccumulator; -import org.apache.beam.runners.spark.structuredstreaming.metrics.AggregatorMetricSource; -import org.apache.beam.runners.spark.structuredstreaming.metrics.CompositeSource; import org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsAccumulator; import org.apache.beam.runners.spark.structuredstreaming.metrics.SparkBeamMetricSource; import org.apache.beam.runners.spark.structuredstreaming.translation.EvaluationContext; @@ -150,12 +147,8 @@ public SparkStructuredStreamingPipelineResult run(final Pipeline pipeline) { PipelineTranslator.detectStreamingMode(pipeline, options); checkArgument(!options.isStreaming(), "Streaming is not supported."); - // clear state of Aggregators, Metrics and Watermarks if exists. - AggregatorsAccumulator.clear(); - MetricsAccumulator.clear(); - final SparkSession sparkSession = SparkSessionFactory.getOrCreateSession(options); - initAccumulators(sparkSession.sparkContext()); + initMetrics(sparkSession.sparkContext()); final Future submissionFuture = runAsync(() -> translatePipeline(sparkSession, pipeline).evaluate()); @@ -202,24 +195,17 @@ private EvaluationContext translatePipeline(SparkSession sparkSession, Pipeline private void registerMetricsSource(String appName) { final MetricsSystem metricsSystem = SparkEnv$.MODULE$.get().metricsSystem(); - final AggregatorMetricSource aggregatorMetricSource = - new AggregatorMetricSource(null, AggregatorsAccumulator.getInstance().value()); - final SparkBeamMetricSource metricsSource = new SparkBeamMetricSource(null); - final CompositeSource compositeSource = - new CompositeSource( - appName + ".Beam", - metricsSource.metricRegistry(), - aggregatorMetricSource.metricRegistry()); + final SparkBeamMetricSource metricsSource = new SparkBeamMetricSource(appName + ".Beam"); // re-register the metrics in case of context re-use - metricsSystem.removeSource(compositeSource); - metricsSystem.registerSource(compositeSource); + metricsSystem.removeSource(metricsSource); + metricsSystem.registerSource(metricsSource); } /** Init Metrics/Aggregators accumulators. This method is idempotent. 
*/ - private static void initAccumulators(SparkContext sparkContext) { - // Init metrics accumulators + private static void initMetrics(SparkContext sparkContext) { + // Clear and init metrics accumulators + MetricsAccumulator.clear(); MetricsAccumulator.init(sparkContext); - AggregatorsAccumulator.init(sparkContext); } private static Future runAsync(Runnable task) { diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/AggregatorsAccumulator.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/AggregatorsAccumulator.java deleted file mode 100644 index 1765a24c45b0..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/AggregatorsAccumulator.java +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.aggregators; - -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting; -import org.apache.spark.SparkContext; -import org.apache.spark.util.AccumulatorV2; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * For resilience, {@link AccumulatorV2 Accumulators} are required to be wrapped in a Singleton. - * - * @see accumulatorsV2 - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class AggregatorsAccumulator { - private static final Logger LOG = LoggerFactory.getLogger(AggregatorsAccumulator.class); - - private static final String ACCUMULATOR_NAME = "Beam.Aggregators"; - - private static volatile NamedAggregatorsAccumulator instance = null; - - /** Init aggregators accumulator if it has not been initiated. This method is idempotent. 
*/ - public static void init(SparkContext sparkContext) { - if (instance == null) { - synchronized (AggregatorsAccumulator.class) { - if (instance == null) { - NamedAggregators namedAggregators = new NamedAggregators(); - NamedAggregatorsAccumulator accumulator = - new NamedAggregatorsAccumulator(namedAggregators); - sparkContext.register(accumulator, ACCUMULATOR_NAME); - - instance = accumulator; - } - } - LOG.info("Instantiated aggregators accumulator: {}", instance.value()); - } - } - - public static NamedAggregatorsAccumulator getInstance() { - if (instance == null) { - throw new IllegalStateException("Aggregrators accumulator has not been instantiated"); - } else { - return instance; - } - } - - @VisibleForTesting - public static void clear() { - synchronized (AggregatorsAccumulator.class) { - instance = null; - } - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregators.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregators.java deleted file mode 100644 index de53458b3149..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregators.java +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.aggregators; - -import java.io.Serializable; -import java.util.Map; -import java.util.TreeMap; -import org.apache.beam.sdk.transforms.Combine; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap; -import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps; - -/** - * This class wraps a map of named aggregators. Spark expects that all accumulators be declared - * before a job is launched. Beam allows aggregators to be used and incremented on the fly. We - * create a map of named aggregators and instantiate in the spark context before the job is - * launched. We can then add aggregators on the fly in Spark. - */ -@SuppressWarnings({ - "nullness" // TODO(https://github.com/apache/beam/issues/20497) -}) -public class NamedAggregators implements Serializable { - /** Map from aggregator name to current state. */ - private final Map> mNamedAggregators = new TreeMap<>(); - - /** Constructs a new NamedAggregators instance. */ - public NamedAggregators() {} - - /** - * @param name Name of aggregator to retrieve. - * @param typeClass Type class to cast the value to. - * @param Type to be returned. - * @return the value of the aggregator associated with the specified name, or null if - * the specified aggregator could not be found. 
- */ - public T getValue(String name, Class typeClass) { - final State state = mNamedAggregators.get(name); - return state != null ? typeClass.cast(state.render()) : null; - } - - /** @return a map of all the aggregator names and their rendered values */ - public Map renderAll() { - return ImmutableMap.copyOf(Maps.transformValues(mNamedAggregators, State::render)); - } - - /** - * Merges another NamedAggregators instance with this instance. - * - * @param other The other instance of named aggregators ot merge. - * @return This instance of Named aggregators with associated states updated to reflect the other - * instance's aggregators. - */ - public NamedAggregators merge(NamedAggregators other) { - for (Map.Entry> e : other.mNamedAggregators.entrySet()) { - String key = e.getKey(); - State otherValue = e.getValue(); - mNamedAggregators.merge(key, otherValue, NamedAggregators::merge); - } - return this; - } - - /** - * Helper method to merge States whose generic types aren't provably the same, so require some - * casting. - */ - @SuppressWarnings("unchecked") - private static State merge( - State s1, State s2) { - return ((State) s1).merge((State) s2); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Map.Entry> e : mNamedAggregators.entrySet()) { - sb.append(e.getKey()).append(": ").append(e.getValue().render()).append(" "); - } - return sb.toString(); - } - - /** - * @param Input data type - * @param Intermediate data type (useful for averages) - * @param Output data type - */ - public interface State extends Serializable { - - /** @param element new element to update state */ - void update(InputT element); - - State merge(State other); - - InterT current(); - - OutputT render(); - - Combine.CombineFn getCombineFn(); - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregatorsAccumulator.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregatorsAccumulator.java deleted file mode 100644 index 992e63f47b8a..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/NamedAggregatorsAccumulator.java +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.aggregators; - -import org.apache.spark.util.AccumulatorV2; - -/** {@link AccumulatorV2} implementation for {@link NamedAggregators}. 
*/ -public class NamedAggregatorsAccumulator extends AccumulatorV2 { - private static final NamedAggregators empty = new NamedAggregators(); - - private NamedAggregators value; - - public NamedAggregatorsAccumulator(NamedAggregators value) { - this.value = value; - } - - @Override - public boolean isZero() { - return value.equals(empty); - } - - @Override - public NamedAggregatorsAccumulator copy() { - NamedAggregators newContainer = new NamedAggregators(); - newContainer.merge(value); - return new NamedAggregatorsAccumulator(newContainer); - } - - @Override - public void reset() { - this.value = new NamedAggregators(); - } - - @Override - public void add(NamedAggregators other) { - this.value.merge(other); - } - - @Override - public void merge(AccumulatorV2 other) { - this.value.merge(other.value()); - } - - @Override - public NamedAggregators value() { - return this.value; - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/package-info.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/package-info.java deleted file mode 100644 index 11a87eed8970..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/aggregators/package-info.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Provides internal utilities for implementing Beam aggregators using Spark accumulators. */ -package org.apache.beam.runners.spark.structuredstreaming.aggregators; diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetric.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetric.java deleted file mode 100644 index 74bea7f5255e..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetric.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.metrics; - -import com.codahale.metrics.Gauge; -import com.codahale.metrics.Metric; -import com.codahale.metrics.MetricFilter; -import java.util.HashMap; -import java.util.Map; -import org.apache.beam.runners.spark.structuredstreaming.aggregators.NamedAggregators; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** An adapter between the {@link NamedAggregators} and the Dropwizard {@link Metric} interface. */ -public class AggregatorMetric extends BeamMetricSet { - - private static final Logger LOG = LoggerFactory.getLogger(AggregatorMetric.class); - - private final NamedAggregators namedAggregators; - - private AggregatorMetric(NamedAggregators namedAggregators) { - this.namedAggregators = namedAggregators; - } - - public static AggregatorMetric of(NamedAggregators namedAggregators) { - return new AggregatorMetric(namedAggregators); - } - - @Override - public Map> getValue(String prefix, MetricFilter filter) { - Map> metrics = new HashMap<>(); - for (Map.Entry entry : namedAggregators.renderAll().entrySet()) { - String name = prefix + "." + entry.getKey(); - Object rawValue = entry.getValue(); - if (rawValue != null) { - try { - Gauge gauge = staticGauge(rawValue); - if (filter.matches(name, gauge)) { - metrics.put(name, gauge); - } - } catch (NumberFormatException e) { - LOG.warn( - "Metric `{}` of type {} can't be reported, conversion to double failed.", - name, - rawValue.getClass().getSimpleName(), - e); - } - } - } - return metrics; - } - - // Metric type is assumed to be compatible with Double - protected Gauge staticGauge(Object rawValue) throws NumberFormatException { - return rawValue instanceof Number - ? super.staticGauge((Number) rawValue) - : super.staticGauge(Double.parseDouble(rawValue.toString())); - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetricSource.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetricSource.java deleted file mode 100644 index 406dba365cc6..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/AggregatorMetricSource.java +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.beam.runners.spark.structuredstreaming.metrics; - -import com.codahale.metrics.MetricRegistry; -import org.apache.beam.runners.spark.structuredstreaming.aggregators.NamedAggregators; -import org.apache.spark.metrics.source.Source; - -/** - * A Spark {@link Source} that is tailored to expose an {@link AggregatorMetric}, wrapping an - * underlying {@link NamedAggregators} instance. - */ -public class AggregatorMetricSource implements Source { - private static final String METRIC_NAME = "Aggregators"; - - private final String name; - - private final MetricRegistry metricRegistry = new MetricRegistry(); - - public AggregatorMetricSource(final String name, final NamedAggregators aggregators) { - this.name = name; - metricRegistry.register(METRIC_NAME, AggregatorMetric.of(aggregators)); - } - - @Override - public String sourceName() { - return name; - } - - @Override - public MetricRegistry metricRegistry() { - return metricRegistry; - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/CompositeSource.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/CompositeSource.java deleted file mode 100644 index 8de06f76064f..000000000000 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/CompositeSource.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.beam.runners.spark.structuredstreaming.metrics; - -import com.codahale.metrics.MetricRegistry; -import org.apache.spark.metrics.source.Source; - -/** Composite source made up of several {@link MetricRegistry} instances. */ -public class CompositeSource implements Source { - private final String name; - private final MetricRegistry metricRegistry; - - public CompositeSource(final String name, MetricRegistry... 
metricRegistries) { - this.name = name; - this.metricRegistry = new MetricRegistry(); - for (MetricRegistry metricRegistry : metricRegistries) { - this.metricRegistry.registerAll(metricRegistry); - } - } - - @Override - public String sourceName() { - return name; - } - - @Override - public MetricRegistry metricRegistry() { - return metricRegistry; - } -} diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/SparkBeamMetricSource.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/SparkBeamMetricSource.java index 665dbc479c4d..ed938ac84138 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/SparkBeamMetricSource.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/SparkBeamMetricSource.java @@ -25,15 +25,13 @@ * underlying {@link org.apache.beam.sdk.metrics.MetricResults} instance. */ public class SparkBeamMetricSource implements Source { - private static final String METRIC_NAME = "Metrics"; - private final String name; private final MetricRegistry metricRegistry = new MetricRegistry(); public SparkBeamMetricSource(final String name) { this.name = name; - metricRegistry.register(METRIC_NAME, new SparkBeamMetric()); + metricRegistry.register(name, new SparkBeamMetric()); } @Override diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleCsvSink.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleCsvSink.java index c8f9139a2ebf..dd23d5040464 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleCsvSink.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleCsvSink.java @@ -19,14 +19,13 @@ import com.codahale.metrics.MetricRegistry; import java.util.Properties; -import org.apache.beam.runners.spark.structuredstreaming.metrics.AggregatorMetric; import org.apache.beam.runners.spark.structuredstreaming.metrics.WithMetricsSupport; import org.apache.spark.SecurityManager; import org.apache.spark.metrics.sink.Sink; /** * A {@link Sink} for Spark's - * metric system that is tailored to report {@link AggregatorMetric}s to a CSV file. + * metric system reporting metrics (including Beam step metrics) to a CSV file. * *
<p>
The sink is configured using Spark configuration parameters, for example: * diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleGraphiteSink.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleGraphiteSink.java index 5640c965740b..fe709ad81ab7 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleGraphiteSink.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/metrics/sink/CodahaleGraphiteSink.java @@ -19,14 +19,13 @@ import com.codahale.metrics.MetricRegistry; import java.util.Properties; -import org.apache.beam.runners.spark.structuredstreaming.metrics.AggregatorMetric; import org.apache.beam.runners.spark.structuredstreaming.metrics.WithMetricsSupport; import org.apache.spark.SecurityManager; import org.apache.spark.metrics.sink.Sink; /** * A {@link Sink} for Spark's - * metric system that is tailored to report {@link AggregatorMetric}s to Graphite. + * metric system reporting metrics (including Beam step metrics) to Graphite. * *
<p>
The sink is configured using Spark configuration parameters, for example: * From 58b340cd0f53cc4c76ce2e79e8b6be6a165c466e Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Mon, 28 Nov 2022 13:13:18 -0800 Subject: [PATCH 006/456] Fix dependencies of archetype tasks --- sdks/java/maven-archetypes/examples/build.gradle | 1 + sdks/java/maven-archetypes/gcp-bom-examples/build.gradle | 1 + 2 files changed, 2 insertions(+) diff --git a/sdks/java/maven-archetypes/examples/build.gradle b/sdks/java/maven-archetypes/examples/build.gradle index 6a034029f10e..55e944c269dc 100644 --- a/sdks/java/maven-archetypes/examples/build.gradle +++ b/sdks/java/maven-archetypes/examples/build.gradle @@ -24,6 +24,7 @@ ext.summary = """A Maven Archetype to create a project containing all the example pipelines from the Apache Beam Java SDK.""" processResources { + dependsOn 'generateSources' filter org.apache.tools.ant.filters.ReplaceTokens, tokens: [ 'project.version': version, 'bigquery.version': dependencies.create(project.library.java.google_api_services_bigquery).getVersion(), diff --git a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle index af06bfc41d8e..5b5153e11d4a 100644 --- a/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle +++ b/sdks/java/maven-archetypes/gcp-bom-examples/build.gradle @@ -25,6 +25,7 @@ ext.summary = """A Maven Archetype to create a project using the Beam Google Cloud Platform BOM""" processResources { + dependsOn 'generateSources' filter org.apache.tools.ant.filters.ReplaceTokens, tokens: [ 'project.version': version, 'bigquery.version': dependencies.create(project.library.java.google_api_services_bigquery).getVersion(), From 596e19f938aa42b25b9689197dea03e0d6a47a49 Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Mon, 28 Nov 2022 12:23:57 -0800 Subject: [PATCH 007/456] Upgrade checker framework to 3.13.0 This version of checker framework shortened many warning suppression keys. For atomicity, those key changes are included in this commit. 
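For illustration, the renaming is a mechanical swap of the long suppression
keys for their shortened forms. A minimal hypothetical sketch (class and
method names are illustrative, not taken from this commit):

    // Requires checker-qual on the classpath; the nullness checker reports an
    // "argument" error here because a @Nullable value is passed where a
    // non-null String parameter is expected.
    import org.checkerframework.checker.nullness.qual.Nullable;

    class SuppressionKeyExample {
      // Previously: @SuppressWarnings("argument.type.incompatible")
      @SuppressWarnings("argument") // shortened key accepted by checker framework 3.13.0
      static int length(@Nullable String value) {
        return nonNullLength(value);
      }

      static int nonNullLength(String value) {
        return value.length();
      }
    }

Either spelling refers to the same diagnostic; only the key string changes,
which is why the edits below are plain renames of the @SuppressWarnings values.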
--- .../org/apache/beam/gradle/BeamModulePlugin.groovy | 2 +- .../complete/datatokenization/DataTokenization.java | 2 +- .../datatokenization/transforms/DataProtectors.java | 4 ++-- .../datatokenization/transforms/JsonToBeamRow.java | 2 +- .../complete/datatokenization/utils/CsvConverters.java | 2 +- .../datatokenization/utils/ErrorConverters.java | 4 ++-- .../complete/datatokenization/utils/SchemasUtils.java | 6 +++--- .../java/org/apache/beam/sdk/io/LocalResourceId.java | 7 ++++--- .../apache/beam/sdk/metrics/MetricsEnvironment.java | 2 +- .../sdk/util/UnboundedScheduledExecutorService.java | 10 +++++----- .../service/JavaClassLookupTransformProvider.java | 6 +++++- .../beam/sdk/extensions/python/PythonService.java | 2 +- .../beam/sdk/extensions/sql/jdbc/BeamSqlLine.java | 2 +- .../beam/sdk/extensions/sql/jdbc/BeamSqlLineIT.java | 2 +- .../sdk/extensions/sql/impl/rel/CalcRelSplitter.java | 2 +- .../meta/provider/kafka/NestedPayloadKafkaTable.java | 2 +- .../pubsublite/PubsubLiteTableProviderTest.java | 4 ++-- .../zetasketch/ApproximateCountDistinct.java | 4 ++-- .../beam/sdk/fn/stream/BufferingStreamObserver.java | 4 ++-- .../beam/sdk/fn/stream/PrefetchableIterables.java | 2 +- .../beam/fn/harness/control/ExecutionStateSampler.java | 3 +-- .../beam/sdk/io/aws/dynamodb/AwsClientsProvider.java | 2 +- .../apache/beam/sdk/io/aws/sns/AwsClientsProvider.java | 2 +- .../beam/sdk/io/gcp/firestore/FirestoreV1ReadFn.java | 2 +- .../beam/sdk/io/gcp/pubsub/NestedRowToMessage.java | 2 +- .../internal/LimitingTopicBacklogReader.java | 4 ++-- .../internal/MemoryBufferedSubscriberImpl.java | 2 +- .../pubsublite/internal/TopicBacklogReaderImpl.java | 2 +- .../changestreams/encoder/TimestampEncoding.java | 2 +- .../internal/OffsetByteRangeTrackerTest.java | 4 ++-- .../internal/PerSubscriptionPartitionSdfTest.java | 4 ++-- 31 files changed, 52 insertions(+), 48 deletions(-) diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index abb32698ddbd..2e4bf4639f2b 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -461,7 +461,7 @@ class BeamModulePlugin implements Plugin { def aws_java_sdk2_version = "2.17.127" def cassandra_driver_version = "3.10.2" def cdap_version = "6.5.1" - def checkerframework_version = "3.12.0" + def checkerframework_version = "3.13.0" def classgraph_version = "4.8.104" def dbcp2_version = "2.8.0" def errorprone_version = "2.10.0" diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/DataTokenization.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/DataTokenization.java index 1bc0069945fa..e4d2d6342e60 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/DataTokenization.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/DataTokenization.java @@ -215,7 +215,7 @@ public static void main(String[] args) { * @param options The execution options. * @return The pipeline result. 
*/ - @SuppressWarnings({"dereference.of.nullable", "argument.type.incompatible"}) + @SuppressWarnings({"dereference.of.nullable", "argument"}) public static PipelineResult run(DataTokenizationOptions options) { SchemasUtils schema = null; try { diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/DataProtectors.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/DataProtectors.java index 700b94b93dfa..cc75b4f65630 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/DataProtectors.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/DataProtectors.java @@ -198,7 +198,7 @@ public void close() { } @ProcessElement - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public void process(@Element KV> element, ProcessContext context) { Iterable rows = element.getValue(); @@ -246,7 +246,7 @@ private String formatJsonsToRpcBatch(Iterable jsons) { return stringBuilder.toString(); } - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") private ArrayList getTokenizedRow(Iterable inputRows) throws IOException { ArrayList outputRows = new ArrayList<>(); diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/JsonToBeamRow.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/JsonToBeamRow.java index a6c87a368cf9..6ae023e692cd 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/JsonToBeamRow.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/transforms/JsonToBeamRow.java @@ -42,7 +42,7 @@ public JsonToBeamRow(String failedToParseDeadLetterPath, SchemasUtils schema) { } @Override - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public PCollection expand(PCollection jsons) { ParseResult rows = jsons.apply( diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/CsvConverters.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/CsvConverters.java index 090abcb80eb5..b570d1b0c6ba 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/CsvConverters.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/CsvConverters.java @@ -59,7 +59,7 @@ import org.slf4j.LoggerFactory; /** Common transforms for Csv files. */ -@SuppressWarnings({"argument.type.incompatible"}) +@SuppressWarnings({"argument"}) public class CsvConverters { /* Logger for class. 
*/ diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/ErrorConverters.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/ErrorConverters.java index 91695fb315a0..e91ab6c97216 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/ErrorConverters.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/ErrorConverters.java @@ -64,7 +64,7 @@ public static Builder newBuilder() { public abstract @Nullable Duration windowDuration(); - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") @Override public PDone expand(PCollection> pCollection) { @@ -266,7 +266,7 @@ public static WriteErrorsToTextIO.Builder newBuilder() { public abstract @Nullable Duration windowDuration(); @Override - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public PDone expand(PCollection> pCollection) { PCollection formattedErrorRows = diff --git a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/SchemasUtils.java b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/SchemasUtils.java index e8dd24eabeee..4543aa623b83 100644 --- a/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/SchemasUtils.java +++ b/examples/java/src/main/java/org/apache/beam/examples/complete/datatokenization/utils/SchemasUtils.java @@ -48,10 +48,10 @@ */ @SuppressWarnings({ "initialization.fields.uninitialized", - "method.invocation.invalid", + "method.invocation", "dereference.of.nullable", - "argument.type.incompatible", - "return.type.incompatible" + "argument", + "return" }) public class SchemasUtils { diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/LocalResourceId.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/LocalResourceId.java index e6394f79291c..3a109267577e 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/io/LocalResourceId.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/io/LocalResourceId.java @@ -17,8 +17,9 @@ */ package org.apache.beam.sdk.io; +import static org.apache.beam.sdk.util.Preconditions.checkArgumentNotNull; +import static org.apache.beam.sdk.util.Preconditions.checkStateNotNull; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument; -import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull; import static org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState; import java.io.File; @@ -40,7 +41,7 @@ class LocalResourceId implements ResourceId { private transient volatile @Nullable Path cachedPath; static LocalResourceId fromPath(Path path, boolean isDirectory) { - checkNotNull(path, "path"); + checkArgumentNotNull(path, "path"); return new LocalResourceId(path, isDirectory); } @@ -78,7 +79,7 @@ public LocalResourceId getCurrentDirectory() { if (parent == null && path.getNameCount() == 1) { parent = Paths.get("."); } - checkState(parent != null, "Failed to get the current directory for path: [%s].", pathString); + checkStateNotNull(parent, "Failed to get the current directory for path: [%s].", pathString); return fromPath(parent, true /* isDirectory */); } } diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java 
b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java index bf6da9bdb32d..e3a9621856d8 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/metrics/MetricsEnvironment.java @@ -53,7 +53,7 @@ public class MetricsEnvironment { private static final AtomicBoolean METRICS_SUPPORTED = new AtomicBoolean(false); private static final AtomicBoolean REPORTED_MISSING_CONTAINER = new AtomicBoolean(false); - @SuppressWarnings("type.argument.type.incompatible") // object guaranteed to be non-null + @SuppressWarnings("type.argument") // object guaranteed to be non-null private static final ThreadLocal<@NonNull MetricsContainerHolder> CONTAINER_FOR_THREAD = ThreadLocal.withInitial(MetricsContainerHolder::new); diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java index 57a1a829f10c..33cba79a3646 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java @@ -98,7 +98,7 @@ final class ScheduledFutureTask<@Nullable @KeyForBottom V> extends FutureTask } /** Creates a periodic action with given nanoTime-based initial trigger time and period. */ - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") ScheduledFutureTask(Runnable r, @Nullable V result, long triggerTime, long period) { super(r, result); this.time = triggerTime; @@ -247,7 +247,7 @@ public boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedE @Override /* UnboundedScheduledExecutorService is the only caller after it has been initialized.*/ - @SuppressWarnings("method.invocation.invalid") + @SuppressWarnings("method.invocation") public void execute(Runnable command) { // These are already guaranteed to be a ScheduledFutureTask so there is no need to wrap // it in another ScheduledFutureTask. @@ -366,7 +366,7 @@ public void execute(Runnable command) { @Override /* Ignore improper flag since FB detects that ScheduledExecutorService can't have nullable V. */ - @SuppressWarnings("override.return.invalid") + @SuppressWarnings("override.return") public <@Nullable @KeyForBottom T> Future submit(Runnable command, T result) { if (command == null) { throw new NullPointerException(); @@ -378,7 +378,7 @@ public void execute(Runnable command) { @Override /* Ignore improper flag since FB detects that ScheduledExecutorService can't have nullable V. */ - @SuppressWarnings({"override.param.invalid", "override.return.invalid"}) + @SuppressWarnings({"override.param", "override.return"}) public <@Nullable @KeyForBottom T> Future submit(Callable command) { if (command == null) { throw new NullPointerException(); @@ -430,7 +430,7 @@ public ScheduledFuture schedule(Runnable command, long delay, TimeUnit unit) @Override /* Ignore improper flag since FB detects that ScheduledExecutorService can't have nullable V. 
*/ - @SuppressWarnings({"override.param.invalid", "override.return.invalid"}) + @SuppressWarnings({"override.param", "override.return"}) public <@Nullable @KeyForBottom V> ScheduledFuture schedule( Callable callable, long delay, TimeUnit unit) { if (callable == null || unit == null) { diff --git a/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/JavaClassLookupTransformProvider.java b/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/JavaClassLookupTransformProvider.java index 032378481e84..09bfd73b2413 100644 --- a/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/JavaClassLookupTransformProvider.java +++ b/sdks/java/expansion-service/src/main/java/org/apache/beam/sdk/expansion/service/JavaClassLookupTransformProvider.java @@ -70,7 +70,6 @@ * @param input {@link PInput} type of the transform * @param output {@link POutput} type of the transform */ -@SuppressWarnings({"argument.type.incompatible", "assignment.type.incompatible"}) @SuppressFBWarnings("UWF_UNWRITTEN_PUBLIC_OR_PROTECTED_FIELD") class JavaClassLookupTransformProvider implements TransformProvider { @@ -89,6 +88,7 @@ public JavaClassLookupTransformProvider(AllowList allowList) { this.allowList = allowList; } + @SuppressWarnings("argument") @Override public PTransform getTransform(FunctionSpec spec) { JavaClassLookupPayload payload; @@ -137,6 +137,7 @@ public PTransform getTransform(FunctionSpec spec) { } } + @SuppressWarnings("assignment") private PTransform applyBuilderMethods( PTransform transform, JavaClassLookupPayload payload, @@ -328,6 +329,7 @@ private boolean parametersCompatible( return true; } + @SuppressWarnings("argument") private @Nullable Object getDecodedValueFromRow( Class type, Object valueFromRow, @Nullable Type genericType) { if (isPrimitiveOrWrapperOrString(type)) { @@ -366,6 +368,7 @@ private boolean parametersCompatible( throw new RuntimeException("Could not decode the value from Row " + valueFromRow); } + @SuppressWarnings("argument") private Object[] getParameterValues( java.lang.reflect.Parameter[] parameters, Row constrtuctorRow, Type[] genericTypes) { ArrayList parameterValues = new ArrayList<>(); @@ -380,6 +383,7 @@ private Object[] getParameterValues( return parameterValues.toArray(); } + @SuppressWarnings("argument") private Object[] getDecodedArrayValueFromRow(Class arrayComponentType, Object valueFromRow) { List originalValues = (List) valueFromRow; List decodedValues = new ArrayList<>(); diff --git a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java index 43c445a200e2..0d7a918aade3 100644 --- a/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java +++ b/sdks/java/extensions/python/src/main/java/org/apache/beam/sdk/extensions/python/PythonService.java @@ -72,7 +72,7 @@ public PythonService withExtraPackages(List extraPackages) { ImmutableList.builder().addAll(this.extraPackages).addAll(extraPackages).build()); } - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public AutoCloseable start() throws IOException, InterruptedException { File bootstrapScript = File.createTempFile("bootstrap_beam_venv", ".py"); bootstrapScript.deleteOnExit(); diff --git a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java 
b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java index bf4ef1d4bf0f..2be276578e2d 100644 --- a/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java +++ b/sdks/java/extensions/sql/jdbc/src/main/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLine.java @@ -57,7 +57,7 @@ private static String[] checkConnectionArgs(String[] args) { } /** Nullable InputStream is being handled inside sqlLine.begin. */ - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") static Status runSqlLine( String[] args, @Nullable InputStream inputStream, diff --git a/sdks/java/extensions/sql/jdbc/src/test/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLineIT.java b/sdks/java/extensions/sql/jdbc/src/test/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLineIT.java index 1f5bb46c71a6..365d65f9fe20 100644 --- a/sdks/java/extensions/sql/jdbc/src/test/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLineIT.java +++ b/sdks/java/extensions/sql/jdbc/src/test/java/org/apache/beam/sdk/extensions/sql/jdbc/BeamSqlLineIT.java @@ -201,7 +201,7 @@ private String taxiRideJSON( } /** Suppressing this due to https://github.com/typetools/checker-framework/issues/979. */ - @SuppressWarnings("return.type.incompatible") + @SuppressWarnings("return") private Future>> runQueryInBackground(String[] args) { return pool.submit( (Callable) diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/CalcRelSplitter.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/CalcRelSplitter.java index eb1e86d0bf12..e254ba60568a 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/CalcRelSplitter.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/rel/CalcRelSplitter.java @@ -879,7 +879,7 @@ private static class HighestUsageFinder extends RexVisitorImpl { continue; } currentLevel = exprLevels[i]; - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") final Void unused = exprs[i].accept(this); } } diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/NestedPayloadKafkaTable.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/NestedPayloadKafkaTable.java index ce00c8f855e0..f6563cb94584 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/NestedPayloadKafkaTable.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/meta/provider/kafka/NestedPayloadKafkaTable.java @@ -140,7 +140,7 @@ public PCollection> expand(PCollection input } // Suppress nullability warnings: ProducerRecord is supposed to accept null arguments. 
- @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") @VisibleForTesting ProducerRecord transformOutput(Row row) { row = castRow(row, row.getSchema(), schema); diff --git a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsublite/PubsubLiteTableProviderTest.java b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsublite/PubsubLiteTableProviderTest.java index e3c172f6c0ea..a07eb1fe1a71 100644 --- a/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsublite/PubsubLiteTableProviderTest.java +++ b/sdks/java/extensions/sql/src/test/java/org/apache/beam/sdk/extensions/sql/meta/provider/pubsublite/PubsubLiteTableProviderTest.java @@ -197,7 +197,7 @@ public void validTopicTables() { } @Test - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public void topicTableCannotRead() { BeamSqlTable basic = makeTable(FULL_WRITE_SCHEMA, example(TopicPath.class).toString(), ImmutableMap.of()); @@ -230,7 +230,7 @@ public void validSubscriptionTables() { } @Test - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public void subscriptionTableCannotWrite() { BeamSqlTable basic = makeTable(FULL_READ_SCHEMA, example(SubscriptionPath.class).toString(), ImmutableMap.of()); diff --git a/sdks/java/extensions/zetasketch/src/main/java/org/apache/beam/sdk/extensions/zetasketch/ApproximateCountDistinct.java b/sdks/java/extensions/zetasketch/src/main/java/org/apache/beam/sdk/extensions/zetasketch/ApproximateCountDistinct.java index a8866f98633b..76a1a93554d0 100644 --- a/sdks/java/extensions/zetasketch/src/main/java/org/apache/beam/sdk/extensions/zetasketch/ApproximateCountDistinct.java +++ b/sdks/java/extensions/zetasketch/src/main/java/org/apache/beam/sdk/extensions/zetasketch/ApproximateCountDistinct.java @@ -150,7 +150,7 @@ public PCollection expand(PCollection input) { return input.apply(builder.globally()).apply(HllCount.Extract.globally()); } - // Boiler plate to avoid [argument.type.incompatible] NonNull vs Nullable + // Boiler plate to avoid [argument] NonNull vs Nullable Contextful> mapping = getMapping(); if (mapping != null) { @@ -221,7 +221,7 @@ public PCollection> expand(PCollection> input) { return input.apply(builder.perKey()).apply(HllCount.Extract.perKey()); } - // Boiler plate to avoid [argument.type.incompatible] NonNull vs Nullable + // Boiler plate to avoid [argument] NonNull vs Nullable Contextful, KV>> mapping = getMapping(); if (mapping != null) { diff --git a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/BufferingStreamObserver.java b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/BufferingStreamObserver.java index d94e859e1c23..703784548829 100644 --- a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/BufferingStreamObserver.java +++ b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/BufferingStreamObserver.java @@ -48,7 +48,7 @@ public OnErrorException(@NonNull Throwable throwable) { } @Override - @SuppressWarnings("return.type.incompatible") + @SuppressWarnings("return") public synchronized @NonNull Throwable getCause() { return super.getCause(); } @@ -61,7 +61,7 @@ public OnErrorException(@NonNull Throwable throwable) { private final Future queueDrainer; private final int bufferSize; - @SuppressWarnings("methodref.receiver.bound.invalid") + @SuppressWarnings("methodref.receiver.bound") public 
BufferingStreamObserver( Phaser phaser, CallStreamObserver outboundObserver, diff --git a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/PrefetchableIterables.java b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/PrefetchableIterables.java index d8696f0dff17..2e3f13c2f0bc 100644 --- a/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/PrefetchableIterables.java +++ b/sdks/java/fn-execution/src/main/java/org/apache/beam/sdk/fn/stream/PrefetchableIterables.java @@ -123,7 +123,7 @@ public static PrefetchableIterable concat(Iterable... iterables) { return maybePrefetchable(iterables[0]); } return new Default() { - @SuppressWarnings("methodref.receiver.invalid") + @SuppressWarnings("methodref.receiver") @Override public PrefetchableIterator createIterator() { return PrefetchableIterators.concatIterators( diff --git a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java index cb00dfa8ef9e..f528a4a9919d 100644 --- a/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java +++ b/sdks/java/harness/src/main/java/org/apache/beam/fn/harness/control/ExecutionStateSampler.java @@ -73,8 +73,7 @@ public class ExecutionStateSampler { private final Future stateSamplingThread; - @SuppressWarnings( - "methodref.receiver.bound.invalid" /* Synchronization ensures proper initialization */) + @SuppressWarnings("methodref.receiver.bound" /* Synchronization ensures proper initialization */) public ExecutionStateSampler(PipelineOptions options, MillisProvider clock) { String samplingPeriodMills = ExperimentalOptions.getExperimentValue( diff --git a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/dynamodb/AwsClientsProvider.java b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/dynamodb/AwsClientsProvider.java index e98e633c2ca4..f2d13b144e8d 100644 --- a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/dynamodb/AwsClientsProvider.java +++ b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/dynamodb/AwsClientsProvider.java @@ -31,7 +31,7 @@ public interface AwsClientsProvider extends Serializable { /** @deprecated DynamoDBIO doesn't require a CloudWatch client */ @Deprecated - @SuppressWarnings("return.type.incompatible") + @SuppressWarnings("return") default AmazonCloudWatch getCloudWatchClient() { return null; } diff --git a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/sns/AwsClientsProvider.java b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/sns/AwsClientsProvider.java index 6582b510b089..6a90c0285f20 100644 --- a/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/sns/AwsClientsProvider.java +++ b/sdks/java/io/amazon-web-services/src/main/java/org/apache/beam/sdk/io/aws/sns/AwsClientsProvider.java @@ -31,7 +31,7 @@ public interface AwsClientsProvider extends Serializable { /** @deprecated SnsIO doesn't require a CloudWatch client */ @Deprecated - @SuppressWarnings("return.type.incompatible") + @SuppressWarnings("return") default AmazonCloudWatch getCloudWatchClient() { return null; } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1ReadFn.java 
b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1ReadFn.java index 31e17c0fd488..886a03ebc05f 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1ReadFn.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/firestore/FirestoreV1ReadFn.java @@ -526,7 +526,7 @@ public final void processElement(ProcessContext c) throws Exception { // trying to expand all pages to a single collection. We are emitting a single page at a time // while tracking read progress so we can resume if an error has occurred and we still have // attempt budget available. - "type.argument.type.incompatible" + "type.argument" }) private abstract static class PaginatedFirestoreV1ReadFn< RequestT extends Message, diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/NestedRowToMessage.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/NestedRowToMessage.java index c5cae3416c3d..b6e310d81c22 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/NestedRowToMessage.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/NestedRowToMessage.java @@ -42,7 +42,7 @@ class NestedRowToMessage extends SimpleFunction { private final SerializableFunction> attributesExtractor; private final SerializableFunction payloadExtractor; - @SuppressWarnings("methodref.receiver.bound.invalid") + @SuppressWarnings("methodref.receiver.bound") NestedRowToMessage(PayloadSerializer serializer, Schema schema) { this.serializer = serializer; if (schema.getField(ATTRIBUTES_FIELD).getType().equals(ATTRIBUTE_MAP_FIELD_TYPE)) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/LimitingTopicBacklogReader.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/LimitingTopicBacklogReader.java index c33d0215bda2..ec7c672a3dc9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/LimitingTopicBacklogReader.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/LimitingTopicBacklogReader.java @@ -39,7 +39,7 @@ final class LimitingTopicBacklogReader implements TopicBacklogReader { @Nullable private Offset currentRequestOffset = null; - @SuppressWarnings("method.invocation.invalid") + @SuppressWarnings("method.invocation") LimitingTopicBacklogReader(TopicBacklogReader underlying, Ticker ticker) { this.underlying = underlying; backlogCache = @@ -57,7 +57,7 @@ public ComputeMessageStatsResponse load(String val) { }); } - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") private synchronized ComputeMessageStatsResponse loadFromUnderlying() { return underlying.computeMessageStats(checkNotNull(currentRequestOffset)); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/MemoryBufferedSubscriberImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/MemoryBufferedSubscriberImpl.java index 5a7cbbbcc2f2..92d3c0126d81 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/MemoryBufferedSubscriberImpl.java +++ 
b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/MemoryBufferedSubscriberImpl.java @@ -52,7 +52,7 @@ class MemoryBufferedSubscriberImpl extends ProxyService implements MemoryBuffere // onReceive will not be called inline as subscriber is not started. // addServices is intended to be called from the constructor. - @SuppressWarnings({"methodref.receiver.bound.invalid", "method.invocation.invalid"}) + @SuppressWarnings({"methodref.receiver.bound", "method.invocation"}) public MemoryBufferedSubscriberImpl( Partition partition, Offset startOffset, diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/TopicBacklogReaderImpl.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/TopicBacklogReaderImpl.java index 80e3cbee1655..b11daa43a020 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/TopicBacklogReaderImpl.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/TopicBacklogReaderImpl.java @@ -39,7 +39,7 @@ public TopicBacklogReaderImpl(TopicStatsClient client, TopicPath topicPath, Part } @Override - @SuppressWarnings("assignment.type.incompatible") + @SuppressWarnings("assignment") public ComputeMessageStatsResponse computeMessageStats(Offset offset) throws ApiException { try { return client diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/encoder/TimestampEncoding.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/encoder/TimestampEncoding.java index 22c06e15c0e4..3ff5753fb007 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/encoder/TimestampEncoding.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/encoder/TimestampEncoding.java @@ -76,7 +76,7 @@ protected void write(Object datum, Encoder out) throws IOException { * @throws IOException if it was not possible to read the timestamp from the provided decoder */ // it is possible to return nulls here if the encoded value was null - @SuppressWarnings({"override.return.invalid", "return.type.incompatible"}) + @SuppressWarnings({"override.return", "return"}) @Override protected Timestamp read(Object reuse, Decoder in) throws IOException { final long seconds = in.readLong(); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/OffsetByteRangeTrackerTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/OffsetByteRangeTrackerTest.java index 79585ab28afb..1ba4582d3eaa 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/OffsetByteRangeTrackerTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/OffsetByteRangeTrackerTest.java @@ -79,7 +79,7 @@ public void getProgressStatsFailure() { } @Test - @SuppressWarnings({"dereference.of.nullable", "argument.type.incompatible"}) + @SuppressWarnings({"dereference.of.nullable", "argument"}) public void claimSplitSuccess() { assertTrue(tracker.tryClaim(OffsetByteProgress.of(Offset.of(1_000), MIN_BYTES))); assertTrue(tracker.tryClaim(OffsetByteProgress.of(Offset.of(10_000), MIN_BYTES))); @@ -98,7 +98,7 @@ 
public void claimSplitSuccess() { } @Test - @SuppressWarnings({"dereference.of.nullable", "argument.type.incompatible"}) + @SuppressWarnings({"dereference.of.nullable", "argument"}) public void splitWithoutClaimEmpty() { when(ticker.read()).thenReturn(100000000000000L); SplitResult splits = tracker.trySplit(IGNORED_FRACTION); diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/PerSubscriptionPartitionSdfTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/PerSubscriptionPartitionSdfTest.java index 5cfbbe016fed..a6624ac2c3aa 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/PerSubscriptionPartitionSdfTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/pubsublite/internal/PerSubscriptionPartitionSdfTest.java @@ -134,7 +134,7 @@ public void tearDownClosesBacklogReaderFactory() throws Exception { } @Test - @SuppressWarnings("argument.type.incompatible") + @SuppressWarnings("argument") public void process() throws Exception { when(processor.run()).thenReturn(ProcessContinuation.resume()); when(processorFactory.newProcessor(any(), any(), any())) @@ -171,7 +171,7 @@ public void close() {} } @Test - @SuppressWarnings("return.type.incompatible") + @SuppressWarnings("return") public void dofnIsSerializable() throws Exception { ObjectOutputStream output = new ObjectOutputStream(new ByteArrayOutputStream()); output.writeObject( From 82150176099d7cb5173e237867204ead8f53c3db Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Mon, 28 Nov 2022 12:18:39 -0800 Subject: [PATCH 008/456] Upgrade checker framework to 3.14.0 This upgrade involves some adjustments to types in the JDK --- .../groovy/org/apache/beam/gradle/BeamModulePlugin.groovy | 2 +- .../beam/sdk/util/UnboundedScheduledExecutorService.java | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 2e4bf4639f2b..8c753ccaa029 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -461,7 +461,7 @@ class BeamModulePlugin implements Plugin { def aws_java_sdk2_version = "2.17.127" def cassandra_driver_version = "3.10.2" def cdap_version = "6.5.1" - def checkerframework_version = "3.13.0" + def checkerframework_version = "3.14.0" def classgraph_version = "4.8.104" def dbcp2_version = "2.8.0" def errorprone_version = "2.10.0" diff --git a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java index 33cba79a3646..9b38604a7a8c 100644 --- a/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java +++ b/sdks/java/core/src/main/java/org/apache/beam/sdk/util/UnboundedScheduledExecutorService.java @@ -418,7 +418,7 @@ public void execute(Runnable command) { } @Override - public ScheduledFuture schedule(Runnable command, long delay, TimeUnit unit) { + public ScheduledFuture<@Nullable ?> schedule(Runnable command, long delay, TimeUnit unit) { if (command == null || unit == null) { throw new NullPointerException(); } @@ -442,7 +442,7 @@ public ScheduledFuture schedule(Runnable command, long delay, TimeUnit unit) } 
@Override - public ScheduledFuture scheduleAtFixedRate( + public ScheduledFuture<@Nullable ?> scheduleAtFixedRate( Runnable command, long initialDelay, long period, TimeUnit unit) { if (command == null || unit == null) { throw new NullPointerException(); @@ -458,7 +458,7 @@ public ScheduledFuture scheduleAtFixedRate( } @Override - public ScheduledFuture scheduleWithFixedDelay( + public ScheduledFuture<@Nullable ?> scheduleWithFixedDelay( Runnable command, long initialDelay, long delay, TimeUnit unit) { if (command == null || unit == null) { throw new NullPointerException(); From 2af405f5ff2adb12a199a178ea3e723a20fae3a9 Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Mon, 28 Nov 2022 12:18:09 -0800 Subject: [PATCH 009/456] Upgrade checker framework to 3.15.0 --- .../main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy index 8c753ccaa029..26b217f91f4c 100644 --- a/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy +++ b/buildSrc/src/main/groovy/org/apache/beam/gradle/BeamModulePlugin.groovy @@ -461,7 +461,7 @@ class BeamModulePlugin implements Plugin { def aws_java_sdk2_version = "2.17.127" def cassandra_driver_version = "3.10.2" def cdap_version = "6.5.1" - def checkerframework_version = "3.14.0" + def checkerframework_version = "3.15.0" def classgraph_version = "4.8.104" def dbcp2_version = "2.8.0" def errorprone_version = "2.10.0" From fccc46bc1806161336d29adc45809738a55c68c6 Mon Sep 17 00:00:00 2001 From: Kenneth Knowles Date: Mon, 28 Nov 2022 19:11:51 -0800 Subject: [PATCH 010/456] Inline :sdks:java:core:buildDependents so we can incrementally split --- build.gradle.kts | 133 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/build.gradle.kts b/build.gradle.kts index db571d616c6b..6fcc26f94c27 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -210,7 +210,138 @@ tasks.register("javaPreCommit") { dependsOn(":model:fn-execution:build") dependsOn(":runners:google-cloud-dataflow-java:worker:legacy-worker:build") dependsOn(":sdks:java:core:buildNeeded") - dependsOn(":sdks:java:core:buildDependents") + + // Inline :sdks:java:core:buildDependents so we can carve out pieces at a time + dependsOn(":beam-validate-runner:build") + dependsOn(":examples:java:build") + dependsOn(":examples:java:twitter:build") + dependsOn(":examples:kotlin:build") + dependsOn(":examples:multi-language:build") + dependsOn(":model:fn-execution:build") + dependsOn(":model:job-management:build") + dependsOn(":model:pipeline:build") + dependsOn(":runners:core-construction-java:build") + dependsOn(":runners:core-java:build") + dependsOn(":runners:direct-java:build") + dependsOn(":runners:extensions-java:metrics:build") + dependsOn(":runners:flink:1.12:build") + dependsOn(":runners:flink:1.12:job-server:build") + dependsOn(":runners:flink:1.13:build") + dependsOn(":runners:flink:1.13:job-server:build") + dependsOn(":runners:flink:1.14:build") + dependsOn(":runners:flink:1.14:job-server:build") + dependsOn(":runners:flink:1.15:build") + dependsOn(":runners:flink:1.15:job-server:build") + dependsOn(":runners:google-cloud-dataflow-java:build") + dependsOn(":runners:google-cloud-dataflow-java:examples-streaming:build") + dependsOn(":runners:google-cloud-dataflow-java:examples:build") + 
dependsOn(":runners:google-cloud-dataflow-java:worker:build") + dependsOn(":runners:google-cloud-dataflow-java:worker:legacy-worker:build") + dependsOn(":runners:google-cloud-dataflow-java:worker:windmill:build") + dependsOn(":runners:java-fn-execution:build") + dependsOn(":runners:java-job-service:build") + dependsOn(":runners:jet:build") + dependsOn(":runners:local-java:build") + dependsOn(":runners:portability:java:build") + dependsOn(":runners:samza:build") + dependsOn(":runners:samza:job-server:build") + dependsOn(":runners:spark:2:build") + dependsOn(":runners:spark:2:job-server:build") + dependsOn(":runners:spark:3:build") + dependsOn(":runners:spark:3:job-server:build") + dependsOn(":runners:twister2:build") + dependsOn(":sdks:java:build-tools:build") + dependsOn(":sdks:java:core:build") + dependsOn(":sdks:java:core:jmh:build") + dependsOn(":sdks:java:expansion-service:build") + dependsOn(":sdks:java:expansion-service:app:build") + dependsOn(":sdks:java:extensions:arrow:build") + dependsOn(":sdks:java:extensions:euphoria:build") + dependsOn(":sdks:java:extensions:google-cloud-platform-core:build") + dependsOn(":sdks:java:extensions:jackson:build") + dependsOn(":sdks:java:extensions:join-library:build") + dependsOn(":sdks:java:extensions:kryo:build") + dependsOn(":sdks:java:extensions:ml:build") + dependsOn(":sdks:java:extensions:protobuf:build") + dependsOn(":sdks:java:extensions:python:build") + dependsOn(":sdks:java:extensions:sbe:build") + dependsOn(":sdks:java:extensions:schemaio-expansion-service:build") + dependsOn(":sdks:java:extensions:sketching:build") + dependsOn(":sdks:java:extensions:sorter:build") + dependsOn(":sdks:java:extensions:sql:build") + dependsOn(":sdks:java:extensions:sql:datacatalog:build") + dependsOn(":sdks:java:extensions:sql:expansion-service:build") + dependsOn(":sdks:java:extensions:sql:hcatalog:build") + dependsOn(":sdks:java:extensions:sql:jdbc:build") + dependsOn(":sdks:java:extensions:sql:perf-tests:build") + dependsOn(":sdks:java:extensions:sql:shell:build") + dependsOn(":sdks:java:extensions:sql:udf-test-provider:build") + dependsOn(":sdks:java:extensions:sql:udf:build") + dependsOn(":sdks:java:extensions:sql:zetasql:build") + dependsOn(":sdks:java:extensions:timeseries:build") + dependsOn(":sdks:java:extensions:zetasketch:build") + dependsOn(":sdks:java:fn-execution:build") + dependsOn(":sdks:java:harness:build") + dependsOn(":sdks:java:harness:jmh:build") + dependsOn(":sdks:java:io:amazon-web-services2:build") + dependsOn(":sdks:java:io:amazon-web-services:build") + dependsOn(":sdks:java:io:amqp:build") + dependsOn(":sdks:java:io:azure:build") + dependsOn(":sdks:java:io:bigquery-io-perf-tests:build") + dependsOn(":sdks:java:io:cassandra:build") + dependsOn(":sdks:java:io:cdap:build") + dependsOn(":sdks:java:io:clickhouse:build") + dependsOn(":sdks:java:io:common:build") + dependsOn(":sdks:java:io:contextualtextio:build") + dependsOn(":sdks:java:io:debezium:build") + dependsOn(":sdks:java:io:debezium:expansion-service:build") + dependsOn(":sdks:java:io:elasticsearch-tests:elasticsearch-tests-5:build") + dependsOn(":sdks:java:io:elasticsearch-tests:elasticsearch-tests-6:build") + dependsOn(":sdks:java:io:elasticsearch-tests:elasticsearch-tests-7:build") + dependsOn(":sdks:java:io:elasticsearch-tests:elasticsearch-tests-8:build") + dependsOn(":sdks:java:io:elasticsearch-tests:elasticsearch-tests-common:build") + dependsOn(":sdks:java:io:elasticsearch:build") + dependsOn(":sdks:java:io:expansion-service:build") + 
dependsOn(":sdks:java:io:file-based-io-tests:build") + dependsOn(":sdks:java:io:google-cloud-platform:build") + dependsOn(":sdks:java:io:google-cloud-platform:expansion-service:build") + dependsOn(":sdks:java:io:hadoop-common:build") + dependsOn(":sdks:java:io:hadoop-file-system:build") + dependsOn(":sdks:java:io:hadoop-format:build") + dependsOn(":sdks:java:io:hbase:build") + dependsOn(":sdks:java:io:hcatalog:build") + dependsOn(":sdks:java:io:influxdb:build") + dependsOn(":sdks:java:io:jdbc:build") + dependsOn(":sdks:java:io:jms:build") + dependsOn(":sdks:java:io:kafka:build") + dependsOn(":sdks:java:io:kinesis:build") + dependsOn(":sdks:java:io:kinesis:expansion-service:build") + dependsOn(":sdks:java:io:kudu:build") + dependsOn(":sdks:java:io:mongodb:build") + dependsOn(":sdks:java:io:mqtt:build") + dependsOn(":sdks:java:io:neo4j:build") + dependsOn(":sdks:java:io:parquet:build") + dependsOn(":sdks:java:io:pulsar:build") + dependsOn(":sdks:java:io:rabbitmq:build") + dependsOn(":sdks:java:io:redis:build") + dependsOn(":sdks:java:io:singlestore:build") + dependsOn(":sdks:java:io:snowflake:build") + dependsOn(":sdks:java:io:snowflake:expansion-service:build") + dependsOn(":sdks:java:io:solr:build") + dependsOn(":sdks:java:io:sparkreceiver:2:build") + dependsOn(":sdks:java:io:splunk:build") + dependsOn(":sdks:java:io:synthetic:build") + dependsOn(":sdks:java:io:thrift:build") + dependsOn(":sdks:java:io:tika:build") + dependsOn(":sdks:java:io:xml:build") + dependsOn(":sdks:java:testing:expansion-service:build") + dependsOn(":sdks:java:testing:jpms-tests:build") + dependsOn(":sdks:java:testing:load-tests:build") + dependsOn(":sdks:java:testing:nexmark:build") + dependsOn(":sdks:java:testing:test-utils:build") + dependsOn(":sdks:java:testing:tpcds:build") + dependsOn(":sdks:java:testing:watermarks:build") + dependsOn(":examples:java:preCommit") dependsOn(":examples:java:twitter:preCommit") dependsOn(":sdks:java:extensions:sql:jdbc:preCommit") From 60719d1b4d750d62009a1a0fa6989440b7a78dce Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Tue, 29 Nov 2022 13:53:15 +0100 Subject: [PATCH 011/456] [Spark Dataset runner] Fix support for Java 11 (closes #24392) --- .../translation/PipelineTranslator.java | 11 +++++++++-- .../translation/helpers/EncoderHelpers.java | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java index 4f3a5cd76a64..a122597b85fb 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java @@ -27,6 +27,7 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; import javax.annotation.Nullable; import org.apache.beam.runners.core.construction.PTransformTranslation; import org.apache.beam.runners.core.construction.SerializablePipelineOptions; @@ -169,7 +170,7 @@ default void putDataset(PCollection pCollection, Dataset */ private class TranslatingVisitor extends PTransformVisitor implements TranslationState { private final Map, TranslationResult> translationResults; - private final Map, ExpressionEncoder> encoders; + private final Map, Encoder> encoders; private final SparkSession 
sparkSession; private final SerializablePipelineOptions serializableOptions; private final StorageLevel storageLevel; @@ -209,7 +210,13 @@ void visit( @Override public Encoder encoderOf(Coder coder, Factory factory) { - return (Encoder) encoders.computeIfAbsent(coder, (Factory) factory); + // computeIfAbsent fails with Java 11 on recursive factory + Encoder enc = (Encoder) encoders.get(coder); + if(enc == null){ + enc = factory.apply(coder); + encoders.put(coder, enc); + } + return enc; } private TranslationResult getResult(PCollection pCollection) { diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/helpers/EncoderHelpers.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/helpers/EncoderHelpers.java index f89f6bdb9d30..5212b511721b 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/helpers/EncoderHelpers.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/helpers/EncoderHelpers.java @@ -35,6 +35,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.concurrent.ConcurrentHashMap; import java.util.function.Function; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; @@ -107,7 +108,7 @@ public class EncoderHelpers { Double.class); // Default encoders by class - private static final Map, Encoder> DEFAULT_ENCODERS = new HashMap<>(); + private static final Map, Encoder> DEFAULT_ENCODERS = new ConcurrentHashMap<>(); // Factory for default encoders by class private static final Function, @Nullable Encoder> ENCODER_FACTORY = From 36b8994688373916459ea0e36da189ab29aecc92 Mon Sep 17 00:00:00 2001 From: Moritz Mack Date: Tue, 29 Nov 2022 14:52:59 +0100 Subject: [PATCH 012/456] fix spotless --- .../structuredstreaming/translation/PipelineTranslator.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java index a122597b85fb..400502bea9ef 100644 --- a/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java +++ b/runners/spark/3/src/main/java/org/apache/beam/runners/spark/structuredstreaming/translation/PipelineTranslator.java @@ -27,7 +27,6 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; import javax.annotation.Nullable; import org.apache.beam.runners.core.construction.PTransformTranslation; import org.apache.beam.runners.core.construction.SerializablePipelineOptions; @@ -51,7 +50,6 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder; import org.apache.spark.storage.StorageLevel; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -212,7 +210,7 @@ void visit( public Encoder encoderOf(Coder coder, Factory factory) { // computeIfAbsent fails with Java 11 on recursive factory Encoder enc = (Encoder) encoders.get(coder); - if(enc == null){ + if (enc == null) { enc = factory.apply(coder); encoders.put(coder, enc); } From 4b959e73a71fafa70bac175243e78c309786a4f8 Mon Sep 17 00:00:00 2001 From: Vitaly 
Terentyev Date: Tue, 29 Nov 2022 19:08:13 +0400 Subject: [PATCH 013/456] Fix SparkReceiverIOIT test (#24375) --- sdks/java/io/sparkreceiver/2/build.gradle | 1 + .../beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java | 12 +----------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/sdks/java/io/sparkreceiver/2/build.gradle b/sdks/java/io/sparkreceiver/2/build.gradle index 52c6a6340499..7607127adcaf 100644 --- a/sdks/java/io/sparkreceiver/2/build.gradle +++ b/sdks/java/io/sparkreceiver/2/build.gradle @@ -37,6 +37,7 @@ configurations.all { exclude group: 'org.slf4j', module: 'slf4j-jdk14' exclude group: 'org.slf4j', module: 'slf4j-log4j12' exclude group: 'org.slf4j', module: 'slf4j-reload4j' + exclude group: "org.slf4j", module: "jul-to-slf4j" } dependencies { diff --git a/sdks/java/io/sparkreceiver/2/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java b/sdks/java/io/sparkreceiver/2/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java index b335aab2ed53..4f839fc7c5d3 100644 --- a/sdks/java/io/sparkreceiver/2/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java +++ b/sdks/java/io/sparkreceiver/2/src/test/java/org/apache/beam/sdk/io/sparkreceiver/SparkReceiverIOIT.java @@ -49,12 +49,9 @@ import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; -import org.apache.beam.sdk.options.ExperimentalOptions; -import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.StreamingOptions; import org.apache.beam.sdk.options.Validation; import org.apache.beam.sdk.testing.TestPipeline; -import org.apache.beam.sdk.testing.TestPipelineOptions; import org.apache.beam.sdk.testutils.NamedTestResult; import org.apache.beam.sdk.testutils.metrics.IOITMetrics; import org.apache.beam.sdk.testutils.metrics.MetricsReader; @@ -113,14 +110,7 @@ public class SparkReceiverIOIT { private static InfluxDBSettings settings; - private static final ExperimentalOptions sdfPipelineOptions; - - static { - sdfPipelineOptions = PipelineOptionsFactory.create().as(ExperimentalOptions.class); - sdfPipelineOptions.as(TestPipelineOptions.class).setBlockOnRun(false); - } - - @Rule public TestPipeline readPipeline = TestPipeline.fromOptions(sdfPipelineOptions); + @Rule public TestPipeline readPipeline = TestPipeline.create(); @BeforeClass public static void setup() throws IOException { From f4a18c53f2682c92040f1da357ac104307749896 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 20:40:37 +0530 Subject: [PATCH 014/456] Bump cloud.google.com/go/bigquery from 1.42.0 to 1.43.0 in /sdks (#24311) Bumps [cloud.google.com/go/bigquery](https://github.com/googleapis/google-cloud-go) from 1.42.0 to 1.43.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/bigquery/v1.42.0...bigquery/v1.43.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/bigquery dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 5e5cc6ee7fbb..dfc1657b1fd2 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -23,7 +23,7 @@ module github.com/apache/beam/sdks/v2 go 1.18 require ( - cloud.google.com/go/bigquery v1.42.0 + cloud.google.com/go/bigquery v1.43.0 cloud.google.com/go/datastore v1.9.0 cloud.google.com/go/profiler v0.3.0 cloud.google.com/go/pubsub v1.26.0 diff --git a/sdks/go.sum b/sdks/go.sum index 97455598f945..89b5f18967e0 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -36,8 +36,8 @@ cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvf cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= -cloud.google.com/go/bigquery v1.42.0 h1:JuTk8po4bCKRwObdT0zLb1K0BGkGHJdtgs2GK3j2Gws= -cloud.google.com/go/bigquery v1.42.0/go.mod h1:8dRTJxhtG+vwBKzE5OseQn/hiydoQN3EedCaOdYmxRA= +cloud.google.com/go/bigquery v1.43.0 h1:u0fvz5ysJBe1jwUPI4LuPwAX+o+6fCUwf3ECeg6eDUQ= +cloud.google.com/go/bigquery v1.43.0/go.mod h1:ZMQcXHsl+xmU1z36G2jNGZmKp9zNY5BUua5wDgmNCfw= cloud.google.com/go/bigtable v1.18.0 h1:OzxQqEBRNcUt0u3V9HobUS95hr1GVVPNHtPGrCeXBfU= cloud.google.com/go/bigtable v1.18.0/go.mod h1:TwTdxeNeIwj2lOmtvqISXlRWuIovWkjSZsd03sCLz2U= cloud.google.com/go/compute v0.1.0/go.mod h1:GAesmwr110a34z04OlxYkATPBEfVhkymfTBXtfbBFow= From b9088ba62a25d7d828987ccdb8a7b5388f44dd71 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 20:41:22 +0530 Subject: [PATCH 015/456] Bump github.com/aws/aws-sdk-go-v2/feature/s3/manager in /sdks (#24348) Bumps [github.com/aws/aws-sdk-go-v2/feature/s3/manager](https://github.com/aws/aws-sdk-go-v2) from 1.11.41 to 1.11.42. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/CHANGELOG.md) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/feature/s3/manager/v1.11.41...feature/s3/manager/v1.11.42) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/feature/s3/manager dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 6 +++--- sdks/go.sum | 14 ++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index dfc1657b1fd2..13e6c9460822 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -29,10 +29,10 @@ require ( cloud.google.com/go/pubsub v1.26.0 cloud.google.com/go/storage v1.28.0 github.com/aws/aws-sdk-go-v2 v1.17.1 - github.com/aws/aws-sdk-go-v2/config v1.18.2 + github.com/aws/aws-sdk-go-v2/config v1.18.3 github.com/aws/aws-sdk-go-v2/credentials v1.13.3 - github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.41 - github.com/aws/aws-sdk-go-v2/service/s3 v1.29.3 + github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.42 + github.com/aws/aws-sdk-go-v2/service/s3 v1.29.4 github.com/aws/smithy-go v1.13.4 github.com/docker/go-connections v0.4.0 github.com/dustin/go-humanize v1.0.0 diff --git a/sdks/go.sum b/sdks/go.sum index 89b5f18967e0..035e17c7ad06 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -151,18 +151,17 @@ github.com/aws/aws-sdk-go-v2 v1.17.1/go.mod h1:JLnGeGONAyi2lWXI1p0PCIOIy333JMVK1 github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.9 h1:RKci2D7tMwpvGpDNZnGQw9wk6v7o/xSwFcUAuNPoB8k= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.4.9/go.mod h1:vCmV1q1VK8eoQJ5+aYE7PkK1K6v41qJ5pJdK3ggCDvg= github.com/aws/aws-sdk-go-v2/config v1.5.0/go.mod h1:RWlPOAW3E3tbtNAqTwvSW54Of/yP3oiZXMI0xfUdjyA= -github.com/aws/aws-sdk-go-v2/config v1.18.2 h1:tRhTb3xMZsB0gW0sXWpqs9FeIP8iQp5SvnvwiPXzHwo= -github.com/aws/aws-sdk-go-v2/config v1.18.2/go.mod h1:9XVoZTdD8ICjrgI5ddb8j918q6lEZkFYpb7uohgvU6c= +github.com/aws/aws-sdk-go-v2/config v1.18.3 h1:3kfBKcX3votFX84dm00U8RGA1sCCh3eRMOGzg5dCWfU= +github.com/aws/aws-sdk-go-v2/config v1.18.3/go.mod h1:BYdrbeCse3ZnOD5+2/VE/nATOK8fEUpBtmPMdKSyhMU= github.com/aws/aws-sdk-go-v2/credentials v1.3.1/go.mod h1:r0n73xwsIVagq8RsxmZbGSRQFj9As3je72C2WzUIToc= -github.com/aws/aws-sdk-go-v2/credentials v1.13.2/go.mod h1:eAT5aj/WJ2UDIA0IVNFc2byQLeD89SDEi4cjzH/MKoQ= github.com/aws/aws-sdk-go-v2/credentials v1.13.3 h1:ur+FHdp4NbVIv/49bUjBW+FE7e57HOo03ELodttmagk= github.com/aws/aws-sdk-go-v2/credentials v1.13.3/go.mod h1:/rOMmqYBcFfNbRPU0iN9IgGqD5+V2yp3iWNmIlz0wI4= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.3.0/go.mod h1:2LAuqPx1I6jNfaGDucWfA2zqQCYCOMCDHiCOciALyNw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.19 h1:E3PXZSI3F2bzyj6XxUXdTIfvp425HHhwKsFvmzBwHgs= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.19/go.mod h1:VihW95zQpeKQWVPGkwT+2+WJNQV8UXFfMTWdU6VErL8= github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.3.2/go.mod h1:qaqQiHSrOUVOfKe6fhgQ6UzhxjwqVW8aHNegd6Ws4w4= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.41 h1:ssgdsNm11dvFtO7F/AeiW4dAO3eGsDeg5fwpag/JP/I= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.41/go.mod h1:CS+AbDFAaPU9TQOo7U6mVV23YvqCOElnqmh0XQjgJ1g= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.42 h1:bxgBYvvBh+W1RnNYP4ROXEB8N+HSSucDszfE7Rb+kfU= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.11.42/go.mod h1:LHOsygMiW/14CkFxdXxvzKyMh3jbk/QfZVaDtCbLkl8= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.25 h1:nBO/RFxeq/IS5G9Of+ZrgucRciie2qpLy++3UGZ+q2E= github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.25/go.mod h1:Zb29PYkf42vVYQY6pvSyJCJcFHlPIiY+YKdPtwnvMkY= 
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.19 h1:oRHDrwCTVT8ZXi4sr9Ld+EXk7N/KGssOr2ygNeojEhw= @@ -184,15 +183,14 @@ github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.5.1/go.mod h1:6EQZIwNN github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.13.19 h1:piDBAaWkaxkkVV3xJJbTehXCZRXYs49kvpi/LG6LR2o= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.13.19/go.mod h1:BmQWRVkLTmyNzYPFAZgon53qKLWBNSvonugD1MrSWUs= github.com/aws/aws-sdk-go-v2/service/s3 v1.11.1/go.mod h1:XLAGFrEjbvMCLvAtWLLP32yTv8GpBquCApZEycDLunI= -github.com/aws/aws-sdk-go-v2/service/s3 v1.29.3 h1:F6wgg8aHGNyhaAy2ONnWBThiPdLa386qNA0j33FIuSM= -github.com/aws/aws-sdk-go-v2/service/s3 v1.29.3/go.mod h1:/NHbqPRiwxSPVOB2Xr+StDEH+GWV/64WwnUjv4KYzV0= +github.com/aws/aws-sdk-go-v2/service/s3 v1.29.4 h1:QgmmWifaYZZcpaw3y1+ccRlgH6jAvLm4K/MBGUc7cNM= +github.com/aws/aws-sdk-go-v2/service/s3 v1.29.4/go.mod h1:/NHbqPRiwxSPVOB2Xr+StDEH+GWV/64WwnUjv4KYzV0= github.com/aws/aws-sdk-go-v2/service/sso v1.3.1/go.mod h1:J3A3RGUvuCZjvSuZEcOpHDnzZP/sKbhDWV2T1EOzFIM= github.com/aws/aws-sdk-go-v2/service/sso v1.11.25 h1:GFZitO48N/7EsFDt8fMa5iYdmWqkUDDB3Eje6z3kbG0= github.com/aws/aws-sdk-go-v2/service/sso v1.11.25/go.mod h1:IARHuzTXmj1C0KS35vboR0FeJ89OkEy1M9mWbK2ifCI= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.8 h1:jcw6kKZrtNfBPJkaHrscDOZoe5gvi9wjudnxvozYFJo= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.8/go.mod h1:er2JHN+kBY6FcMfcBBKNGCT3CarImmdFzishsqBmSRI= github.com/aws/aws-sdk-go-v2/service/sts v1.6.0/go.mod h1:q7o0j7d7HrJk/vr9uUt3BVRASvcU7gYZB9PUgPiByXg= -github.com/aws/aws-sdk-go-v2/service/sts v1.17.4/go.mod h1:bXcN3koeVYiJcdDU89n3kCYILob7Y34AeLopUbZgLT4= github.com/aws/aws-sdk-go-v2/service/sts v1.17.5 h1:60SJ4lhvn///8ygCzYy2l53bFW/Q15bVfyjyAWo6zuw= github.com/aws/aws-sdk-go-v2/service/sts v1.17.5/go.mod h1:bXcN3koeVYiJcdDU89n3kCYILob7Y34AeLopUbZgLT4= github.com/aws/smithy-go v1.6.0/go.mod h1:SObp3lf9smib00L/v3U2eAKG8FyQ7iLrJnQiAmR5n+E= From 4c0253d80099051fac7068e5199e85310695ae7c Mon Sep 17 00:00:00 2001 From: Andrea Nardelli Date: Tue, 29 Nov 2022 16:29:07 +0100 Subject: [PATCH 016/456] pubsub: fix typo in grpc client factory --- .../org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java index a94605a74234..2c3cc6678aae 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/pubsub/PubsubGrpcClient.java @@ -111,7 +111,7 @@ public String getKind() { } } - /** Factory for creating Pubsub clients using gRCP transport. */ + /** Factory for creating Pubsub clients using gRPC transport. */ public static final PubsubClientFactory FACTORY = new PubsubGrpcClientFactory(); /** Timeout for grpc calls (in s). 
*/ From 22dcacb163bba313de1005f21500e107a9a64dba Mon Sep 17 00:00:00 2001 From: Dmitry Repin Date: Tue, 29 Nov 2022 20:04:01 +0400 Subject: [PATCH 017/456] pg_23079 remove replacing tabs at playground (#24285) --- .../code_client/grpc_code_client.dart | 23 +++++++----------- .../example_client/grpc_example_client.dart | 13 +++------- .../src/util/replace_incorrect_symbols.dart | 24 ------------------- 3 files changed, 12 insertions(+), 48 deletions(-) delete mode 100644 playground/frontend/playground_components/lib/src/util/replace_incorrect_symbols.dart diff --git a/playground/frontend/playground_components/lib/src/repositories/code_client/grpc_code_client.dart b/playground/frontend/playground_components/lib/src/repositories/code_client/grpc_code_client.dart index 729ce4cd19f9..62f0df698bb1 100644 --- a/playground/frontend/playground_components/lib/src/repositories/code_client/grpc_code_client.dart +++ b/playground/frontend/playground_components/lib/src/repositories/code_client/grpc_code_client.dart @@ -22,7 +22,6 @@ import '../../api/iis_workaround_channel.dart'; import '../../api/v1/api.pbgrpc.dart' as grpc; import '../../models/sdk.dart'; import '../../util/pipeline_options.dart'; -import '../../util/replace_incorrect_symbols.dart'; import '../models/check_status_response.dart'; import '../models/output_response.dart'; import '../models/run_code_error.dart'; @@ -101,7 +100,7 @@ class GrpcCodeClient implements CodeClient { ), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } @override @@ -115,10 +114,10 @@ class GrpcCodeClient implements CodeClient { ), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } catch (ex) { print(ex); - return _toOutputResponse(''); + return OutputResponse(output: ''); } } @@ -131,10 +130,10 @@ class GrpcCodeClient implements CodeClient { grpc.GetLogsRequest(pipelineUuid: pipelineUuid), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } catch (ex) { print(ex); - return _toOutputResponse(''); + return OutputResponse(output: ''); } } @@ -146,7 +145,7 @@ class GrpcCodeClient implements CodeClient { grpc.GetRunErrorRequest(pipelineUuid: pipelineUuid), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } @override @@ -157,7 +156,7 @@ class GrpcCodeClient implements CodeClient { grpc.GetValidationOutputRequest(pipelineUuid: pipelineUuid), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } @override @@ -168,7 +167,7 @@ class GrpcCodeClient implements CodeClient { grpc.GetPreparationOutputRequest(pipelineUuid: pipelineUuid), ); - return _toOutputResponse(response.output); + return OutputResponse(output: response.output); } @override @@ -183,7 +182,7 @@ class GrpcCodeClient implements CodeClient { return OutputResponse(output: response.graph); } catch (ex) { print(ex); - return _toOutputResponse(''); + return OutputResponse(output: ''); } } @@ -249,8 +248,4 @@ class GrpcCodeClient implements CodeClient { } return RunCodeStatus.unspecified; } - - OutputResponse _toOutputResponse(String response) { - return OutputResponse(output: replaceIncorrectSymbols(response)); - } } diff --git a/playground/frontend/playground_components/lib/src/repositories/example_client/grpc_example_client.dart b/playground/frontend/playground_components/lib/src/repositories/example_client/grpc_example_client.dart index db286d31593c..f53c14db0ab6 100644 --- 
a/playground/frontend/playground_components/lib/src/repositories/example_client/grpc_example_client.dart +++ b/playground/frontend/playground_components/lib/src/repositories/example_client/grpc_example_client.dart @@ -23,7 +23,6 @@ import '../../api/v1/api.pbgrpc.dart' as grpc; import '../../models/category_with_examples.dart'; import '../../models/example_base.dart'; import '../../models/sdk.dart'; -import '../../util/replace_incorrect_symbols.dart'; import '../complexity_grpc_extension.dart'; import '../models/get_default_precompiled_object_request.dart'; import '../models/get_precompiled_object_code_response.dart'; @@ -119,9 +118,7 @@ class GrpcExampleClient implements ExampleClient { ), ); - return GetPrecompiledObjectCodeResponse( - code: replaceIncorrectSymbols(response.code), - ); + return GetPrecompiledObjectCodeResponse(code: response.code); } @override @@ -135,9 +132,7 @@ class GrpcExampleClient implements ExampleClient { ), ); - return OutputResponse( - output: replaceIncorrectSymbols(response.output), - ); + return OutputResponse(output: response.output); } catch (ex) { print(ex); return OutputResponse( @@ -157,9 +152,7 @@ class GrpcExampleClient implements ExampleClient { ), ); - return OutputResponse( - output: replaceIncorrectSymbols(response.output), - ); + return OutputResponse(output: response.output); } catch (ex) { print(ex); return OutputResponse( diff --git a/playground/frontend/playground_components/lib/src/util/replace_incorrect_symbols.dart b/playground/frontend/playground_components/lib/src/util/replace_incorrect_symbols.dart deleted file mode 100644 index 69a0bb684009..000000000000 --- a/playground/frontend/playground_components/lib/src/util/replace_incorrect_symbols.dart +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// TODO(alexeyinkin): Move to the editor, https://github.com/apache/beam/issues/23079 -/// sometimes code contains incorrect symbols (like tab which doesn't look properly), -/// replace it with correct ones -String replaceIncorrectSymbols(String output) { - return output.replaceAll('\t', ' '); -} From 166b881cb4c2f12983382fcd93c1a33191a0fc23 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 29 Nov 2022 10:06:55 -0800 Subject: [PATCH 018/456] [#24339] Make Slices use iterable coder instead of custom coder. (#24346) * Improve debugging strings. * Switch slices to use iterable coder. * Handle coderrefs * tests for new translate code * fulltype testing * Add Statebacked iterable case. * Add unlifted combines. * Comment catches. 
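As a rough illustration of what this switch means for user pipelines (not part of the patch; the package, function names, and import path below are illustrative), a DoFn that emits []string elements now has its output PCollection of []string encoded with the standard iterable coder wrapping the string coder, instead of a Go-specific custom slice coder:

package example

import (
	"strings"

	"github.com/apache/beam/sdks/v2/go/pkg/beam"
)

func init() { beam.RegisterFunction(splitWords) }

// splitWords emits one []string per input line; with this change the
// emitted slices are encoded as iterable<string>.
func splitWords(line string, emit func([]string)) {
	emit(strings.Fields(line))
}

// wordsPerLine returns a PCollection whose element type is []string.
func wordsPerLine(s beam.Scope, lines beam.PCollection) beam.PCollection {
	return beam.ParDo(s, splitWords, lines)
}

Because the wire format of slice-typed elements changes, pipelines that carry slices as PCollection elements or in state cannot be updated in place across this change (see the CHANGES.md entry later in this series).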
Co-authored-by: lostluck <13907733+lostluck@users.noreply.github.com> --- sdks/go/pkg/beam/coder.go | 21 ++++- .../pkg/beam/core/runtime/exec/datasource.go | 2 +- sdks/go/pkg/beam/core/runtime/exec/pardo.go | 2 +- .../pkg/beam/core/runtime/exec/translate.go | 57 +++++++++++ .../beam/core/runtime/exec/translate_test.go | 94 +++++++++++++++++++ sdks/go/pkg/beam/core/runtime/graphx/coder.go | 30 +++--- .../beam/core/runtime/graphx/coder_test.go | 83 +++++++++------- .../pkg/beam/core/runtime/graphx/dataflow.go | 44 ++++++--- sdks/go/pkg/beam/core/typex/fulltype.go | 35 +++---- sdks/go/pkg/beam/core/typex/fulltype_test.go | 22 +++++ 10 files changed, 309 insertions(+), 81 deletions(-) diff --git a/sdks/go/pkg/beam/coder.go b/sdks/go/pkg/beam/coder.go index 938ee40fb4fb..3a0552e53eba 100644 --- a/sdks/go/pkg/beam/coder.go +++ b/sdks/go/pkg/beam/coder.go @@ -152,7 +152,23 @@ func NewCoder(t FullType) Coder { func inferCoder(t FullType) (*coder.Coder, error) { switch t.Class() { - case typex.Concrete, typex.Container: + case typex.Container: + switch t.Type() { + case reflectx.ByteSlice: + return &coder.Coder{Kind: coder.Bytes, T: t}, nil + } + switch t.Type().Kind() { + case reflect.Slice: + c, err := inferCoder(t.Components()[0]) + if err != nil { + return nil, err + } + return &coder.Coder{Kind: coder.Iterable, T: t, Components: []*coder.Coder{c}}, nil + + default: + panic(fmt.Sprintf("inferCoder: unknown container kind %v", t)) + } + case typex.Concrete: switch t.Type() { case reflectx.Int64: // use the beam varint coder. @@ -183,9 +199,6 @@ func inferCoder(t FullType) (*coder.Coder, error) { case reflectx.String: return &coder.Coder{Kind: coder.String, T: t}, nil - case reflectx.ByteSlice: - return &coder.Coder{Kind: coder.Bytes, T: t}, nil - case reflectx.Bool: return &coder.Coder{Kind: coder.Bool, T: t}, nil diff --git a/sdks/go/pkg/beam/core/runtime/exec/datasource.go b/sdks/go/pkg/beam/core/runtime/exec/datasource.go index 3e5084e857d5..9fa8df7500a7 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/datasource.go +++ b/sdks/go/pkg/beam/core/runtime/exec/datasource.go @@ -324,7 +324,7 @@ func (n *DataSource) Down(ctx context.Context) error { } func (n *DataSource) String() string { - return fmt.Sprintf("DataSource[%v, %v] Coder:%v Out:%v", n.SID, n.Name, n.Coder, n.Out.ID()) + return fmt.Sprintf("DataSource[%v, %v] Out:%v Coder:%v ", n.SID, n.Name, n.Out.ID(), n.Coder) } // incrementIndexAndCheckSplit increments DataSource.index by one and checks if diff --git a/sdks/go/pkg/beam/core/runtime/exec/pardo.go b/sdks/go/pkg/beam/core/runtime/exec/pardo.go index 489878defa02..42de145db808 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/pardo.go +++ b/sdks/go/pkg/beam/core/runtime/exec/pardo.go @@ -403,5 +403,5 @@ func (n *ParDo) fail(err error) error { } func (n *ParDo) String() string { - return fmt.Sprintf("ParDo[%v] Out:%v", path.Base(n.Fn.Name()), IDs(n.Out...)) + return fmt.Sprintf("ParDo[%v] Out:%v Sig: %v", path.Base(n.Fn.Name()), IDs(n.Out...), n.Fn.ProcessElementFn().Fn.Type()) } diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate.go b/sdks/go/pkg/beam/core/runtime/exec/translate.go index 478f37791d3b..c98e48bf54d9 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/translate.go +++ b/sdks/go/pkg/beam/core/runtime/exec/translate.go @@ -21,6 +21,7 @@ import ( "strconv" "strings" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/funcx" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" 
"github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" @@ -95,11 +96,67 @@ func UnmarshalPlan(desc *fnpb.ProcessBundleDescriptor) (*Plan, error) { b.units = b.units[:len(b.units)-1] } + mayFixDataSourceCoder(u) b.units = append(b.units, u) } return b.build() } +// mayFixDataSourceCoder checks the node downstream of the DataSource and if applicable, changes +// a KV> coder to a CoGBK. This requires knowledge of the downstream node because +// coder interpretation is ambiguous to received types in DoFns, and we can only interpret it right +// at execution time with knowledge of both. +func mayFixDataSourceCoder(u *DataSource) { + if !coder.IsKV(coder.SkipW(u.Coder)) { + return // If it's not a KV, there's nothing to do here. + } + if coder.SkipW(u.Coder).Components[1].Kind != coder.Iterable { + return // If the V is not an iterable, we don't care. + } + out := u.Out + if mp, ok := out.(*Multiplex); ok { + // Here we trust that the Multiplex Outs are all the same signature, since we've validated + // that at construction time. + out = mp.Out[0] + } + + switch n := out.(type) { + // These nodes always expect CoGBK behavior. + case *Expand, *MergeAccumulators, *ReshuffleOutput, *Combine: + u.Coder = convertToCoGBK(u.Coder) + return + case *ParDo: + // So we now know we have a KV>. So we need to validate whether the DoFn has an + // iter function in the value slot. If it does, we need to use a CoGBK coder. + sig := n.Fn.ProcessElementFn() + // Get all valid inputs and side inputs. + in := sig.Params(funcx.FnValue | funcx.FnIter | funcx.FnReIter) + + if len(in) < 2 { + return // Somehow there's only a single value, so we're done. (Defense against generic KVs) + } + // It's an iterator, so we can assume it's a GBK, due to previous pre-conditions. + if sig.Param[in[1]].Kind == funcx.FnIter { + u.Coder = convertToCoGBK(u.Coder) + return + } + } +} + +func convertToCoGBK(oc *coder.Coder) *coder.Coder { + ocnw := coder.SkipW(oc) + // Validate that all values from the coder are iterables. 
+ comps := make([]*coder.Coder, 0, len(ocnw.Components)) + comps = append(comps, ocnw.Components[0]) + for _, c := range ocnw.Components[1:] { + if c.Kind != coder.Iterable { + panic(fmt.Sprintf("want all values to be iterables: %v", oc)) + } + comps = append(comps, c.Components[0]) + } + return coder.NewW(coder.NewCoGBK(comps), oc.Window) +} + type builder struct { desc *fnpb.ProcessBundleDescriptor coders *graphx.CoderUnmarshaller diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate_test.go b/sdks/go/pkg/beam/core/runtime/exec/translate_test.go index 3867b9425161..17d33caf8b1a 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/translate_test.go +++ b/sdks/go/pkg/beam/core/runtime/exec/translate_test.go @@ -21,6 +21,7 @@ import ( "testing" "time" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx" @@ -90,6 +91,99 @@ func TestUnmarshalReshuffleCoders(t *testing.T) { } } +func TestMayFixDataSourceCoder(t *testing.T) { + knownStart := coder.NewW( + coder.NewKV([]*coder.Coder{coder.NewBytes(), coder.NewI(coder.NewString())}), + coder.NewGlobalWindow()) + knownWant := coder.NewW( + coder.NewCoGBK([]*coder.Coder{coder.NewBytes(), coder.NewString()}), + coder.NewGlobalWindow()) + + makeParDo := func(t *testing.T, fn any) *ParDo { + t.Helper() + dfn, err := graph.NewDoFn(fn) + if err != nil { + t.Fatalf("couldn't construct ParDo with Sig: %T %v", fn, err) + } + return &ParDo{Fn: dfn} + } + + tests := []struct { + name string + start, want *coder.Coder + out Node + }{ + { + name: "bytes", + start: coder.NewBytes(), + }, { + name: "W", + start: coder.NewW(coder.NewBytes(), coder.NewGlobalWindow()), + }, { + name: "W", + start: coder.NewW( + coder.NewKV([]*coder.Coder{coder.NewBytes(), coder.NewBool()}), + coder.NewGlobalWindow()), + }, { + name: "W>_nil", + start: knownStart, + }, { + name: "W>_Expand", + out: &Expand{}, + start: knownStart, + want: knownWant, + }, { + name: "W>_Combine", + out: &Combine{}, + start: knownStart, + want: knownWant, + }, { + name: "W>_ReshuffleOutput", + out: &ReshuffleOutput{}, + start: knownStart, + want: knownWant, + }, { + name: "W>_MergeAccumulators", + out: &MergeAccumulators{}, + start: knownStart, + want: knownWant, + }, { + name: "W>_Multiplex_Expand", + out: &Multiplex{Out: []Node{&Expand{}}}, + start: knownStart, + want: knownWant, + }, { + name: "W>_Multiplex_ParDo_KV", + out: &Multiplex{Out: []Node{makeParDo(t, func([]byte, []string) {})}}, + start: knownStart, + }, { + name: "W>_Multiplex_ParDo_GBK", + out: &Multiplex{Out: []Node{makeParDo(t, func([]byte, func(*string) bool) {})}}, + start: knownStart, + want: knownWant, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // If want is nil, we expect no changes. 
+ if test.want == nil { + test.want = test.start + } + + u := &DataSource{ + Coder: test.start, + Out: test.out, + } + mayFixDataSourceCoder(u) + if !test.want.Equals(u.Coder) { + t.Errorf("mayFixDataSourceCoder(Datasource[Coder: %v, Out: %T]), got %v, want %v", test.start, test.out, u.Coder, test.want) + } + + }) + } +} + func TestUnmarshallWindowFn(t *testing.T) { tests := []struct { name string diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder.go b/sdks/go/pkg/beam/core/runtime/graphx/coder.go index cecbea86ed7c..3af820e8a2bc 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder.go @@ -216,9 +216,6 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, } id := components[1] - kind := coder.KV - root := typex.KVType - elm, err := b.peek(id) if err != nil { return nil, err @@ -226,15 +223,15 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, switch elm.GetSpec().GetUrn() { case urnIterableCoder, urnStateBackedIterableCoder: - id = elm.GetComponentCoderIds()[0] - kind = coder.CoGBK - root = typex.CoGBKType + iterElmID := elm.GetComponentCoderIds()[0] // TODO(https://github.com/apache/beam/issues/18032): If CoGBK with > 1 input, handle as special GBK. We expect // it to be encoded as CoGBK>>. Remove this handling once // CoGBK has a first-class representation. - if ids, ok := b.isCoGBKList(id); ok { + // If the value is an iterable, and a special CoGBK type, then expand it to the real + // CoGBK signature, instead of the special type. + if ids, ok := b.isCoGBKList(iterElmID); ok { // CoGBK values, err := b.Coders(ids) @@ -242,9 +239,11 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, return nil, err } - t := typex.New(root, append([]typex.FullType{key.T}, coder.Types(values)...)...) - return &coder.Coder{Kind: kind, T: t, Components: append([]*coder.Coder{key}, values...)}, nil + t := typex.New(typex.CoGBKType, append([]typex.FullType{key.T}, coder.Types(values)...)...) + return &coder.Coder{Kind: coder.CoGBK, T: t, Components: append([]*coder.Coder{key}, values...)}, nil } + // It's valid to have a KV> without being a CoGBK, and validating if we need to change to + // a CoGBK is done at the DataSource, since that's when we can check against the downstream nodes. 
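// Not part of this patch: a sketch of the two downstream DoFn shapes that the
// comment above distinguishes. With an upstream element coder of
// KV<[]byte, Iterable<string>>, mayFixDataSourceCoder rewrites the DataSource
// coder to CoGBK only when the consuming ParDo asks for the grouped values as
// an iterator; a slice-valued KV is left unchanged. The function names below
// are hypothetical, mirroring the cases in translate_test.go.

// Iterator-shaped values: interpreted as a grouped (CoGBK) input.
func perKeyIter(key []byte, values func(*string) bool) {
	var v string
	for values(&v) {
		// consume one grouped value at a time
	}
}

// Slice-shaped values: an ordinary KV element whose value happens to be a
// slice, so the KV<K, Iterable<V>> coder is kept as is.
func perKeySlice(key []byte, values []string) {
	_ = values // already materialized; nothing to pull lazily
}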
} value, err := b.Coder(id) @@ -252,8 +251,8 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, return nil, err } - t := typex.New(root, key.T, value.T) - return &coder.Coder{Kind: kind, T: t, Components: []*coder.Coder{key, value}}, nil + t := typex.New(typex.KVType, key.T, value.T) + return &coder.Coder{Kind: coder.KV, T: t, Components: []*coder.Coder{key, value}}, nil case urnLengthPrefixCoder: if len(components) != 1 { @@ -338,7 +337,7 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, } return c, nil - case urnIterableCoder: + case urnIterableCoder, urnStateBackedIterableCoder: if len(components) != 1 { return nil, errors.Errorf("could not unmarshal iterable coder from %v, expected one component but got %d", c, len(components)) } @@ -553,6 +552,13 @@ func (b *CoderMarshaller) Add(c *coder.Coder) (string, error) { return b.internBuiltInCoder(urnTimerCoder, comp...), nil + case coder.Iterable: + comp, err := b.AddMulti(c.Components) + if err != nil { + return "", errors.Wrapf(err, "failed to marshal iterable coder %v", c) + } + return b.internBuiltInCoder(urnIterableCoder, comp...), nil + default: err := errors.Errorf("unexpected coder kind: %v", c.Kind) return "", errors.WithContextf(err, "failed to marshal coder %v", c) diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go index aad15df0f23f..01c70181ce89 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go @@ -49,60 +49,70 @@ func TestMarshalUnmarshalCoders(t *testing.T) { baz := custom("baz", reflectx.Int) tests := []struct { - name string - c *coder.Coder + name string + c *coder.Coder + equivalent *coder.Coder }{ { - "bytes", - coder.NewBytes(), + name: "bytes", + c: coder.NewBytes(), }, { - "bool", - coder.NewBool(), + name: "bool", + c: coder.NewBool(), }, { - "varint", - coder.NewVarInt(), + name: "varint", + c: coder.NewVarInt(), }, { - "double", - coder.NewDouble(), + name: "double", + c: coder.NewDouble(), }, { - "string", - coder.NewString(), + name: "string", + c: coder.NewString(), }, { - "foo", - foo, + name: "foo", + c: foo, }, { - "bar", - bar, + name: "bar", + c: bar, }, { - "baz", - baz, + name: "baz", + c: baz, }, { - "W", - coder.NewW(coder.NewBytes(), coder.NewGlobalWindow()), + name: "W", + c: coder.NewW(coder.NewBytes(), coder.NewGlobalWindow()), }, { - "N", - coder.NewN(coder.NewBytes()), + name: "N", + c: coder.NewN(coder.NewBytes()), }, { - "KV", - coder.NewKV([]*coder.Coder{foo, bar}), + name: "I", + c: coder.NewI(foo), }, { - "CoGBK", - coder.NewCoGBK([]*coder.Coder{foo, bar}), + name: "KV", + c: coder.NewKV([]*coder.Coder{foo, bar}), }, { - "CoGBK", - coder.NewCoGBK([]*coder.Coder{foo, bar, baz}), + name: "KV>", + c: coder.NewKV([]*coder.Coder{foo, coder.NewI(bar)}), + }, + { + name: "CoGBK", + c: coder.NewCoGBK([]*coder.Coder{foo, bar}), + equivalent: coder.NewKV([]*coder.Coder{foo, coder.NewI(bar)}), + }, + { + name: "CoGBK", + c: coder.NewCoGBK([]*coder.Coder{foo, bar, baz}), }, { name: "R[graphx.registeredNamedTypeForTest]", @@ -124,7 +134,10 @@ func TestMarshalUnmarshalCoders(t *testing.T) { if err != nil { t.Fatalf("Unmarshal(Marshal(%v)) failed: %v", test.c, err) } - if len(coders) != 1 || !test.c.Equals(coders[0]) { + if test.equivalent != nil && !test.equivalent.Equals(coders[0]) { + t.Errorf("Unmarshal(Marshal(%v)) = %v, want equivalent", test.equivalent, coders) + } + if test.equivalent == nil && 
!test.c.Equals(coders[0]) { t.Errorf("Unmarshal(Marshal(%v)) = %v, want identity", test.c, coders) } }) @@ -149,7 +162,10 @@ func TestMarshalUnmarshalCoders(t *testing.T) { if err != nil { t.Fatalf("Unmarshal(Marshal(%v)) failed: %v", test.c, err) } - if len(coders) != 1 || !test.c.Equals(coders[0]) { + if test.equivalent != nil && !test.equivalent.Equals(coders[0]) { + t.Errorf("Unmarshal(Marshal(%v)) = %v, want equivalent", test.equivalent, coders) + } + if test.equivalent == nil && !test.c.Equals(coders[0]) { t.Errorf("Unmarshal(Marshal(%v)) = %v, want identity", test.c, coders) } }) @@ -166,8 +182,11 @@ func TestMarshalUnmarshalCoders(t *testing.T) { if err != nil { t.Fatalf("DecodeCoderRef(EncodeCoderRef(%v)) failed: %v", test.c, err) } - if !test.c.Equals(got) { - t.Errorf("DecodeCoderRef(EncodeCoderRef(%v)) = %v, want identity", test.c, got) + if test.equivalent != nil && !test.equivalent.Equals(got) { + t.Errorf("DecodeCoderRef(EncodeCoderRef(%v)) = %v want equivalent", test.equivalent, got) + } + if test.equivalent == nil && !test.c.Equals(got) { + t.Errorf("DecodeCoderRef(EncodeCoderRef(%v)) = %v want identity", test.c, got) } }) } diff --git a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go index efdf4ef140f4..1bda90aa9e1c 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go @@ -16,6 +16,8 @@ package graphx import ( + "reflect" + "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/schema" v1pb "github.com/apache/beam/sdks/v2/go/pkg/beam/core/runtime/graphx/v1" @@ -128,6 +130,16 @@ func EncodeCoderRef(c *coder.Coder) (*CoderRef, error) { } return &CoderRef{Type: nullableType, Components: []*CoderRef{innerref}}, nil + case coder.Iterable: + if len(c.Components) != 1 { + return nil, errors.Errorf("bad I: %v", c) + } + innerref, err := EncodeCoderRef(c.Components[0]) + if err != nil { + return nil, err + } + return &CoderRef{Type: streamType, Components: []*CoderRef{innerref}}, nil + case coder.CoGBK: if len(c.Components) < 2 { return nil, errors.Errorf("bad CoGBK: %v", c) @@ -243,27 +255,19 @@ func DecodeCoderRef(c *CoderRef) (*coder.Coder, error) { } elm := c.Components[1] - kind := coder.KV - root := typex.KVType - - isGBK := elm.Type == streamType - if isGBK { - elm = elm.Components[0] - kind = coder.CoGBK - root = typex.CoGBKType - + if elm.Type == streamType { // TODO(https://github.com/apache/beam/issues/18032): If CoGBK with > 1 input, handle as special GBK. We expect // it to be encoded as CoGBK>. Remove this handling once // CoGBK has a first-class representation. - if refs, ok := isCoGBKList(elm); ok { + if refs, ok := isCoGBKList(elm.Components[0]); ok { values, err := DecodeCoderRefs(refs) if err != nil { return nil, err } - t := typex.New(root, append([]typex.FullType{key.T}, coder.Types(values)...)...) - return &coder.Coder{Kind: kind, T: t, Components: append([]*coder.Coder{key}, values...)}, nil + t := typex.New(typex.CoGBKType, append([]typex.FullType{key.T}, coder.Types(values)...)...) 
+ return &coder.Coder{Kind: coder.CoGBK, T: t, Components: append([]*coder.Coder{key}, values...)}, nil } } @@ -272,8 +276,8 @@ func DecodeCoderRef(c *CoderRef) (*coder.Coder, error) { return nil, err } - t := typex.New(root, key.T, value.T) - return &coder.Coder{Kind: kind, T: t, Components: []*coder.Coder{key, value}}, nil + t := typex.New(typex.KVType, key.T, value.T) + return &coder.Coder{Kind: coder.KV, T: t, Components: []*coder.Coder{key, value}}, nil case nullableType: if len(c.Components) != 1 { @@ -319,7 +323,17 @@ func DecodeCoderRef(c *CoderRef) (*coder.Coder, error) { return &coder.Coder{Kind: coder.WindowedValue, T: t, Components: []*coder.Coder{elm}, Window: w}, nil case streamType: - return nil, errors.Errorf("stream must be pair value: %+v", c) + if len(c.Components) != 1 { + return nil, errors.Errorf("bad iterable/stream: %+v", c) + } + + inner, err := DecodeCoderRef(c.Components[0]) + if err != nil { + return nil, err + } + + t := typex.New(reflect.SliceOf(inner.T.Type()), inner.T) + return &coder.Coder{Kind: coder.Iterable, T: t, Components: []*coder.Coder{inner}}, nil case rowType: subC := c.Components[0] diff --git a/sdks/go/pkg/beam/core/typex/fulltype.go b/sdks/go/pkg/beam/core/typex/fulltype.go index 386acad38185..41ef0ab09d22 100644 --- a/sdks/go/pkg/beam/core/typex/fulltype.go +++ b/sdks/go/pkg/beam/core/typex/fulltype.go @@ -108,8 +108,13 @@ func New(t reflect.Type, components ...FullType) FullType { case Container: switch t.Kind() { case reflect.Slice: - // We include the child type as a component for convenience. - return &tree{class, t, []FullType{New(t.Elem())}} + if len(components) == 0 { + // For elements without sub components, we just create with the type, this handles vanilla slices. + // We include the child type as a component for convenience. + return &tree{class, t, []FullType{New(t.Elem())}} + } + // For elements which themselves have components, we need to go deeper. 
+ return &tree{class, t, []FullType{New(t.Elem(), components[0].Components()...)}} default: panic(fmt.Sprintf("Unexpected aggregate type: %v", t)) } @@ -117,10 +122,10 @@ func New(t reflect.Type, components ...FullType) FullType { switch t { case KVType: if len(components) != 2 { - panic("Invalid number of components for KV") + panic(fmt.Sprintf("Invalid number of components for KV: %v, %v", t, components)) } if isAnyNonKVComposite(components) { - panic("Invalid to nest composites inside KV") + panic(fmt.Sprintf("Invalid to nest composite composites inside KV: %v, %v", t, components)) } return &tree{class, t, components} case WindowedValueType: @@ -133,10 +138,10 @@ func New(t reflect.Type, components ...FullType) FullType { return &tree{class, t, components} case CoGBKType: if len(components) < 2 { - panic("Invalid number of components for CoGBK") + panic(fmt.Sprintf("Invalid number of components for CoGBK: %v", t)) } if isAnyNonKVComposite(components) { - panic("Invalid to nest composites inside CoGBK") + panic(fmt.Sprintf("Invalid to nest composites inside CoGBK: %v", t)) } return &tree{class, t, components} case TimersType: @@ -221,15 +226,14 @@ func NewCoGBK(components ...FullType) FullType { // // For example: // -// SA: KV := KV -// SA: KV := KV // X bound to string by assignment -// SA: KV := KV // Assignable only if X is already bound to string -// SA: KV := KV // Not assignable under any binding -// -// Not SA: KV := KV -// Not SA: X := KV -// Not SA: GBK(X,Y) := KV +// SA: KV := KV +// SA: KV := KV // X bound to string by assignment +// SA: KV := KV // Assignable only if X is already bound to string +// SA: KV := KV // Not assignable under any binding // +// Not SA: KV := KV +// Not SA: X := KV +// Not SA: GBK(X,Y) := KV func IsStructurallyAssignable(from, to FullType) bool { switch from.Class() { case Concrete: @@ -423,6 +427,5 @@ func checkTypesNotNil(list []FullType) { // NoFiringPane return PaneInfo assigned as NoFiringPane(0x0f) func NoFiringPane() PaneInfo { - pn := PaneInfo{IsFirst: true, IsLast: true, Timing: PaneUnknown} - return pn + return PaneInfo{IsFirst: true, IsLast: true, Timing: PaneUnknown} } diff --git a/sdks/go/pkg/beam/core/typex/fulltype_test.go b/sdks/go/pkg/beam/core/typex/fulltype_test.go index 8c7c161b7010..060404ad9115 100644 --- a/sdks/go/pkg/beam/core/typex/fulltype_test.go +++ b/sdks/go/pkg/beam/core/typex/fulltype_test.go @@ -34,6 +34,10 @@ func TestIsBound(t *testing.T) { {NewKV(New(reflectx.String), New(reflect.SliceOf(reflectx.Int))), true}, {NewKV(New(reflectx.String), New(reflect.SliceOf(XType))), false}, {NewKV(New(reflectx.String), New(reflectx.String)), true}, + {NewKV(New(reflectx.String), New(reflect.SliceOf(reflectx.String))), true}, + {NewW(NewKV(New(reflectx.ByteSlice), New(reflectx.Int))), true}, + {New(reflect.SliceOf(KVType), NewKV(New(reflectx.ByteSlice), New(reflectx.Int))), true}, + {New(TimersType, New(reflectx.ByteSlice)), true}, } for _, test := range tests { @@ -44,6 +48,9 @@ func TestIsBound(t *testing.T) { } func TestIsStructurallyAssignable(t *testing.T) { + type foo int + var f foo + fooT := reflect.TypeOf(f) tests := []struct { A, B FullType Exp bool @@ -52,6 +59,7 @@ func TestIsStructurallyAssignable(t *testing.T) { {New(reflectx.Int32), New(reflectx.Int64), false}, // from Go assignability {New(reflectx.Int64), New(reflectx.Int32), false}, // from Go assignability {New(reflectx.Int), New(TType), true}, + {New(reflectx.Int), New(fooT), false}, {New(XType), New(TType), true}, {NewKV(New(XType), New(YType)), New(TType), 
false}, // T cannot match composites {NewKV(New(reflectx.Int), New(reflectx.Int)), NewCoGBK(New(reflectx.Int), New(reflectx.Int)), false}, // structural mismatch @@ -60,6 +68,8 @@ func TestIsStructurallyAssignable(t *testing.T) { {NewKV(New(reflectx.String), New(reflectx.Int)), NewKV(New(TType), New(TType)), true}, {NewKV(New(reflectx.Int), New(reflectx.Int)), NewKV(New(TType), New(TType)), true}, {NewKV(New(reflectx.Int), New(reflectx.String)), NewKV(New(TType), New(reflectx.String)), true}, + {New(reflect.SliceOf(reflectx.Int)), New(reflect.SliceOf(fooT)), false}, + {New(reflect.SliceOf(reflectx.Int)), New(TType), true}, } for _, test := range tests { @@ -103,6 +113,18 @@ func TestBindSubstitute(t *testing.T) { NewCoGBK(New(YType), New(XType)), NewCoGBK(New(XType), New(ZType)), }, + { + New(reflect.SliceOf(reflectx.String)), + New(XType), + NewKV(New(reflectx.Int), New(XType)), + NewKV(New(reflectx.Int), New(reflect.SliceOf(reflectx.String))), + }, + { + New(reflectx.String), + New(XType), + NewKV(New(reflectx.Int), New(reflect.SliceOf(XType))), + NewKV(New(reflectx.Int), New(reflect.SliceOf(reflectx.String))), + }, } for _, test := range tests { From 1335b98ce54a08fcf0f274e0fda0ecd86edbb178 Mon Sep 17 00:00:00 2001 From: Jack McCluskey <34928439+jrmccluskey@users.noreply.github.com> Date: Tue, 29 Nov 2022 15:02:38 -0500 Subject: [PATCH 019/456] Add custom inference fns to CHANGES.md (#24412) --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 65104de7db8f..293f2c200bf0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -72,6 +72,7 @@ (Python) ([#23684](https://github.com/apache/beam/pull/23684)). * Pipeline Resource Hints now supported via `--resource_hints` flag (Go) ([#23990](https://github.com/apache/beam/pull/23990)). * Make Python SDK containers reusable on portable runners by installing dependencies to temporary venvs ([BEAM-12792](https://issues.apache.org/jira/browse/BEAM-12792)). +* RunInference model handlers now support the specification of a custom inference function in Python ([#22572](https://github.com/apache/beam/issues/22572)) ## Breaking Changes From 7286f55fb5f9aabcb912703d6d7d11070abe24e5 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Tue, 29 Nov 2022 18:12:04 -0500 Subject: [PATCH 020/456] Better warning and Exception message in CalciteUtil (#24414) * toSqlTypeName still accept SqlCharType to avoid breaking change * Add logical type identifier to exception message --- .../sql/impl/utils/CalciteUtils.java | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java index 153acce03e1b..dbc5ee0df0ea 100644 --- a/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java +++ b/sdks/java/extensions/sql/src/main/java/org/apache/beam/sdk/extensions/sql/impl/utils/CalciteUtils.java @@ -40,9 +40,13 @@ import org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.sql.type.SqlTypeName; import org.joda.time.Instant; import org.joda.time.base.AbstractInstant; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** Utility methods for Calcite related operations. 
*/ public class CalciteUtils { + private static final Logger LOG = LoggerFactory.getLogger(CalciteUtils.class); + private static final long UNLIMITED_ARRAY_SIZE = -1L; // SQL has schema types that do not directly correspond to Beam Schema types. We define @@ -187,11 +191,21 @@ public static SqlTypeName toSqlTypeName(FieldType type) { typeName = BEAM_TO_CALCITE_DEFAULT_MAPPING.get(type); } if (typeName == null) { - if (type.getLogicalType() != null) { - Schema.LogicalType logicalType = type.getLogicalType(); + Schema.LogicalType logicalType = type.getLogicalType(); + if (logicalType != null) { if (logicalType instanceof PassThroughLogicalType) { // for pass through logical type, just return its base type return toSqlTypeName(logicalType.getBaseType()); + } else if ("SqlCharType".equals(logicalType.getIdentifier())) { + LOG.warn( + "SqlCharType is used in Schema. It was removed in Beam 2.44.0 and should be" + + " replaced by FixedString logical type."); + return SqlTypeName.CHAR; + } else { + throw new IllegalArgumentException( + String.format( + "Cannot find a matching Calcite SqlTypeName for Beam logical type: %s", + logicalType.getIdentifier())); } } throw new IllegalArgumentException( From 2fc974cce7b81d8c0ed15dcc761e9eef68a0d5d6 Mon Sep 17 00:00:00 2001 From: Robert Burke Date: Tue, 29 Nov 2022 15:27:18 -0800 Subject: [PATCH 021/456] List breaking change #24339 in Changes.md (#24420) --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 293f2c200bf0..e1558909518d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -84,6 +84,8 @@ TextIO) and will now need to declare the dependency. * `beam-sdks-java-core` is no longer a dependency of the Java SDK Harness. Users of a portable runner (such as Dataflow Runner v2) will need to provide this package and its dependencies. +* Slices now use the Beam Iterable Coder. This enables cross language use, but breaks pipeline updates + if a Slice type is used as a PCollection element or State API element. (Go)[#24339](https://github.com/apache/beam/issues/24339) ## Deprecations From 9a575b742e12b37f2c8f2ada6b1b2d7e4810bb7f Mon Sep 17 00:00:00 2001 From: Tianyang Hu Date: Tue, 29 Nov 2022 16:35:53 -0800 Subject: [PATCH 022/456] Allow composite output types in sql.Transform. (#24421) --- sdks/go/pkg/beam/transforms/sql/sql.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sdks/go/pkg/beam/transforms/sql/sql.go b/sdks/go/pkg/beam/transforms/sql/sql.go index 36f4a55e5baa..2a373d98240a 100644 --- a/sdks/go/pkg/beam/transforms/sql/sql.go +++ b/sdks/go/pkg/beam/transforms/sql/sql.go @@ -59,9 +59,9 @@ func Input(name string, in beam.PCollection) Option { // // There is currently no default output type, so users must set this option. // In the future, Row, once implemented, may become the default output type. -func OutputType(t reflect.Type) Option { +func OutputType(t reflect.Type, components ...typex.FullType) Option { return func(o sqlx.Options) { - o.(*options).outType = typex.New(t) + o.(*options).outType = typex.New(t, components...) } } @@ -89,11 +89,11 @@ func ExpansionAddr(addr string) Option { // // Example: // -// in := beam.Create(s, 1, 2, 3) -// out := sql.Transform(s, "SELECT COUNT(*) FROM t", -// sql.Input("t", in), -// sql.OutputType(reflect.TypeOf(int64(0)))) -// // `out` is a PCollection with a single element 3. 
+// in := beam.Create(s, 1, 2, 3) +// out := sql.Transform(s, "SELECT COUNT(*) FROM t", +// sql.Input("t", in), +// sql.OutputType(reflect.TypeOf(int64(0)))) +// // `out` is a PCollection with a single element 3. // // If an expansion service address is not provided as an option, one will be // automatically started for the transform. From 3951a7d0ad9c2bfc57e830afce43827c7e3236ab Mon Sep 17 00:00:00 2001 From: camphillips22 Date: Tue, 29 Nov 2022 19:36:18 -0500 Subject: [PATCH 023/456] Add map_windows support to Go SDK (#24307) --- CHANGES.md | 1 + sdks/go/pkg/beam/core/graph/coder/coder.go | 10 +++ sdks/go/pkg/beam/core/runtime/exec/coder.go | 42 ++++++++++++- .../pkg/beam/core/runtime/exec/coder_test.go | 3 + .../pkg/beam/core/runtime/exec/translate.go | 11 ++++ sdks/go/pkg/beam/core/runtime/exec/window.go | 50 +++++++++++++++ .../pkg/beam/core/runtime/exec/window_test.go | 63 +++++++++++++++++++ sdks/go/pkg/beam/core/runtime/graphx/coder.go | 21 +++---- .../beam/core/runtime/graphx/coder_test.go | 4 ++ .../pkg/beam/core/runtime/graphx/dataflow.go | 6 ++ .../pkg/beam/core/runtime/graphx/translate.go | 1 + 11 files changed, 198 insertions(+), 14 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e1558909518d..47d170dd905d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -73,6 +73,7 @@ * Pipeline Resource Hints now supported via `--resource_hints` flag (Go) ([#23990](https://github.com/apache/beam/pull/23990)). * Make Python SDK containers reusable on portable runners by installing dependencies to temporary venvs ([BEAM-12792](https://issues.apache.org/jira/browse/BEAM-12792)). * RunInference model handlers now support the specification of a custom inference function in Python ([#22572](https://github.com/apache/beam/issues/22572)) +* Support for `map_windows` urn added to Go SDK ([#24307](https://github.apache/beam/pull/24307)). ## Breaking Changes diff --git a/sdks/go/pkg/beam/core/graph/coder/coder.go b/sdks/go/pkg/beam/core/graph/coder/coder.go index 3ee83502f343..bd6dd8c89ff9 100644 --- a/sdks/go/pkg/beam/core/graph/coder/coder.go +++ b/sdks/go/pkg/beam/core/graph/coder/coder.go @@ -178,6 +178,11 @@ const ( KV Kind = "KV" LP Kind = "LP" // Explicitly length prefixed, likely at the runner's direction. + // IW stands for IntervalWindow and uses the short name to avoid a collision with the + // WindowCoder kind. This Kind is used when the window is provided as a value instead + // of a window for the value. + IW Kind = "IW" + Window Kind = "window" // A debug wrapper around a window coder. // CoGBK is currently equivalent to either @@ -294,6 +299,11 @@ func NewString() *Coder { return &Coder{Kind: String, T: typex.New(reflectx.String)} } +// NewIntervalWindowCoder returns a new IntervalWindow coder using the built-in scheme. +func NewIntervalWindowCoder() *Coder { + return &Coder{Kind: IW, T: typex.New(reflect.TypeOf((*struct{ Start, End int64 })(nil)).Elem())} +} + // IsW returns true iff the coder is for a WindowedValue. 
func IsW(c *Coder) bool { return c.Kind == WindowedValue diff --git a/sdks/go/pkg/beam/core/runtime/exec/coder.go b/sdks/go/pkg/beam/core/runtime/exec/coder.go index 0421b253bbe1..5adaddd28ae8 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/coder.go +++ b/sdks/go/pkg/beam/core/runtime/exec/coder.go @@ -16,13 +16,12 @@ package exec import ( + "bytes" "fmt" "io" "reflect" "strings" - "bytes" - "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/coder" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" @@ -117,6 +116,9 @@ func MakeElementEncoder(c *coder.Coder) ElementEncoder { snd: MakeElementEncoder(c.Components[1]), } + case coder.IW: + return &intervalWindowValueEncoder{} + case coder.Window: return &wrappedWindowEncoder{ enc: MakeWindowEncoder(c.Window), @@ -229,6 +231,9 @@ func MakeElementDecoder(c *coder.Coder) ElementDecoder { snd: MakeElementDecoder(c.Components[1]), } + case coder.IW: + return &intervalWindowValueDecoder{} + // The following cases are not expected to be executed in the normal // course of a pipeline, however including them here enables simpler // end to end validation of standard coders against @@ -589,7 +594,8 @@ func (c *kvDecoder) DecodeTo(r io.Reader, fv *FullValue) error { // Elm will be the decoded type. // // Example: -// KV> decodes to *FullValue{Elm: int, Elm2: *FullValue{...}} +// +// KV> decodes to *FullValue{Elm: int, Elm2: *FullValue{...}} func (c *kvDecoder) Decode(r io.Reader) (*FullValue, error) { fv := &FullValue{} if err := c.DecodeTo(r, fv); err != nil { @@ -1180,6 +1186,36 @@ func (*intervalWindowDecoder) DecodeSingle(r io.Reader) (typex.Window, error) { return window.IntervalWindow{Start: mtime.FromMilliseconds(end.Milliseconds() - int64(duration)), End: end}, nil } +type intervalWindowValueEncoder struct { + intervalWindowEncoder +} + +func (e *intervalWindowValueEncoder) Encode(v *FullValue, w io.Writer) error { + return e.EncodeSingle(v.Elm.(window.IntervalWindow), w) +} + +type intervalWindowValueDecoder struct { + intervalWindowDecoder +} + +func (d *intervalWindowValueDecoder) Decode(r io.Reader) (*FullValue, error) { + fv := &FullValue{} + err := d.DecodeTo(r, fv) + if err != nil { + return nil, err + } + return fv, nil +} + +func (d *intervalWindowValueDecoder) DecodeTo(r io.Reader, value *FullValue) error { + w, err := d.DecodeSingle(r) + if err != nil { + return err + } + value.Elm = w + return nil +} + // EncodeWindowedValueHeader serializes a windowed value header. 
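// Not part of this patch: a sketch of the new coder.IW path end to end,
// mirroring the coder_test.go case added in this commit. It assumes package
// exec's existing imports (bytes, coder, window) and carries an IntervalWindow
// as an element value rather than as the window of a value.
func intervalWindowRoundTrip() (*FullValue, error) {
	c := coder.NewIntervalWindowCoder() // Kind: coder.IW
	enc := MakeElementEncoder(c)        // selects intervalWindowValueEncoder
	dec := MakeElementDecoder(c)        // selects intervalWindowValueDecoder

	var buf bytes.Buffer
	in := &FullValue{Elm: window.IntervalWindow{Start: 0, End: 100}}
	if err := enc.Encode(in, &buf); err != nil {
		return nil, err
	}
	return dec.Decode(&buf) // Elm holds the decoded window.IntervalWindow
}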
func EncodeWindowedValueHeader(enc WindowEncoder, ws []typex.Window, t typex.EventTime, p typex.PaneInfo, w io.Writer) error { // Encoding: Timestamp, Window, Pane (header) + Element diff --git a/sdks/go/pkg/beam/core/runtime/exec/coder_test.go b/sdks/go/pkg/beam/core/runtime/exec/coder_test.go index f69aadbb2975..75d18e533cf1 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/coder_test.go +++ b/sdks/go/pkg/beam/core/runtime/exec/coder_test.go @@ -89,6 +89,9 @@ func TestCoders(t *testing.T) { }, { coder: coder.NewN(coder.NewBytes()), val: &FullValue{Elm: []byte("myBytes")}, + }, { + coder: coder.NewIntervalWindowCoder(), + val: &FullValue{Elm: window.IntervalWindow{Start: 0, End: 100}}, }, } { t.Run(fmt.Sprintf("%v", test.coder), func(t *testing.T) { diff --git a/sdks/go/pkg/beam/core/runtime/exec/translate.go b/sdks/go/pkg/beam/core/runtime/exec/translate.go index c98e48bf54d9..78cf0ef65cd6 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/translate.go +++ b/sdks/go/pkg/beam/core/runtime/exec/translate.go @@ -762,6 +762,17 @@ func (b *builder) makeLink(from string, id linkID) (Node, error) { } u = &WindowInto{UID: b.idgen.New(), Fn: wfn, Out: out[0]} + case graphx.URNMapWindows: + var fn pipepb.FunctionSpec + if err := proto.Unmarshal(payload, &fn); err != nil { + return nil, errors.Wrapf(err, "invalid SideInput payload for %v", transform) + } + mapper, err := unmarshalAndMakeWindowMapping(&fn) + if err != nil { + return nil, err + } + u = &MapWindows{UID: b.idgen.New(), Fn: mapper, Out: out[0]} + case graphx.URNFlatten: u = &Flatten{UID: b.idgen.New(), N: len(transform.Inputs), Out: out[0]} diff --git a/sdks/go/pkg/beam/core/runtime/exec/window.go b/sdks/go/pkg/beam/core/runtime/exec/window.go index 02640f5dcfad..4048d102790c 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/window.go +++ b/sdks/go/pkg/beam/core/runtime/exec/window.go @@ -23,6 +23,7 @@ import ( "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/mtime" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/graph/window" "github.com/apache/beam/sdks/v2/go/pkg/beam/core/typex" + "github.com/apache/beam/sdks/v2/go/pkg/beam/internal/errors" ) // WindowInto places each element in one or more windows. @@ -97,6 +98,55 @@ func (w *WindowInto) String() string { return fmt.Sprintf("WindowInto[%v]. Out:%v", w.Fn, w.Out.ID()) } +type MapWindows struct { + UID UnitID + Fn WindowMapper + Out Node +} + +func (m *MapWindows) ID() UnitID { + return m.UID +} + +func (m *MapWindows) Up(_ context.Context) error { + return nil +} + +func (m *MapWindows) StartBundle(ctx context.Context, id string, data DataContext) error { + return m.Out.StartBundle(ctx, id, data) +} + +func (m *MapWindows) ProcessElement(ctx context.Context, elm *FullValue, values ...ReStream) error { + w, ok := elm.Elm2.(window.IntervalWindow) + if !ok { + return errors.Errorf("not an IntervalWindow, got %T", elm.Elm2) + } + newW, err := m.Fn.MapWindow(w) + if err != nil { + return err + } + out := &FullValue{ + Elm: elm.Elm, + Elm2: newW, + Timestamp: elm.Timestamp, + Windows: elm.Windows, + Pane: elm.Pane, + } + return m.Out.ProcessElement(ctx, out, values...) +} + +func (m *MapWindows) FinishBundle(ctx context.Context) error { + return m.Out.FinishBundle(ctx) +} + +func (m *MapWindows) Down(_ context.Context) error { + return nil +} + +func (m *MapWindows) String() string { + return fmt.Sprintf("MapWindows[%v]. 
Out:%v", m.Fn, m.Out.ID()) +} + // WindowMapper defines an interface maps windows from a main input window space // to windows from a side input window space. Used during side input materialization. type WindowMapper interface { diff --git a/sdks/go/pkg/beam/core/runtime/exec/window_test.go b/sdks/go/pkg/beam/core/runtime/exec/window_test.go index 16ac6e814ef3..e0bca2a74f4d 100644 --- a/sdks/go/pkg/beam/core/runtime/exec/window_test.go +++ b/sdks/go/pkg/beam/core/runtime/exec/window_test.go @@ -16,6 +16,8 @@ package exec import ( + "context" + "math/rand" "testing" "time" @@ -170,3 +172,64 @@ func TestMapWindow(t *testing.T) { } } } + +func TestMapWindows(t *testing.T) { + tests := []struct { + name string + wFn *window.Fn + in []typex.Window + expect []typex.Window + }{ + { + "fixed2fixed", + window.NewFixedWindows(1000 * time.Millisecond), + []typex.Window{ + window.IntervalWindow{Start: 100, End: 200}, + window.IntervalWindow{Start: 100, End: 1100}, + }, + []typex.Window{ + window.IntervalWindow{Start: 0, End: 1000}, + window.IntervalWindow{Start: 1000, End: 2000}, + }, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + inV, expected := makeNoncedWindowValues(tc.in, tc.expect) + + out := &CaptureNode{UID: 1} + unit := &MapWindows{UID: 2, Fn: &windowMapper{wfn: tc.wFn}, Out: out} + a := &FixedRoot{UID: 3, Elements: inV, Out: unit} + + p, err := NewPlan(tc.name, []Unit{a, unit, out}) + if err != nil { + t.Fatalf("failed to construct plan: %s", err) + } + ctx := context.Background() + if err := p.Execute(ctx, "1", DataContext{}); err != nil { + t.Fatalf("execute failed: %s", err) + } + if err := p.Down(ctx); err != nil { + t.Fatalf("down failed: %s", err) + } + if !equalList(out.Elements, expected) { + t.Errorf("map_windows returned %v, want %v", extractValues(out.Elements...), extractValues(expected...)) + } + }) + } +} + +func makeNoncedWindowValues(in []typex.Window, expect []typex.Window) ([]MainInput, []FullValue) { + if len(in) != len(expect) { + panic("provided window slices must be the same length") + } + inV := make([]MainInput, len(in)) + expectV := make([]FullValue, len(in)) + for i := range in { + nonce := make([]byte, 4) + rand.Read(nonce) + inV[i] = MainInput{Key: makeKV(nonce, in[i])[0]} + expectV[i] = makeKV(nonce, expect[i])[0] + } + return inV, expectV +} diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder.go b/sdks/go/pkg/beam/core/runtime/graphx/coder.go index 3af820e8a2bc..86a2f9f94687 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder.go @@ -378,18 +378,14 @@ func (b *CoderUnmarshaller) makeCoder(id string, c *pipepb.Coder) (*coder.Coder, return nil, err } return coder.NewN(elm), nil - - // Special handling for window coders so they can be treated as - // a general coder. Generally window coders are not used outside of - // specific contexts, but this enables improved testing. - // Window types are not permitted to be fulltypes, so - // we use assignably equivalent anonymous struct types. case urnIntervalWindow: - w, err := b.WindowCoder(id) - if err != nil { - return nil, err - } - return &coder.Coder{Kind: coder.Window, T: typex.New(reflect.TypeOf((*struct{ Start, End int64 })(nil)).Elem()), Window: w}, nil + return coder.NewIntervalWindowCoder(), nil + + // Special handling for the global window coder so it can be treated as + // a general coder. Generally window coders are not used outside of + // specific contexts, but this enables improved testing. 
+ // Window types are not permitted to be fulltypes, so + // we use assignably equivalent anonymous struct types. case urnGlobalWindow: w, err := b.WindowCoder(id) if err != nil { @@ -528,6 +524,9 @@ func (b *CoderMarshaller) Add(c *coder.Coder) (string, error) { case coder.String: return b.internBuiltInCoder(urnStringCoder), nil + case coder.IW: + return b.internBuiltInCoder(urnIntervalWindow), nil + case coder.Row: rt := c.T.Type() s, err := schema.FromType(rt) diff --git a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go index 01c70181ce89..12ed074b91ce 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/coder_test.go @@ -85,6 +85,10 @@ func TestMarshalUnmarshalCoders(t *testing.T) { name: "baz", c: baz, }, + { + name: "IW", + c: coder.NewIntervalWindowCoder(), + }, { name: "W", c: coder.NewW(coder.NewBytes(), coder.NewGlobalWindow()), diff --git a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go index 1bda90aa9e1c..dafd3eee40ad 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/dataflow.go @@ -176,6 +176,9 @@ func EncodeCoderRef(c *coder.Coder) (*CoderRef, error) { } return &CoderRef{Type: windowedValueType, Components: []*CoderRef{elm, w}, IsWrapper: true}, nil + case coder.IW: + return &CoderRef{Type: intervalWindowType}, nil + case coder.Bytes: return &CoderRef{Type: bytesType}, nil @@ -305,6 +308,9 @@ func DecodeCoderRef(c *CoderRef) (*coder.Coder, error) { return decodeDataflowCustomCoder(subC.Type) } + case intervalWindowType: + return coder.NewIntervalWindowCoder(), nil + case windowedValueType: if len(c.Components) != 2 { return nil, errors.Errorf("bad windowed value: %+v", c) diff --git a/sdks/go/pkg/beam/core/runtime/graphx/translate.go b/sdks/go/pkg/beam/core/runtime/graphx/translate.go index 22254a38e012..498afeac8289 100644 --- a/sdks/go/pkg/beam/core/runtime/graphx/translate.go +++ b/sdks/go/pkg/beam/core/runtime/graphx/translate.go @@ -44,6 +44,7 @@ const ( URNReshuffle = "beam:transform:reshuffle:v1" URNCombinePerKey = "beam:transform:combine_per_key:v1" URNWindow = "beam:transform:window_into:v1" + URNMapWindows = "beam:transform:map_windows:v1" URNIterableSideInput = "beam:side_input:iterable:v1" URNMultimapSideInput = "beam:side_input:multimap:v1" From 3bd76cc130ffee98e6f33023936b703e0d4e1c7b Mon Sep 17 00:00:00 2001 From: AdalbertMemSQL <55380838+AdalbertMemSQL@users.noreply.github.com> Date: Wed, 30 Nov 2022 06:53:19 +0200 Subject: [PATCH 024/456] Deleted initialNumReaders paramter. (#24355) * Deleted initialNumReaders paramter. Pre-split restrictions to the maximum number of readers. 
See the discussion in the design doc for more details: https://docs.google.com/document/d/1WU-hkoZ93SaGXyOz_UtX0jXzIRl194hCId_IdmEV9jw/edit?disco=AAAAjCSqPvs * Nit reformatting * Fixed bug in splitRange function --- .../sdk/io/singlestore/SingleStoreIO.java | 41 +--------- .../singlestore/ReadWithPartitionsTest.java | 76 +++---------------- 2 files changed, 16 insertions(+), 101 deletions(-) diff --git a/sdks/java/io/singlestore/src/main/java/org/apache/beam/sdk/io/singlestore/SingleStoreIO.java b/sdks/java/io/singlestore/src/main/java/org/apache/beam/sdk/io/singlestore/SingleStoreIO.java index a698d713e1c8..6873ae6b8b37 100644 --- a/sdks/java/io/singlestore/src/main/java/org/apache/beam/sdk/io/singlestore/SingleStoreIO.java +++ b/sdks/java/io/singlestore/src/main/java/org/apache/beam/sdk/io/singlestore/SingleStoreIO.java @@ -545,8 +545,6 @@ public abstract static class ReadWithPartitions extends PTransform getRowMapper(); - abstract @Nullable Integer getInitialNumReaders(); - abstract Builder toBuilder(); @AutoValue.Builder @@ -560,8 +558,6 @@ abstract Builder setDataSourceConfiguration( abstract Builder setRowMapper(RowMapper rowMapper); - abstract Builder setInitialNumReaders(Integer initialNumReaders); - abstract ReadWithPartitions build(); } @@ -585,12 +581,6 @@ public ReadWithPartitions withRowMapper(RowMapper rowMapper) { return toBuilder().setRowMapper(rowMapper).build(); } - /** Pre-split initial restriction and start initialNumReaders reading at the very beginning. */ - public ReadWithPartitions withInitialNumReaders(Integer initialNumReaders) { - checkNotNull(initialNumReaders, "initialNumReaders can not be null"); - return toBuilder().setInitialNumReaders(initialNumReaders).build(); - } - @Override public PCollection expand(PBegin input) { DataSourceConfiguration dataSourceConfiguration = getDataSourceConfiguration(); @@ -603,10 +593,6 @@ public PCollection expand(PBegin input) { RowMapper rowMapper = getRowMapper(); Preconditions.checkArgumentNotNull(rowMapper, "withRowMapper() is required"); - int initialNumReaders = SingleStoreUtil.getArgumentWithDefault(getInitialNumReaders(), 1); - checkArgument( - initialNumReaders >= 1, "withInitialNumReaders() should be greater or equal to 1"); - String actualQuery = SingleStoreUtil.getSelectQuery(getTable(), getQuery()); Coder coder = @@ -621,11 +607,7 @@ public PCollection expand(PBegin input) { .apply( ParDo.of( new ReadWithPartitions.ReadWithPartitionsFn<>( - dataSourceConfiguration, - actualQuery, - database, - rowMapper, - initialNumReaders))) + dataSourceConfiguration, actualQuery, database, rowMapper))) .setCoder(coder); } @@ -635,19 +617,16 @@ private static class ReadWithPartitionsFn String query; String database; RowMapper rowMapper; - int initialNumReaders; ReadWithPartitionsFn( DataSourceConfiguration dataSourceConfiguration, String query, String database, - RowMapper rowMapper, - int initialNumReaders) { + RowMapper rowMapper) { this.dataSourceConfiguration = dataSourceConfiguration; this.query = query; this.database = database; this.rowMapper = rowMapper; - this.initialNumReaders = initialNumReaders; } @ProcessElement @@ -690,19 +669,8 @@ public void splitRange( @Element ParameterT element, @Restriction OffsetRange range, OutputReceiver receiver) { - long numPartitions = range.getTo() - range.getFrom(); - checkArgument( - initialNumReaders <= numPartitions, - "withInitialNumReaders() should not be greater then number of partitions in the database.\n" - + String.format( - "InitialNumReaders is %d, number of partitions in 
the database is %d", - initialNumReaders, range.getTo())); - - for (int i = 0; i < initialNumReaders; i++) { - receiver.output( - new OffsetRange( - range.getFrom() + numPartitions * i / initialNumReaders, - range.getFrom() + numPartitions * (i + 1) / initialNumReaders)); + for (long i = range.getFrom(); i < range.getTo(); i++) { + receiver.output(new OffsetRange(i, i + 1)); } } @@ -744,7 +712,6 @@ public void populateDisplayData(DisplayData.Builder builder) { builder.addIfNotNull(DisplayData.item("table", getTable())); builder.addIfNotNull( DisplayData.item("rowMapper", SingleStoreUtil.getClassNameOrNull(getRowMapper()))); - builder.addIfNotNull(DisplayData.item("initialNumReaders", getInitialNumReaders())); } } diff --git a/sdks/java/io/singlestore/src/test/java/org/apache/beam/sdk/io/singlestore/ReadWithPartitionsTest.java b/sdks/java/io/singlestore/src/test/java/org/apache/beam/sdk/io/singlestore/ReadWithPartitionsTest.java index 8b96e6f9909a..32ca35680261 100644 --- a/sdks/java/io/singlestore/src/test/java/org/apache/beam/sdk/io/singlestore/ReadWithPartitionsTest.java +++ b/sdks/java/io/singlestore/src/test/java/org/apache/beam/sdk/io/singlestore/ReadWithPartitionsTest.java @@ -148,67 +148,16 @@ public void testReadWithPartitionsWithTable() { pipeline.run(); } - @Test - public void testReadWithPartitionsWithInitialNumReaders() { - PCollection rows = - pipeline.apply( - SingleStoreIO.readWithPartitions() - .withDataSourceConfiguration(dataSourceConfiguration) - .withQuery("SELECT * FROM `t`") - .withRowMapper(new TestHelper.TestRowMapper()) - .withInitialNumReaders(2)); - - PAssert.thatSingleton(rows.apply("Count All", Count.globally())) - .isEqualTo((long) EXPECTED_ROW_COUNT); - - Iterable expectedValues = TestRow.getExpectedValues(0, EXPECTED_ROW_COUNT); - PAssert.that(rows).containsInAnyOrder(expectedValues); - - pipeline.run(); - } - - @Test - public void testReadWithPartitionsZeroInitialNumReaders() { - assertThrows( - "withInitialNumReaders() should be greater or equal to 1", - IllegalArgumentException.class, - () -> { - pipelineForErrorChecks.apply( - SingleStoreIO.readWithPartitions() - .withDataSourceConfiguration(dataSourceConfiguration) - .withTable("t") - .withInitialNumReaders(0) - .withRowMapper(new TestHelper.TestRowMapper())); - }); - } - - @Test - public void testReadWithPartitionsTooBigInitialNumReaders() { - pipelineForErrorChecks.apply( - SingleStoreIO.readWithPartitions() - .withDataSourceConfiguration(dataSourceConfiguration) - .withTable("t") - .withInitialNumReaders(100) - .withRowMapper(new TestHelper.TestRowMapper())); - - assertThrows( - "withInitialNumReaders() should not be greater then number of partitions in the database.\n" - + "InitialNumReaders is 100, number of partitions in the database is 2", - Pipeline.PipelineExecutionException.class, - () -> pipelineForErrorChecks.run().waitUntilFinish()); - } - @Test public void testReadWithPartitionsNoTableAndQuery() { assertThrows( "One of withTable() or withQuery() is required", IllegalArgumentException.class, - () -> { - pipelineForErrorChecks.apply( - SingleStoreIO.readWithPartitions() - .withDataSourceConfiguration(dataSourceConfiguration) - .withRowMapper(new TestHelper.TestRowMapper())); - }); + () -> + pipelineForErrorChecks.apply( + SingleStoreIO.readWithPartitions() + .withDataSourceConfiguration(dataSourceConfiguration) + .withRowMapper(new TestHelper.TestRowMapper()))); } @Test @@ -216,13 +165,12 @@ public void testReadWithPartitionsBothTableAndQuery() { assertThrows( "withTable() can not be used 
together with withQuery()", IllegalArgumentException.class, - () -> { - pipelineForErrorChecks.apply( - SingleStoreIO.readWithPartitions() - .withDataSourceConfiguration(dataSourceConfiguration) - .withTable("t") - .withQuery("SELECT * FROM `t`") - .withRowMapper(new TestHelper.TestRowMapper())); - }); + () -> + pipelineForErrorChecks.apply( + SingleStoreIO.readWithPartitions() + .withDataSourceConfiguration(dataSourceConfiguration) + .withTable("t") + .withQuery("SELECT * FROM `t`") + .withRowMapper(new TestHelper.TestRowMapper()))); } } From fdc70cfdc514ad080d1fb1fc31bdf9f51c2e3104 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 30 Nov 2022 10:04:29 -0500 Subject: [PATCH 025/456] Bump google.golang.org/grpc from 1.50.1 to 1.51.0 in /sdks (#24281) Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.50.1 to 1.51.0. - [Release notes](https://github.com/grpc/grpc-go/releases) - [Commits](https://github.com/grpc/grpc-go/compare/v1.50.1...v1.51.0) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 13e6c9460822..b43a71f0d559 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -55,7 +55,7 @@ require ( golang.org/x/text v0.4.0 google.golang.org/api v0.103.0 google.golang.org/genproto v0.0.0-20221027153422-115e99e71e1c - google.golang.org/grpc v1.50.1 + google.golang.org/grpc v1.51.0 google.golang.org/protobuf v1.28.1 gopkg.in/retry.v1 v1.0.3 gopkg.in/yaml.v2 v2.4.0 diff --git a/sdks/go.sum b/sdks/go.sum index 035e17c7ad06..e2faee38997e 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -1431,8 +1431,8 @@ google.golang.org/grpc v1.40.1/go.mod h1:ogyxbiOoUXAkP+4+xa6PZSE9DZgIHtSpzjDTB9K google.golang.org/grpc v1.44.0/go.mod h1:k+4IHHFw41K8+bbowsex27ge2rCb65oeWqe4jJ590SU= google.golang.org/grpc v1.45.0/go.mod h1:lN7owxKUQEqMfSyQikvvk5tf/6zMPsrK+ONuO11+0rQ= google.golang.org/grpc v1.46.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACuMGWk= -google.golang.org/grpc v1.50.1 h1:DS/BukOZWp8s6p4Dt/tOaJaTQyPyOoCcrjroHuCeLzY= -google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= +google.golang.org/grpc v1.51.0 h1:E1eGv1FTqoLIdnBCZufiSHgKjlqG6fKFf6pPWtMTh8U= +google.golang.org/grpc v1.51.0/go.mod h1:wgNDFcnuBGmxLKI/qn4T+m5BtEBYXJPvibbUPsAIPww= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From 58f7399ca24692296d66ae80bc3bd462328e320c Mon Sep 17 00:00:00 2001 From: Evgeny Antyshev Date: Wed, 30 Nov 2022 19:18:58 +0300 Subject: [PATCH 026/456] [Playground] use JAVA SDK 2.43.0 in Examples CI (#24429) --- .github/workflows/playground_examples_ci_reusable.yml | 6 +++++- playground/backend/containers/java/Dockerfile | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/playground_examples_ci_reusable.yml b/.github/workflows/playground_examples_ci_reusable.yml index d00ecd3b5bdc..7310b97b300f 100644 --- 
a/.github/workflows/playground_examples_ci_reusable.yml +++ b/.github/workflows/playground_examples_ci_reusable.yml @@ -34,7 +34,7 @@ on: type: string required: true env: - BEAM_VERSION: 2.42.0 + BEAM_VERSION: 2.43.0 jobs: check_has_examples: name: pre-check @@ -141,6 +141,10 @@ jobs: if [ -n "$SDK_TAG" ]; then opts="$opts -Psdk-tag=$SDK_TAG" fi + if [ "$SDK" == "java" ]; then + # Java uses a fixed BEAM_VERSION + opts="$opts -Pbase-image=apache/beam_java8_sdk:$BEAM_VERSION" + fi # by default (w/o -Psdk-tag) runner uses BEAM from local ./sdks # TODO Java SDK doesn't, it uses 2.42.0, fix this diff --git a/playground/backend/containers/java/Dockerfile b/playground/backend/containers/java/Dockerfile index b15807ce397c..de89e72fb99e 100644 --- a/playground/backend/containers/java/Dockerfile +++ b/playground/backend/containers/java/Dockerfile @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################### -ARG BEAM_VERSION=2.42.0 +ARG BEAM_VERSION=2.43.0 ARG BASE_IMAGE=apache/beam_java8_sdk:${BEAM_VERSION} FROM golang:1.18-bullseye AS build From e32b05408c5984184aa3f636d736d3549e5912f0 Mon Sep 17 00:00:00 2001 From: sysede <103770918+sysede@users.noreply.github.com> Date: Wed, 30 Nov 2022 12:44:38 -0500 Subject: [PATCH 027/456] Update authors.yml (#24433) --- website/www/site/data/authors.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/website/www/site/data/authors.yml b/website/www/site/data/authors.yml index 815308038fd6..3ad6eda3b514 100644 --- a/website/www/site/data/authors.yml +++ b/website/www/site/data/authors.yml @@ -244,4 +244,8 @@ iht: twitter: herraiz yichi: name: Yichi Zhang - email: yichi@apache.org \ No newline at end of file + email: yichi@apache.org +sysede: + name: Danielle Syse + email: syse@google.com + linkedin: desyse From b8b4b6c90316209a8c434df6b7c6c0c185a04de0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 30 Nov 2022 23:23:26 +0530 Subject: [PATCH 028/456] Bump cloud.google.com/go/spanner from 1.36.0 to 1.40.0 in /sdks (#24423) Bumps [cloud.google.com/go/spanner](https://github.com/googleapis/google-cloud-go) from 1.36.0 to 1.40.0. - [Release notes](https://github.com/googleapis/google-cloud-go/releases) - [Changelog](https://github.com/googleapis/google-cloud-go/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-cloud-go/compare/spanner/v1.36.0...spanner/v1.40.0) --- updated-dependencies: - dependency-name: cloud.google.com/go/spanner dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index b43a71f0d559..16c65636209d 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -61,7 +61,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 ) -require cloud.google.com/go/spanner v1.36.0 +require cloud.google.com/go/spanner v1.40.0 require ( cloud.google.com/go/bigtable v1.18.0 diff --git a/sdks/go.sum b/sdks/go.sum index e2faee38997e..2126c83bca69 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -68,8 +68,8 @@ cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIA cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= cloud.google.com/go/pubsub v1.26.0 h1:Y/HcMxVXgkUV2pYeLMUkclMg0ue6U0jVyI5xEARQ4zA= cloud.google.com/go/pubsub v1.26.0/go.mod h1:QgBH3U/jdJy/ftjPhTkyXNj543Tin1pRYcdcPRnFIRI= -cloud.google.com/go/spanner v1.36.0 h1:MYc3fKJlZZCpZymoKBqPR23Hxd1CFhH+zsQPMzeM1xI= -cloud.google.com/go/spanner v1.36.0/go.mod h1:RKVKnqXxTMDuBPAsjxohvcSTH6qiRB6E0oMljFIKPr0= +cloud.google.com/go/spanner v1.40.0 h1:Kwq37LCo7YAfnHIpIcy46jag1bfJS4VJ8WE/GfQd//Q= +cloud.google.com/go/spanner v1.40.0/go.mod h1:01LIzguGmKiwr8mur46zNOZxPjc+PeT5Y3R9ENte+v8= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= From 050288545ccfe80f6d5f660864716be81e23c05d Mon Sep 17 00:00:00 2001 From: Shubham Krishna Date: Wed, 30 Nov 2022 23:29:47 +0530 Subject: [PATCH 029/456] Add Large Language Model RunInference Example (#24350) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Large Language Model RunInference Example * Fix formatting and linting issues * Fix formatting and linting issues * Fix pylinting * Fix package import pylint issue * Fix package import pylint issue * Adapt example according to latest RunInference changes * Fix linting and formating issues * Add documentation * Add missing link * Replace closing shortcode with opening shortcode * Seperate pipeline_args and known_args * Improve docstrings and documentation * Improve docstrings and documentation Co-authored-by: Shubham Krishna <“shubham.krishna@ml6.eu”> --- .../inference/large_language_modeling/main.py | 140 ++++++++++++++++++ .../large_language_modeling/requirements.txt | 21 +++ .../ml/large-language-modeling.md | 73 +++++++++ .../content/en/documentation/ml/overview.md | 3 +- .../section-menu/en/documentation.html | 1 + 5 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 sdks/python/apache_beam/examples/inference/large_language_modeling/main.py create mode 100644 sdks/python/apache_beam/examples/inference/large_language_modeling/requirements.txt create mode 100644 website/www/site/content/en/documentation/ml/large-language-modeling.md diff --git a/sdks/python/apache_beam/examples/inference/large_language_modeling/main.py b/sdks/python/apache_beam/examples/inference/large_language_modeling/main.py new file mode 100644 index 000000000000..a373f8377e17 --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/large_language_modeling/main.py @@ -0,0 +1,140 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License + +""""A pipeline that uses RunInference to perform translation +with a T5 language model. + +This pipeline takes a list of english sentences and then uses +the T5ForConditionalGeneration from Hugging Face to translate the +english sentence into german. +""" +import argparse +import sys + +import apache_beam as beam +from apache_beam.ml.inference.base import RunInference +from apache_beam.ml.inference.pytorch_inference import PytorchModelHandlerTensor +from apache_beam.ml.inference.pytorch_inference import make_tensor_model_fn +from apache_beam.options.pipeline_options import PipelineOptions +from transformers import AutoConfig +from transformers import AutoTokenizer +from transformers import T5ForConditionalGeneration + + +class Preprocess(beam.DoFn): + def __init__(self, tokenizer: AutoTokenizer): + self._tokenizer = tokenizer + + def process(self, element): + """ + Process the raw text input to a format suitable for + T5ForConditionalGeneration model inference + + Args: + element: A string of text + + Returns: + A tokenized example that can be read by the + T5ForConditionalGeneration + """ + input_ids = self._tokenizer( + element, return_tensors="pt", padding="max_length", + max_length=512).input_ids + return input_ids + + +class Postprocess(beam.DoFn): + def __init__(self, tokenizer: AutoTokenizer): + self._tokenizer = tokenizer + + def process(self, element): + """ + Process the PredictionResult to print the translated texts + + Args: + element: The RunInference output to be processed. + """ + decoded_inputs = self._tokenizer.decode( + element.example, skip_special_tokens=True) + decoded_outputs = self._tokenizer.decode( + element.inference, skip_special_tokens=True) + print(f"{decoded_inputs} \t Output: {decoded_outputs}") + + +def parse_args(argv): + """Parses args for the workflow.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_state_dict_path", + dest="model_state_dict_path", + required=True, + help="Path to the model's state_dict.", + ) + parser.add_argument( + "--model_name", + dest="model_name", + required=True, + help="Path to the model's state_dict.", + default="t5-small", + ) + + return parser.parse_known_args(args=argv) + + +def run(): + """ + Runs the interjector pipeline which translates English sentences + into German using the RunInference API. 
""" + + known_args, pipeline_args = parse_args(sys.argv) + pipeline_options = PipelineOptions(pipeline_args) + + gen_fn = make_tensor_model_fn('generate') + model_handler = PytorchModelHandlerTensor( + state_dict_path=known_args.model_state_dict_path, + model_class=T5ForConditionalGeneration, + model_params={ + "config": AutoConfig.from_pretrained(known_args.model_name) + }, + device="cpu", + inference_fn=gen_fn) + + eng_sentences = [ + "The house is wonderful.", + "I like to work in NYC.", + "My name is Shubham.", + "I want to work for Google.", + "I am from India." + ] + task_prefix = "translate English to German: " + task_sentences = [task_prefix + sentence for sentence in eng_sentences] + tokenizer = AutoTokenizer.from_pretrained(known_args.model_name) + + # [START Pipeline] + with beam.Pipeline(options=pipeline_options) as pipeline: + _ = ( + pipeline + | "CreateInputs" >> beam.Create(task_sentences) + | "Preprocess" >> beam.ParDo(Preprocess(tokenizer=tokenizer)) + | "RunInference" >> RunInference(model_handler=model_handler) + | "PostProcess" >> beam.ParDo(Postprocess(tokenizer=tokenizer))) + # [END Pipeline] + + +if __name__ == "__main__": + run() diff --git a/sdks/python/apache_beam/examples/inference/large_language_modeling/requirements.txt b/sdks/python/apache_beam/examples/inference/large_language_modeling/requirements.txt new file mode 100644 index 000000000000..8c4ba6aeea8c --- /dev/null +++ b/sdks/python/apache_beam/examples/inference/large_language_modeling/requirements.txt @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License + +torch==1.12.1 +transformers==4.24.0 +sentencepiece==0.1.97 \ No newline at end of file diff --git a/website/www/site/content/en/documentation/ml/large-language-modeling.md b/website/www/site/content/en/documentation/ml/large-language-modeling.md new file mode 100644 index 000000000000..7db18335dd11 --- /dev/null +++ b/website/www/site/content/en/documentation/ml/large-language-modeling.md @@ -0,0 +1,73 @@ +--- +title: "Large Language Model Inference in Beam" +--- + + +# RunInference +In Apache Beam 2.40.0, Beam introduced the RunInference API, which lets you deploy a machine learning model in a Beam pipeline. A `RunInference` transform performs inference on a `PCollection` of examples using a machine learning (ML) model. The transform outputs a PCollection that contains the input examples and output predictions. For more information, see RunInference [here](https://beam.apache.org/documentation/transforms/python/elementwise/runinference/). You can also find [inference examples on GitHub](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference). 
+ + +## Using RunInference with very large models +RunInference works well on arbitrarily large models as long as they can fit on your hardware. + +This example demonstrates running inference with a `T5` language model using `RunInference` in a pipeline. `T5` is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised and supervised tasks. Each task is converted into a text-to-text format. The example uses `T5-11B`, which contains 11 billion parameters and is 45 GB in size. In order to work well on a variety of tasks, `T5` prepends a different prefix to the input corresponding to each task. For example, for translation, the input would be: `translate English to German: …` and for summarization, it would be: `summarize: …`. For more information about `T5` see the [T5 overiew](https://huggingface.co/docs/transformers/model_doc/t5) in the HuggingFace documentation. + +### Run the Pipeline ? +First, install the required packages and pass the required arguments. +You can view the code on [GitHub](https://github.com/apache/beam/tree/master/sdks/python/apache_beam/examples/inference/large_language_modeling/main.py) + +1. Locally on your machine: `python main.py --runner DirectRunner`. You need to have 45 GB of disk space available to run this example. +2. On Google Cloud using Dataflow: `python main.py --runner DataflowRunner` + +### Pipeline Steps +The pipeline contains the following steps: +1. Read the inputs. +2. Encode the text into transformer-readable token ID integers using a tokenizer. +3. Use RunInference to get the output. +4. Decode the RunInference output and print it. + +The following code snippet contains the four steps: + +{{< highlight >}} + with beam.Pipeline(options=pipeline_options) as pipeline: + _ = ( + pipeline + | "CreateInputs" >> beam.Create(task_sentences) + | "Preprocess" >> beam.ParDo(Preprocess(tokenizer=tokenizer)) + | "RunInference" >> RunInference(model_handler=model_handler) + | "PostProcess" >> beam.ParDo(Postprocess(tokenizer=tokenizer)) + ) +{{< /highlight >}} + +In the third step of pipeline we use `RunInference`. +In order to use it, you must first define a `ModelHandler`. RunInference provides model handlers for `PyTorch`, `TensorFlow` and `Scikit-Learn`. Because the example uses a `PyTorch` model, it uses the `PyTorchModelHandlerTensor` model handler. + +{{< highlight >}} + gen_fn = make_tensor_model_fn('generate') + + model_handler = PytorchModelHandlerTensor( + state_dict_path=args.model_state_dict_path, + model_class=T5ForConditionalGeneration, + model_params={"config": AutoConfig.from_pretrained(args.model_name)}, + device="cpu", + inference_fn=gen_fn) +{{< /highlight >}} + +A `ModelHandler` requires parameters like: +* `state_dict_path` – The path to the saved dictionary of the model state. +* `model_class` – The class of the Pytorch model that defines the model structure. +* `model_params` – A dictionary of arguments required to instantiate the model class. +* `device` – The device on which you wish to run the model. If device = GPU then a GPU device will be used if it is available. Otherwise, it will be CPU. +* `inference_fn` - The inference function to use during RunInference. 
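The pipeline assumes that the model's `state_dict` has already been saved to the location passed as `--model_state_dict_path`. One way to produce such a file, sketched here under the assumption that the smaller `t5-small` checkpoint and a local output filename are acceptable for a first run, is:

```python
# Sketch: save a state_dict that --model_state_dict_path can point to.
# Assumes the "t5-small" checkpoint; the 11B variant works the same way
# but needs roughly 45 GB of disk space.
import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")
torch.save(model.state_dict(), "t5_small_state_dict.pth")
```

With that file in place, a local run would look like `python main.py --runner DirectRunner --model_state_dict_path t5_small_state_dict.pth --model_name t5-small`, matching the flags defined in `main.py` above.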
diff --git a/website/www/site/content/en/documentation/ml/overview.md b/website/www/site/content/en/documentation/ml/overview.md index d8df9f02cab5..d2737f5fe383 100644 --- a/website/www/site/content/en/documentation/ml/overview.md +++ b/website/www/site/content/en/documentation/ml/overview.md @@ -89,4 +89,5 @@ You can find examples of end-to-end AI/ML pipelines for several use cases: * [ML Workflow Orchestration](/documentation/ml/orchestration): Illustrates how to orchestrate ML workflows consisting of multiple steps by using Kubeflow Pipelines and Tensorflow Extended. * [Multi model pipelines in Beam](/documentation/ml/multi-model-pipelines): Explains how multi-model pipelines work and gives an overview of what you need to know to build one using the RunInference API. * [Online Clustering in Beam](/documentation/ml/online-clustering): Demonstrates how to set up a real-time clustering pipeline that can read text from Pub/Sub, convert the text into an embedding using a transformer-based language model with the RunInference API, and cluster the text using BIRCH with stateful processing. -* [Anomaly Detection in Beam](/documentation/ml/anomaly-detection): Demonstrates how to set up an anomaly detection pipeline that reads text from Pub/Sub in real time and then detects anomalies using a trained HDBSCAN clustering model with the RunInference API. \ No newline at end of file +* [Anomaly Detection in Beam](/documentation/ml/anomaly-detection): Demonstrates how to set up an anomaly detection pipeline that reads text from Pub/Sub in real time and then detects anomalies using a trained HDBSCAN clustering model with the RunInference API. +* [Large Language Model Inference in Beam](/documentation/ml/large-language-modeling): Demonstrates a pipeline that uses RunInference to perform translation with the T5 language model which contains 11 billion parameters. \ No newline at end of file diff --git a/website/www/site/layouts/partials/section-menu/en/documentation.html b/website/www/site/layouts/partials/section-menu/en/documentation.html index c4e10a607dff..df00b9d099ac 100644 --- a/website/www/site/layouts/partials/section-menu/en/documentation.html +++ b/website/www/site/layouts/partials/section-menu/en/documentation.html @@ -220,6 +220,7 @@
           <li><a href="/documentation/ml/online-clustering">Online Clustering</a></li>
           <li><a href="/documentation/ml/runinference-metrics">RunInference Metrics</a></li>
           <li><a href="/documentation/ml/anomaly-detection">Anomaly Detection</a></li>
+          <li><a href="/documentation/ml/large-language-modeling">Large Language Model Inference in Beam</a></li>
  • From 8802f277e34458e7ca4bc887fa3160cb2086a3d0 Mon Sep 17 00:00:00 2001 From: Vladislav Chunikhin <102509589+vchunikhin@users.noreply.github.com> Date: Wed, 30 Nov 2022 22:42:54 +0400 Subject: [PATCH 030/456] [Playground] [Backend] minor fixes for error msgs (#23999) * [Playground] [Backend] minor fixes for error msgs * [Playground] [Backend] updated title message * gradlew --info * fix Co-authored-by: Evgeny Antyshev --- .../tour_of_beam_backend_integration.yml | 2 +- playground/backend/cmd/server/controller.go | 124 +++++++++--------- .../internal/db/datastore/datastore_db.go | 4 +- 3 files changed, 68 insertions(+), 62 deletions(-) diff --git a/.github/workflows/tour_of_beam_backend_integration.yml b/.github/workflows/tour_of_beam_backend_integration.yml index 66a783279c87..413694f68396 100644 --- a/.github/workflows/tour_of_beam_backend_integration.yml +++ b/.github/workflows/tour_of_beam_backend_integration.yml @@ -80,7 +80,7 @@ jobs: cache-read-only: false - name: Build Playground router image - run: ./gradlew playground:backend:containers:router:docker + run: ./gradlew -i playground:backend:containers:router:docker working-directory: ${{ env.GITHUB_WORKSPACE }} # 1. Start emulators diff --git a/playground/backend/cmd/server/controller.go b/playground/backend/cmd/server/controller.go index d18f6d6e4401..2f85bd074434 100644 --- a/playground/backend/cmd/server/controller.go +++ b/playground/backend/cmd/server/controller.go @@ -16,6 +16,7 @@ package main import ( "context" + "errors" "cloud.google.com/go/datastore" "github.com/google/uuid" @@ -27,24 +28,26 @@ import ( "beam.apache.org/playground/backend/internal/db" "beam.apache.org/playground/backend/internal/db/mapper" "beam.apache.org/playground/backend/internal/environment" - "beam.apache.org/playground/backend/internal/errors" + cerrors "beam.apache.org/playground/backend/internal/errors" "beam.apache.org/playground/backend/internal/logger" "beam.apache.org/playground/backend/internal/setup_tools/life_cycle" "beam.apache.org/playground/backend/internal/utils" ) const ( - errorTitleGetSnippet = "Error during getting snippet" - errorTitleSaveSnippet = "Error during saving snippet" - errorTitleGetCatalog = "Error during getting catalog" - errorTitleGetExample = "Error during getting example" - errorTitleGetExampleCode = "Error during getting example code" - errorTitleGetExampleOutput = "Error during getting example output" - errorTitleGetExampleLogs = "Error during getting example logs" - errorTitleGetExampleGraph = "Error during getting example graph" + errorTitleGetSnippet = "Error during getting snippet" + errorTitleSaveSnippet = "Error during saving snippet" + errorTitleGetCatalog = "Error during getting catalog" + errorTitleGetExample = "Error during getting example" + errorTitleGetExampleCode = "Error during getting example code" + errorTitleGetExampleOutput = "Error during getting example output" + errorTitleGetExampleLogs = "Error during getting example logs" + errorTitleGetExampleGraph = "Error during getting example graph" + errorTitleGetDefaultExample = "Error during getting default example" userBadCloudPathErrMsg = "Invalid cloud path parameter" userCloudConnectionErrMsg = "Cloud connection error" + resourceNotFoundErrMsg = "Resource is not found" ) // playgroundController processes `gRPC' requests from clients. 
@@ -71,12 +74,12 @@ func (controller *playgroundController) RunCode(ctx context.Context, info *pb.Ru // check for correct sdk if info.Sdk != controller.env.BeamSdkEnvs.ApacheBeamSdk { logger.Errorf("RunCode(): request contains incorrect sdk: %s\n", info.Sdk) - return nil, errors.InvalidArgumentError("Error during preparing", "Incorrect sdk. Want to receive %s, but the request contains %s", controller.env.BeamSdkEnvs.ApacheBeamSdk.String(), info.Sdk.String()) + return nil, cerrors.InvalidArgumentError("Error during preparing", "Incorrect sdk. Want to receive %s, but the request contains %s", controller.env.BeamSdkEnvs.ApacheBeamSdk.String(), info.Sdk.String()) } switch info.Sdk { case pb.Sdk_SDK_UNSPECIFIED: logger.Errorf("RunCode(): unimplemented sdk: %s\n", info.Sdk) - return nil, errors.InvalidArgumentError("Error during preparing", "Sdk is not implemented yet: %s", info.Sdk.String()) + return nil, cerrors.InvalidArgumentError("Error during preparing", "Sdk is not implemented yet: %s", info.Sdk.String()) } cacheExpirationTime := controller.env.ApplicationEnvs.CacheEnvs().KeyExpirationTime() @@ -85,29 +88,29 @@ func (controller *playgroundController) RunCode(ctx context.Context, info *pb.Ru lc, err := life_cycle.Setup(info.Sdk, info.Code, pipelineId, controller.env.ApplicationEnvs.WorkingDir(), controller.env.ApplicationEnvs.PipelinesFolder(), controller.env.BeamSdkEnvs.PreparedModDir()) if err != nil { logger.Errorf("RunCode(): error during setup file system: %s\n", err.Error()) - return nil, errors.InternalError("Error during preparing", "Error during setup file system for the code processing: %s", err.Error()) + return nil, cerrors.InternalError("Error during preparing", "Error during setup file system for the code processing: %s", err.Error()) } if err = utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.Status, pb.Status_STATUS_VALIDATING); err != nil { code_processing.DeleteFolders(pipelineId, lc) - return nil, errors.InternalError("Error during preparing", "Error during saving status of the code processing") + return nil, cerrors.InternalError("Error during preparing", "Error during saving status of the code processing") } if err = utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.RunOutputIndex, 0); err != nil { code_processing.DeleteFolders(pipelineId, lc) - return nil, errors.InternalError("Error during preparing", "Error during saving initial run output") + return nil, cerrors.InternalError("Error during preparing", "Error during saving initial run output") } if err = utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.LogsIndex, 0); err != nil { code_processing.DeleteFolders(pipelineId, lc) - return nil, errors.InternalError("Error during preparing", "Error during saving value for the logs output") + return nil, cerrors.InternalError("Error during preparing", "Error during saving value for the logs output") } if err = utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.Canceled, false); err != nil { code_processing.DeleteFolders(pipelineId, lc) - return nil, errors.InternalError("Error during preparing", "Error during saving initial cancel flag") + return nil, cerrors.InternalError("Error during preparing", "Error during saving initial cancel flag") } if err = controller.cacheService.SetExpTime(ctx, pipelineId, cacheExpirationTime); err != nil { logger.Errorf("%s: RunCode(): cache.SetExpTime(): %s\n", pipelineId, err.Error()) code_processing.DeleteFolders(pipelineId, lc) - return nil, errors.InternalError("Error during 
preparing", "Internal error") + return nil, cerrors.InternalError("Error during preparing", "Internal error") } go code_processing.Process(context.Background(), controller.cacheService, lc, pipelineId, &controller.env.ApplicationEnvs, &controller.env.BeamSdkEnvs, info.PipelineOptions) @@ -122,7 +125,7 @@ func (controller *playgroundController) CheckStatus(ctx context.Context, info *p errorMessage := "Error during getting status of the code processing" if err != nil { logger.Errorf("%s: CheckStatus(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } status, err := code_processing.GetProcessingStatus(ctx, controller.cacheService, pipelineId, errorMessage) if err != nil { @@ -137,7 +140,7 @@ func (controller *playgroundController) GetRunOutput(ctx context.Context, info * errorMessage := "Error during getting run output of the code processing" if err != nil { logger.Errorf("%s: GetRunOutput(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } lastIndex, err := code_processing.GetLastIndex(ctx, controller.cacheService, pipelineId, cache.RunOutputIndex, errorMessage) if err != nil { @@ -151,7 +154,7 @@ func (controller *playgroundController) GetRunOutput(ctx context.Context, info * if len(runOutput) > lastIndex { newRunOutput = runOutput[lastIndex:] if err := utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.RunOutputIndex, lastIndex+len(newRunOutput)); err != nil { - return nil, errors.InternalError(errorMessage, "Error during saving pagination value") + return nil, cerrors.InternalError(errorMessage, "Error during saving pagination value") } } @@ -167,7 +170,7 @@ func (controller *playgroundController) GetLogs(ctx context.Context, info *pb.Ge errorMessage := "Error during getting logs of the code processing" if err != nil { logger.Errorf("%s: %s: pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, errorTitle, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } lastIndex, err := code_processing.GetLastIndex(ctx, controller.cacheService, pipelineId, cache.LogsIndex, errorMessage) if err != nil { @@ -181,7 +184,7 @@ func (controller *playgroundController) GetLogs(ctx context.Context, info *pb.Ge if len(logs) > lastIndex { newLogs = logs[lastIndex:] if err := utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.LogsIndex, lastIndex+len(newLogs)); err != nil { - return nil, errors.InternalError(errorMessage, "Error during saving pagination value") + return nil, cerrors.InternalError(errorMessage, "Error during saving pagination value") } } @@ -196,7 +199,7 @@ func 
(controller *playgroundController) GetRunError(ctx context.Context, info *p errorMessage := "Error during getting error output of the code processing" if err != nil { logger.Errorf("%s: GetRunError(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } runError, err := code_processing.GetProcessingOutput(ctx, controller.cacheService, pipelineId, cache.RunError, errorMessage) if err != nil { @@ -211,7 +214,7 @@ func (controller *playgroundController) GetValidationOutput(ctx context.Context, errorMessage := "Error during getting compilation output" if err != nil { logger.Errorf("%s: GetValidationOutput(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } validationOutput, err := code_processing.GetProcessingOutput(ctx, controller.cacheService, pipelineId, cache.ValidationOutput, errorMessage) if err != nil { @@ -226,7 +229,7 @@ func (controller *playgroundController) GetPreparationOutput(ctx context.Context errorMessage := "Error during getting compilation output" if err != nil { logger.Errorf("%s: GetPreparationOutput(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } preparationOutput, err := code_processing.GetProcessingOutput(ctx, controller.cacheService, pipelineId, cache.PreparationOutput, errorMessage) if err != nil { @@ -241,7 +244,7 @@ func (controller *playgroundController) GetCompileOutput(ctx context.Context, in errorMessage := "Error during getting compilation output" if err != nil { logger.Errorf("%s: GetCompileOutput(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } compileOutput, err := code_processing.GetProcessingOutput(ctx, controller.cacheService, pipelineId, cache.CompileOutput, errorMessage) if err != nil { @@ -256,7 +259,7 @@ func (controller *playgroundController) GetGraph(ctx context.Context, info *pb.G errorMessage := "Error during getting graph output" if err != nil { logger.Errorf("%s: GetGraph(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + 
return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } graph, err := code_processing.GetGraph(ctx, controller.cacheService, pipelineId, errorMessage) if err != nil { @@ -271,10 +274,10 @@ func (controller *playgroundController) Cancel(ctx context.Context, info *pb.Can errorMessage := "Error during canceling the code processing" if err != nil { logger.Errorf("%s: Cancel(): pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid, err.Error()) - return nil, errors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) + return nil, cerrors.InvalidArgumentError(errorMessage, "pipelineId has incorrect value and couldn't be parsed as uuid value: %s", info.PipelineUuid) } if err := utils.SetToCache(ctx, controller.cacheService, pipelineId, cache.Canceled, true); err != nil { - return nil, errors.InternalError(errorMessage, "Error during saving cancel flag value") + return nil, cerrors.InternalError(errorMessage, "Error during saving cancel flag value") } return &pb.CancelResponse{}, nil } @@ -286,7 +289,7 @@ func (controller *playgroundController) Cancel(ctx context.Context, info *pb.Can func (controller *playgroundController) GetPrecompiledObjects(ctx context.Context, info *pb.GetPrecompiledObjectsRequest) (*pb.GetPrecompiledObjectsResponse, error) { catalog, err := controller.cacheComponent.GetCatalogFromCacheOrDatastore(ctx, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { - return nil, errors.InternalError(errorTitleGetCatalog, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetCatalog, userCloudConnectionErrMsg) } return &pb.GetPrecompiledObjectsResponse{ SdkCategories: utils.FilterCatalog(catalog, info.Sdk, info.Category), @@ -297,19 +300,19 @@ func (controller *playgroundController) GetPrecompiledObjects(ctx context.Contex func (controller *playgroundController) GetPrecompiledObject(ctx context.Context, info *pb.GetPrecompiledObjectRequest) (*pb.GetPrecompiledObjectResponse, error) { exampleId, err := utils.GetExampleID(info.GetCloudPath()) if err != nil { - return nil, errors.InvalidArgumentError(errorTitleGetExample, userBadCloudPathErrMsg) + return nil, cerrors.InvalidArgumentError(errorTitleGetExample, userBadCloudPathErrMsg) } sdks, err := controller.cacheComponent.GetSdkCatalogFromCacheOrDatastore(ctx, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { - return nil, errors.InternalError(errorTitleGetExample, err.Error()) + return nil, cerrors.InternalError(errorTitleGetExample, userCloudConnectionErrMsg) } precompiledObject, err := controller.db.GetExample(ctx, exampleId, sdks) if err != nil { switch err { case datastore.ErrNoSuchEntity: - return nil, errors.NotFoundError(errorTitleGetExample, userCloudConnectionErrMsg) + return nil, cerrors.NotFoundError(errorTitleGetExample, resourceNotFoundErrMsg) default: - return nil, errors.InternalError(errorTitleGetExample, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetExample, userCloudConnectionErrMsg) } } return &pb.GetPrecompiledObjectResponse{PrecompiledObject: precompiledObject}, nil @@ -319,15 +322,15 @@ func (controller *playgroundController) GetPrecompiledObject(ctx context.Context func (controller *playgroundController) GetPrecompiledObjectCode(ctx context.Context, info *pb.GetPrecompiledObjectCodeRequest) 
(*pb.GetPrecompiledObjectCodeResponse, error) { exampleId, err := utils.GetExampleID(info.GetCloudPath()) if err != nil { - return nil, errors.InvalidArgumentError(errorTitleGetExampleCode, userBadCloudPathErrMsg) + return nil, cerrors.InvalidArgumentError(errorTitleGetExampleCode, userBadCloudPathErrMsg) } codeString, err := controller.db.GetExampleCode(ctx, exampleId) if err != nil { switch err { case datastore.ErrNoSuchEntity: - return nil, errors.NotFoundError(errorTitleGetExampleCode, userCloudConnectionErrMsg) + return nil, cerrors.NotFoundError(errorTitleGetExampleCode, resourceNotFoundErrMsg) default: - return nil, errors.InternalError(errorTitleGetExampleCode, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetExampleCode, userCloudConnectionErrMsg) } } response := pb.GetPrecompiledObjectCodeResponse{Code: codeString} @@ -338,15 +341,15 @@ func (controller *playgroundController) GetPrecompiledObjectCode(ctx context.Con func (controller *playgroundController) GetPrecompiledObjectOutput(ctx context.Context, info *pb.GetPrecompiledObjectOutputRequest) (*pb.GetPrecompiledObjectOutputResponse, error) { exampleId, err := utils.GetExampleID(info.GetCloudPath()) if err != nil { - return nil, errors.InvalidArgumentError(errorTitleGetExampleOutput, userBadCloudPathErrMsg) + return nil, cerrors.InvalidArgumentError(errorTitleGetExampleOutput, userBadCloudPathErrMsg) } output, err := controller.db.GetExampleOutput(ctx, exampleId) if err != nil { switch err { case datastore.ErrNoSuchEntity: - return nil, errors.NotFoundError(errorTitleGetExampleOutput, userCloudConnectionErrMsg) + return nil, cerrors.NotFoundError(errorTitleGetExampleOutput, resourceNotFoundErrMsg) default: - return nil, errors.InternalError(errorTitleGetExampleOutput, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetExampleOutput, userCloudConnectionErrMsg) } } response := pb.GetPrecompiledObjectOutputResponse{Output: output} @@ -357,15 +360,15 @@ func (controller *playgroundController) GetPrecompiledObjectOutput(ctx context.C func (controller *playgroundController) GetPrecompiledObjectLogs(ctx context.Context, info *pb.GetPrecompiledObjectLogsRequest) (*pb.GetPrecompiledObjectLogsResponse, error) { exampleId, err := utils.GetExampleID(info.GetCloudPath()) if err != nil { - return nil, errors.InvalidArgumentError(errorTitleGetExampleLogs, userBadCloudPathErrMsg) + return nil, cerrors.InvalidArgumentError(errorTitleGetExampleLogs, userBadCloudPathErrMsg) } logs, err := controller.db.GetExampleLogs(ctx, exampleId) if err != nil { switch err { case datastore.ErrNoSuchEntity: - return nil, errors.NotFoundError(errorTitleGetExampleLogs, userCloudConnectionErrMsg) + return nil, cerrors.NotFoundError(errorTitleGetExampleLogs, resourceNotFoundErrMsg) default: - return nil, errors.InternalError(errorTitleGetExampleLogs, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetExampleLogs, userCloudConnectionErrMsg) } } response := pb.GetPrecompiledObjectLogsResponse{Output: logs} @@ -376,15 +379,15 @@ func (controller *playgroundController) GetPrecompiledObjectLogs(ctx context.Con func (controller *playgroundController) GetPrecompiledObjectGraph(ctx context.Context, info *pb.GetPrecompiledObjectGraphRequest) (*pb.GetPrecompiledObjectGraphResponse, error) { exampleId, err := utils.GetExampleID(info.GetCloudPath()) if err != nil { - return nil, errors.InvalidArgumentError(errorTitleGetExampleGraph, userBadCloudPathErrMsg) + return nil, 
cerrors.InvalidArgumentError(errorTitleGetExampleGraph, userBadCloudPathErrMsg) } graph, err := controller.db.GetExampleGraph(ctx, exampleId) if err != nil { switch err { case datastore.ErrNoSuchEntity: - return nil, errors.NotFoundError(errorTitleGetExampleGraph, userCloudConnectionErrMsg) + return nil, cerrors.NotFoundError(errorTitleGetExampleGraph, resourceNotFoundErrMsg) default: - return nil, errors.InternalError(errorTitleGetExampleGraph, userCloudConnectionErrMsg) + return nil, cerrors.InternalError(errorTitleGetExampleGraph, userCloudConnectionErrMsg) } } response := pb.GetPrecompiledObjectGraphResponse{Graph: graph} @@ -396,12 +399,12 @@ func (controller *playgroundController) GetDefaultPrecompiledObject(ctx context. switch info.Sdk { case pb.Sdk_SDK_UNSPECIFIED: logger.Errorf("GetDefaultPrecompiledObject(): unimplemented sdk: %s\n", info.Sdk) - return nil, errors.InvalidArgumentError("Error during preparing", "Sdk is not implemented yet: %s", info.Sdk.String()) + return nil, cerrors.InvalidArgumentError(errorTitleGetDefaultExample, "Sdk is not implemented yet: %s", info.Sdk.String()) } precompiledObject, err := controller.cacheComponent.GetDefaultPrecompiledObjectFromCacheOrDatastore(ctx, info.Sdk, controller.env.ApplicationEnvs.CacheRequestTimeout()) if err != nil { logger.Errorf("GetDefaultPrecompiledObject(): error during getting catalog: %s", err.Error()) - return nil, errors.InternalError("Error during getting Precompiled Objects", "Error with cloud connection") + return nil, cerrors.InternalError(errorTitleGetDefaultExample, userCloudConnectionErrMsg) } response := pb.GetDefaultPrecompiledObjectResponse{PrecompiledObject: precompiledObject} return &response, nil @@ -411,15 +414,15 @@ func (controller *playgroundController) GetDefaultPrecompiledObject(ctx context. func (controller *playgroundController) SaveSnippet(ctx context.Context, req *pb.SaveSnippetRequest) (*pb.SaveSnippetResponse, error) { if req.Sdk == pb.Sdk_SDK_UNSPECIFIED { logger.Errorf("SaveSnippet(): unimplemented sdk: %s\n", req.Sdk) - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "Sdk is not implemented yet: %s", req.Sdk.String()) + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "Sdk is not implemented yet: %s", req.Sdk.String()) } if controller.db == nil { logger.Error("SaveSnippet(): the runner is trying to save the snippet") - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "The runner doesn't support snippets") + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "The runner doesn't support snippets") } if req.Files == nil || len(req.Files) == 0 { logger.Error("SaveSnippet(): files are empty") - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet must have files") + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet must have files") } snippet := controller.entityMapper.ToSnippet(req) @@ -427,17 +430,17 @@ func (controller *playgroundController) SaveSnippet(ctx context.Context, req *pb for _, file := range req.Files { if file.Content == "" { logger.Error("SaveSnippet(): entity is empty") - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet must have some content") + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet must have some content") } maxSnippetSize := controller.props.MaxSnippetSize if len(file.Content) > int(maxSnippetSize) { logger.Errorf("SaveSnippet(): entity is too large. 
Max entity size: %d symbols", maxSnippetSize) - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet size is more than %d symbols", maxSnippetSize) + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "Snippet size is more than %d symbols", maxSnippetSize) } fileEntity, err := controller.entityMapper.ToFileEntity(req, file) if err != nil { logger.Errorf("SaveSnippet(): file has wrong properties, err: %s", err.Error()) - return nil, errors.InvalidArgumentError(errorTitleSaveSnippet, "File content is invalid") + return nil, cerrors.InvalidArgumentError(errorTitleSaveSnippet, "File content is invalid") } snippet.Files = append(snippet.Files, fileEntity) } @@ -445,11 +448,11 @@ func (controller *playgroundController) SaveSnippet(ctx context.Context, req *pb id, err := snippet.ID() if err != nil { logger.Errorf("SaveSnippet(): ID(): error during ID generation: %s", err.Error()) - return nil, errors.InternalError(errorTitleSaveSnippet, "Failed to generate ID") + return nil, cerrors.InternalError(errorTitleSaveSnippet, "Failed to generate ID") } if err = controller.db.PutSnippet(ctx, id, snippet); err != nil { logger.Errorf("SaveSnippet(): PutSnippet(): error during entity saving: %s", err.Error()) - return nil, errors.InternalError(errorTitleSaveSnippet, "Failed to save a snippet entity") + return nil, cerrors.InternalError(errorTitleSaveSnippet, "Failed to save a snippet entity") } response := pb.SaveSnippetResponse{Id: id} @@ -460,12 +463,15 @@ func (controller *playgroundController) SaveSnippet(ctx context.Context, req *pb func (controller *playgroundController) GetSnippet(ctx context.Context, info *pb.GetSnippetRequest) (*pb.GetSnippetResponse, error) { if controller.db == nil { logger.Error("GetSnippet(): the runner is trying to read the snippet") - return nil, errors.InvalidArgumentError(errorTitleGetSnippet, "The runner doesn't support snippets") + return nil, cerrors.InvalidArgumentError(errorTitleGetSnippet, "The runner doesn't support snippets") } snippet, err := controller.db.GetSnippet(ctx, info.GetId()) if err != nil { logger.Errorf("GetSnippet(): error during getting the snippet: %s", err.Error()) - return nil, errors.InternalError(errorTitleGetSnippet, "Failed to retrieve the snippet") + if errors.Is(err, datastore.ErrNoSuchEntity) { + return nil, cerrors.NotFoundError(errorTitleGetSnippet, resourceNotFoundErrMsg) + } + return nil, cerrors.InternalError(errorTitleGetSnippet, "Failed to retrieve the snippet") } response := pb.GetSnippetResponse{ @@ -476,7 +482,7 @@ func (controller *playgroundController) GetSnippet(ctx context.Context, info *pb files, err := controller.db.GetFiles(ctx, info.GetId(), snippet.NumberOfFiles) if err != nil { logger.Errorf("GetSnippet(): GetFiles(): error during getting files: %s", err.Error()) - return nil, errors.InternalError(errorTitleGetSnippet, "Failed to retrieve files") + return nil, cerrors.InternalError(errorTitleGetSnippet, "Failed to retrieve files") } for _, file := range files { response.Files = append(response.Files, &pb.SnippetFile{ diff --git a/playground/backend/internal/db/datastore/datastore_db.go b/playground/backend/internal/db/datastore/datastore_db.go index b1e79dcc7cea..f0169fcae021 100644 --- a/playground/backend/internal/db/datastore/datastore_db.go +++ b/playground/backend/internal/db/datastore/datastore_db.go @@ -273,8 +273,8 @@ func (d *Datastore) GetDefaultExamples(ctx context.Context, sdks []*entity.SDKEn } if len(examples) == 0 { - logger.Error("no examples") - return nil, fmt.Errorf("no 
examples") + logger.Error("no default example") + return nil, fmt.Errorf("no default example") } //Retrieving snippets From a997f9c28bb4f86c09944e2ad31812db6e0b6723 Mon Sep 17 00:00:00 2001 From: Dmitry Repin Date: Wed, 30 Nov 2022 23:10:35 +0400 Subject: [PATCH 031/456] pg_24284_now_closing_parenthesis on cancel button is visible (#24327) --- .../lib/src/widgets/run_button.dart | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/playground/frontend/playground_components/lib/src/widgets/run_button.dart b/playground/frontend/playground_components/lib/src/widgets/run_button.dart index 9aabaeb6b1f4..66ac02247979 100644 --- a/playground/frontend/playground_components/lib/src/widgets/run_button.dart +++ b/playground/frontend/playground_components/lib/src/widgets/run_button.dart @@ -53,12 +53,17 @@ class RunButton extends StatelessWidget { child: ShortcutTooltip( shortcut: playgroundController.runShortcut, child: ElevatedButton.icon( + style: const ButtonStyle( + padding: MaterialStatePropertyAll(EdgeInsets.zero), + ), icon: isRunning ? SizedBox( width: BeamIconSizes.small, height: BeamIconSizes.small, child: CircularProgressIndicator( - color: Theme.of(context).extension()!.primaryBackgroundTextColor, + color: Theme.of(context) + .extension()! + .primaryBackgroundTextColor, ), ) : const Icon(Icons.play_arrow), @@ -67,7 +72,8 @@ class RunButton extends StatelessWidget { builder: (context, AsyncSnapshot state) { final seconds = (state.data ?? 0) / kMsToSec; final runText = 'widgets.runOrCancelButton.titles.run'.tr(); - final cancelText = 'widgets.runOrCancelButton.titles.cancel'.tr(); + final cancelText = + 'widgets.runOrCancelButton.titles.cancel'.tr(); final buttonText = isRunning ? cancelText : runText; if (seconds > 0) { return Text( From 012c885af43f6c7d6a6e87fd0adabbfc2ce100d6 Mon Sep 17 00:00:00 2001 From: Benjamin Gonzalez <74670721+benWize@users.noreply.github.com> Date: Wed, 30 Nov 2022 13:54:13 -0600 Subject: [PATCH 032/456] [Github Actions] - Cut Release Branch Workflow (#24020) --- .github/workflows/cut_release_branch.yml | 111 +++++++++++++++++++++++ CI.md | 6 ++ 2 files changed, 117 insertions(+) create mode 100644 .github/workflows/cut_release_branch.yml diff --git a/.github/workflows/cut_release_branch.yml b/.github/workflows/cut_release_branch.yml new file mode 100644 index 000000000000..e89b8da829c4 --- /dev/null +++ b/.github/workflows/cut_release_branch.yml @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This workflow will update apache beam master branch with next release version +# and cut release branch for current development version. 
+ +# To learn more about GitHub Actions in Apache Beam check the CI.md + +name: Cut Release Branch +on: + workflow_dispatch: + inputs: + RELEASE_VERSION: + description: Beam version of current release + required: true + NEXT_VERSION: + description: Next release version + required: true + +jobs: + update_master: + runs-on: [self-hosted, ubuntu-20.04] + env: + MASTER_BRANCH: master + NEXT_RELEASE: ${{ github.event.inputs.NEXT_VERSION }} + SCRIPT_DIR: ./release/src/main/scripts + steps: + - name: Validate Next Version + run: | + if [[ $NEXT_RELEASE =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then + echo "NEXT_VERSION_IN_BASE_BRANCH=${BASH_REMATCH[1]}" >> $GITHUB_ENV + else + echo "The input for NEXT_RELEASE does not match a valid format [0-9]+\.[0-9]+\.[0-9]+" + exit 1 + fi + - name: Check out code + uses: actions/checkout@v3 + - name: Set git config + run: | + git config user.name $GITHUB_ACTOR + git config user.email actions@"$RUNNER_NAME".local + - name: Update master branch + run: | + bash "${SCRIPT_DIR}/set_version.sh" "${NEXT_VERSION_IN_BASE_BRANCH}" + echo "==============Update master branch as following================" + git diff + echo "===============================================================" + - name: Commit and Push to master branch files with Next Version + run: | + git add * + git commit -m "Moving to ${NEXT_VERSION_IN_BASE_BRANCH}-SNAPSHOT on master branch." + git push origin ${MASTER_BRANCH} + + update_release_branch: + needs: update_master + runs-on: [self-hosted, ubuntu-20.04] + env: + RELEASE: ${{ github.event.inputs.RELEASE_VERSION }} + steps: + - name: Validate Release Version + run: | + if [[ ${RELEASE} =~ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then + echo "RELEASE_VERSION=${BASH_REMATCH[1]}" >> $GITHUB_ENV + echo "RELEASE_BRANCH=release-${RELEASE}" >> $GITHUB_ENV + else + echo "The input for RELEASE does not match a valid format [0-9]+\.[0-9]+\.[0-9]+" + exit 1 + fi + - name: Check out code + uses: actions/checkout@v3 + - name: Set git config + run: | + git config user.name $GITHUB_ACTOR + git config user.email actions@"$RUNNER_NAME".local + - name: Checkout to release branch + run: | + git checkout -b ${RELEASE_BRANCH} + echo "==================Current working branch=======================" + echo ${RELEASE_BRANCH} + echo "===============================================================" + - name: Update release version for dataflow runner + run: | + sed -i -e "s/'beam-master-.*'/'${RELEASE}'/g" \ + runners/google-cloud-dataflow-java/build.gradle + echo "===============Update release branch as following==============" + git diff + echo "===============================================================" + - name: Commit and Push to release branch + run: | + git add runners/google-cloud-dataflow-java/build.gradle + git commit -m "Set Dataflow container to release version." 
+ git push --set-upstream origin ${RELEASE_BRANCH} + + + + + diff --git a/CI.md b/CI.md index cb26171654b2..2f000a3c76dc 100644 --- a/CI.md +++ b/CI.md @@ -127,6 +127,12 @@ Service Account shall have following permissions ([IAM roles](https://cloud.goog ### Release Preparation and Validation Workflows +#### Cut Release Branch - [verify_release_build.yml](.github/workflows/cut_release_branch.yml) +| Job | Description | Pull Request Run | Direct Push/Merge Run | Scheduled Run | Requires GCP Credentials | +|-----------------------|------------------------------------------------------------|------------------|-----------------------|---------------|--------------------------| +| Update Master | Update Apache Beam master branch with next release version | No | No | No | No | +| Update Release Branch | Cut release branch for current development version | No | No | No | No | + #### Verify Release Build - [verify_release_build.yml](.github/workflows/verify_release_build.yml) | Job | Description | Pull Request Run | Direct Push/Merge Run | Scheduled Run | Requires GCP Credentials | From 6be2eed456374870ce44fe227d99ada5c77ac118 Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 30 Nov 2022 14:58:29 -0500 Subject: [PATCH 033/456] Add six to build-requirements.txt (#24434) --- sdks/python/build-requirements.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sdks/python/build-requirements.txt b/sdks/python/build-requirements.txt index 80ce36de8c8a..1a17eb29fa1f 100644 --- a/sdks/python/build-requirements.txt +++ b/sdks/python/build-requirements.txt @@ -18,6 +18,11 @@ # TODO(https://github.com/apache/beam/issues/20051): Consider PEP-517/PEP-518 instead of this file. setuptools +# grpcio-tools depends on grpcio and the grpcio>1.50.0 results in error(ImportModuleError six) +# when installing Apache Beam source via pip install -e . +# Adding six as part of build dependencies. 
+# https://github.com/apache/beam/issues/24432 +six wheel>=0.36.0 grpcio-tools==1.37.0 From 8a42014f9a416df476bd66a7835edaaed2c44edc Mon Sep 17 00:00:00 2001 From: Anand Inguva <34158215+AnandInguva@users.noreply.github.com> Date: Wed, 30 Nov 2022 16:34:59 -0500 Subject: [PATCH 034/456] Add Pytorch RunInference GPU benchmark (#24347) * Add Pytorch RunInference GPU benchmark * Add option to specify python version for load tests * Update to Apache beam 2.42 in TensorRT docker file * Fix spotless groovy * Update Readme with GPU test * Update Readme * Fix groovy * Fix whitespace lint * Changes based on comments --- .test-infra/jenkins/LoadTestsBuilder.groovy | 15 ++++-- .../jenkins/PythonTestProperties.groovy | 1 + .../job_InferenceBenchmarkTests_Python.groovy | 42 ++++++++++++++- .../inference/pytorch_image_classification.py | 5 +- .../testing/benchmarks/inference/README.md | 52 +++++++++++++++---- ...pytorch_image_classification_benchmarks.py | 5 +- .../tensor_rt.dockerfile | 4 +- 7 files changed, 102 insertions(+), 22 deletions(-) diff --git a/.test-infra/jenkins/LoadTestsBuilder.groovy b/.test-infra/jenkins/LoadTestsBuilder.groovy index db83ff0546d2..b5b0b664acff 100644 --- a/.test-infra/jenkins/LoadTestsBuilder.groovy +++ b/.test-infra/jenkins/LoadTestsBuilder.groovy @@ -43,7 +43,8 @@ class LoadTestsBuilder { static void loadTest(context, String title, Runner runner, SDK sdk, Map options, - String mainClass, List jobSpecificSwitches = null, String requirementsTxtFile = null) { + String mainClass, List jobSpecificSwitches = null, String requirementsTxtFile = null, + String pythonVersion = null) { options.put('runner', runner.option) InfluxDBCredentialsHelper.useCredentials(context) @@ -52,7 +53,7 @@ class LoadTestsBuilder { gradle { rootBuildScriptDir(commonJobProperties.checkoutDir) setGradleTask(delegate, runner, sdk, options, mainClass, - jobSpecificSwitches, requirementsTxtFile) + jobSpecificSwitches, requirementsTxtFile, pythonVersion) commonJobProperties.setGradleSwitches(delegate) } } @@ -92,7 +93,8 @@ class LoadTestsBuilder { } private static void setGradleTask(context, Runner runner, SDK sdk, Map options, - String mainClass, List jobSpecificSwitches, String requirementsTxtFile = null) { + String mainClass, List jobSpecificSwitches, String requirementsTxtFile = null, + String pythonVersion = null) { context.tasks(getGradleTaskName(sdk)) context.switches("-PloadTest.mainClass=\"${mainClass}\"") context.switches("-Prunner=${runner.getDependencyBySDK(sdk)}") @@ -107,7 +109,12 @@ class LoadTestsBuilder { } if (sdk == SDK.PYTHON) { - context.switches("-PpythonVersion=${LOAD_TEST_PYTHON_VERSION}") + if (pythonVersion == null) { + context.switches("-PpythonVersion=${LOAD_TEST_PYTHON_VERSION}") + } + else { + context.switches("-PpythonVersion=${pythonVersion}") + } } } diff --git a/.test-infra/jenkins/PythonTestProperties.groovy b/.test-infra/jenkins/PythonTestProperties.groovy index 4a72619bc10a..f6ea06455608 100644 --- a/.test-infra/jenkins/PythonTestProperties.groovy +++ b/.test-infra/jenkins/PythonTestProperties.groovy @@ -38,6 +38,7 @@ class PythonTestProperties { final static List CROSS_LANGUAGE_VALIDATES_RUNNER_DATAFLOW_USING_SQL_PYTHON_VERSIONS = [HIGHEST_SUPPORTED] final static List VALIDATES_CONTAINER_DATAFLOW_PYTHON_VERSIONS = ALL_SUPPORTED_VERSIONS final static String LOAD_TEST_PYTHON_VERSION = '3.7' + final static String RUN_INFERENCE_TEST_PYTHON_VERSION = '3.8' final static String CHICAGO_TAXI_EXAMPLE_FLINK_PYTHON_VERSION = '3.7' // Use for various shell scripts triggered 
by Jenkins. // Gradle scripts should use project.ext.pythonVersion defined by PythonNature/BeamModulePlugin. diff --git a/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy b/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy index de1915ce990c..975c9c6a7f23 100644 --- a/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy +++ b/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy @@ -20,6 +20,8 @@ import CommonJobProperties as commonJobProperties import LoadTestsBuilder as loadTestsBuilder import PhraseTriggeringPostCommitBuilder import CronJobBuilder +import static PythonTestProperties.RUN_INFERENCE_TEST_PYTHON_VERSION + def now = new Date().format("MMddHHmmss", TimeZone.getTimeZone('UTC')) @@ -50,6 +52,7 @@ def loadTestConfigurations = { influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, pretrained_model_name : 'resnet101', + device : 'CPU', input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt', model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet101.pth', output : 'gs://temp-storage-for-end-to-end-tests/torch/result_101' + now + '.txt' @@ -78,6 +81,7 @@ def loadTestConfigurations = { influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, pretrained_model_name : 'resnet152', + device : 'CPU', input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt', model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth', output : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152' + now + '.txt' @@ -106,6 +110,7 @@ def loadTestConfigurations = { influx_measurement : 'torch_language_modeling_bert_base_uncased', influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, + device : 'CPU', input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt', bert_tokenizer : 'bert-base-uncased', model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth', @@ -134,12 +139,44 @@ def loadTestConfigurations = { influx_measurement : 'torch_language_modeling_bert_large_uncased', influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, + device : 'CPU', input_file : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt', bert_tokenizer : 'bert-large-uncased', model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-large-uncased.pth', output : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased' + now + '.txt' ] ], + [ + title : 'Pytorch Imagenet Classification with Resnet 152 with Tesla T4 GPU', + test : 'apache_beam.testing.benchmarks.inference.pytorch_image_classification_benchmarks', + runner : CommonTestProperties.Runner.DATAFLOW, + pipelineOptions: [ + job_name : 'benchmark-tests-pytorch-imagenet-python-gpu' + now, + project : 'apache-beam-testing', + region : 'us-central1', + machine_type : 'n1-standard-2', + num_workers : 75, // this could be lower as the quota for the apache-beam-testing project is 32 T4 GPUs as of November 28th, 2022. 
+ disk_size_gb : 50, + autoscaling_algorithm : 'NONE', + staging_location : 'gs://temp-storage-for-perf-tests/loadtests', + temp_location : 'gs://temp-storage-for-perf-tests/loadtests', + requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt', + publish_to_big_query : true, + metrics_dataset : 'beam_run_inference', + metrics_table : 'torch_inference_imagenet_results_resnet152_tesla_t4', + input_options : '{}', // this option is not required for RunInference tests. + influx_measurement : 'torch_inference_imagenet_resnet152_tesla_t4', + influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName, + influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl, + pretrained_model_name : 'resnet152', + device : 'GPU', + experiments : 'worker_accelerator=type:nvidia-tesla-t4;count:1;install-nvidia-driver', + sdk_container_image : 'us.gcr.io/apache-beam-testing/python-postcommit-it/tensor_rt:latest', + input_file : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt', + model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth', + output : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152_gpu' + now + '.txt' + ] + ], ] } @@ -147,14 +184,15 @@ def loadTestJob = { scope -> List testScenarios = loadTestConfigurations() for (Map testConfig: testScenarios){ commonJobProperties.setTopLevelMainJobProperties(scope, 'master', 180) - loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test, null, testConfig.pipelineOptions.requirements_file) + loadTestsBuilder.loadTest(scope, testConfig.title, testConfig.runner, CommonTestProperties.SDK.PYTHON, testConfig.pipelineOptions, testConfig.test, null, + testConfig.pipelineOptions.requirements_file, RUN_INFERENCE_TEST_PYTHON_VERSION) } } PhraseTriggeringPostCommitBuilder.postCommitJob( 'beam_Inference_Python_Benchmarks_Dataflow', 'Run Inference Benchmarks', - 'Inference benchmarks on Dataflow(\"Run Inference Benchmarks"\"")', + 'RunInference benchmarks on Dataflow(\"Run Inference Benchmarks"\"")', this ) { loadTestJob(delegate) diff --git a/sdks/python/apache_beam/examples/inference/pytorch_image_classification.py b/sdks/python/apache_beam/examples/inference/pytorch_image_classification.py index 1b335dc0b6ab..65f21ceaa318 100644 --- a/sdks/python/apache_beam/examples/inference/pytorch_image_classification.py +++ b/sdks/python/apache_beam/examples/inference/pytorch_image_classification.py @@ -108,6 +108,7 @@ def run( model_class=None, model_params=None, save_main_session=True, + device='CPU', test_pipeline=None) -> PipelineResult: """ Args: @@ -117,6 +118,7 @@ def run( These will be used to instantiate the model object in the RunInference API. save_main_session: Used for internal testing. + device: Device to be used on the Runner. Choices are (CPU, GPU). test_pipeline: Used for internal testing. 
""" known_args, pipeline_args = parse_known_args(argv) @@ -138,7 +140,8 @@ def batch_elements_kwargs(self): PytorchModelHandlerTensorWithBatchSize( state_dict_path=known_args.model_state_dict_path, model_class=model_class, - model_params=model_params)) + model_params=model_params, + device=device)) pipeline = test_pipeline if not test_pipeline: diff --git a/sdks/python/apache_beam/testing/benchmarks/inference/README.md b/sdks/python/apache_beam/testing/benchmarks/inference/README.md index 9ef269c73a56..12c817bd1226 100644 --- a/sdks/python/apache_beam/testing/benchmarks/inference/README.md +++ b/sdks/python/apache_beam/testing/benchmarks/inference/README.md @@ -38,16 +38,34 @@ the following metrics: - Mean Load Model Latency - the average amount of time it takes to load a model. This is done once per DoFn instance on worker startup, so the cost is amortized across the pipeline. +These metrics are published to InfluxDB and BigQuery. + +### Pytorch Image Classification Tests + +* Pytorch Image Classification with Resnet 101. + * machine_type: n1-standard-2 + * num_workers: 75 + * autoscaling_algorithm: NONE + * disk_size_gb: 50 + +* Pytorch Image Classification with Resnet 152. + * machine_type: n1-standard-2 + * num_workers: 75 + * autoscaling_algorithm: NONE + * disk_size_gb: 50 + +* Pytorch Imagenet Classification with Resnet 152 with Tesla T4 GPU. + * machine_type: + * CPU: n1-standard-2 + * GPU: NVIDIA Tesla T4 + * num_workers: 75 + * autoscaling_algorithm: NONE + * disk_size_gb: 50 + Approximate size of the models used in the tests * resnet101: 170.5 MB * resnet152: 230.4 MB -The above tests are configured to run using following configurations - * machine_type: n1-standard-2 - * num_workers: 75 - * autoscaling_algorithm: NONE - * disk_size_gb: 50 - ## Pytorch RunInference Language Modeling The Pytorch RunInference Language Modeling benchmark runs an @@ -62,12 +80,24 @@ the following metrics: - Mean Load Model Latency - the average amount of time it takes to load a model. This is done once per DoFn instance on worker startup, so the cost is amortized across the pipeline. +These metrics are published to InfluxDB and BigQuery. + +### Pytorch Language Modeling Tests + +* Pytorch Langauge Modeling using Hugging Face bert-base-uncased model. + * machine_type: n1-standard-2 + * num_workers: 250 + * autoscaling_algorithm: NONE + * disk_size_gb: 50 + +* Pytorch Langauge Modeling using Hugging Face bert-large-uncased model. + * machine_type: n1-standard-2 + * num_workers: 250 + * autoscaling_algorithm: NONE + * disk_size_gb: 50 + Approximate size of the models used in the tests * bert-base-uncased: 417.7 MB * bert-large-uncased: 1.2 GB -The above tests are configured to run using following configurations - * machine_type: n1-standard-2 - * num_workers: 250 - * autoscaling_algorithm: NONE - * disk_size_gb: 75 +All the performance tests are defined at [job_InferenceBenchmarkTests_Python.groovy](https://github.com/apache/beam/blob/master/.test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy). 
diff --git a/sdks/python/apache_beam/testing/benchmarks/inference/pytorch_image_classification_benchmarks.py b/sdks/python/apache_beam/testing/benchmarks/inference/pytorch_image_classification_benchmarks.py index eafa9fde38dd..514c9d672850 100644 --- a/sdks/python/apache_beam/testing/benchmarks/inference/pytorch_image_classification_benchmarks.py +++ b/sdks/python/apache_beam/testing/benchmarks/inference/pytorch_image_classification_benchmarks.py @@ -54,12 +54,13 @@ def test(self): extra_opts = {} extra_opts['input'] = self.pipeline.get_option('input_file') - + device = self.pipeline.get_option('device') self.result = pytorch_image_classification.run( self.pipeline.get_full_options_as_args(**extra_opts), model_class=model_class, model_params=model_params, - test_pipeline=self.pipeline) + test_pipeline=self.pipeline, + device=device) if __name__ == '__main__': diff --git a/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile b/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile index 7e0f0575ddf3..73594cf94d0a 100644 --- a/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile +++ b/sdks/python/test-suites/containers/tensorrt_runinference/tensor_rt.dockerfile @@ -22,8 +22,8 @@ ENV PATH="/usr/src/tensorrt/bin:${PATH}" WORKDIR /workspace -RUN pip install --no-cache-dir apache-beam[gcp]==2.40.0 -COPY --from=apache/beam_python3.8_sdk:2.40.0 /opt/apache/beam /opt/apache/beam +RUN pip install --no-cache-dir apache-beam[gcp]==2.42.0 +COPY --from=apache/beam_python3.8_sdk:2.42.0 /opt/apache/beam /opt/apache/beam RUN pip install --upgrade pip \ && pip install torch>=1.7.1 \ From 9915ec466b217dc79ae47d0d318f73ad174da626 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 30 Nov 2022 17:08:53 -0500 Subject: [PATCH 035/456] Fix multiple mutations affecting the same entity in Datastore write (#24403) * Fix multiple mutations affecting the same entity in Datastore write * Use entity key instead of mutation hash value to decide flush * Fix typos Datatore->Datastore --- .../sdk/io/gcp/datastore/DatastoreV1.java | 9 ++-- .../sdk/io/gcp/datastore/DatastoreV1Test.java | 50 +++++++++++++++++-- .../gcp/datastore/RampupThrottlingFnTest.java | 9 ++-- 3 files changed, 55 insertions(+), 13 deletions(-) diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java index 6a7dc725fbc3..a8870e4d6128 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1.java @@ -1485,7 +1485,7 @@ static class DatastoreWriterFn extends DoFn { private final V1DatastoreFactory datastoreFactory; // Current batch of mutations to be written. private final List mutations = new ArrayList<>(); - private final HashSet uniqueMutations = new HashSet<>(); + private final HashSet uniqueMutationKeys = new HashSet<>(); private int mutationsSize = 0; // Accumulated size of protos in mutations. 
private WriteBatcher writeBatcher; private transient AdaptiveThrottler adaptiveThrottler; @@ -1547,7 +1547,7 @@ public void processElement(ProcessContext c) throws Exception { Mutation write = c.element(); int size = write.getSerializedSize(); - if (!uniqueMutations.add(c.element())) { + if (!uniqueMutationKeys.add(write.getUpsert().getKey())) { flushBatch(); } @@ -1579,7 +1579,8 @@ public void finishBundle() throws Exception { * @throws DatastoreException if the commit fails or IOException or InterruptedException if * backing off between retries fails. */ - private void flushBatch() throws DatastoreException, IOException, InterruptedException { + private synchronized void flushBatch() + throws DatastoreException, IOException, InterruptedException { LOG.debug("Writing batch of {} mutations", mutations.size()); Sleeper sleeper = Sleeper.DEFAULT; BackOff backoff = BUNDLE_WRITE_BACKOFF.backoff(); @@ -1654,7 +1655,7 @@ private void flushBatch() throws DatastoreException, IOException, InterruptedExc } LOG.debug("Successfully wrote {} mutations", mutations.size()); mutations.clear(); - uniqueMutations.clear(); + uniqueMutationKeys.clear(); mutationsSize = 0; } diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1Test.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1Test.java index 4aed59c4da38..3280e17998e4 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1Test.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/DatastoreV1Test.java @@ -77,6 +77,7 @@ import java.util.HashMap; import java.util.List; import java.util.Set; +import java.util.UUID; import org.apache.beam.runners.core.metrics.MetricsContainerImpl; import org.apache.beam.runners.core.metrics.MonitoringInfoConstants; import org.apache.beam.runners.core.metrics.MonitoringInfoMetricName; @@ -552,7 +553,7 @@ public void testDatastoreWriteFnDisplayData() { /** Tests {@link DatastoreWriterFn} with entities less than one batch. */ @Test - public void testDatatoreWriterFnWithOneBatch() throws Exception { + public void testDatastoreWriterFnWithOneBatch() throws Exception { datastoreWriterFnTest(100); verifyMetricWasSet("BatchDatastoreWrite", "ok", "", 2); } @@ -561,7 +562,7 @@ public void testDatatoreWriterFnWithOneBatch() throws Exception { * Tests {@link DatastoreWriterFn} with entities of more than one batches, but not a multiple. */ @Test - public void testDatatoreWriterFnWithMultipleBatches() throws Exception { + public void testDatastoreWriterFnWithMultipleBatches() throws Exception { datastoreWriterFnTest(DatastoreV1.DATASTORE_BATCH_UPDATE_ENTITIES_START * 3 + 100); verifyMetricWasSet("BatchDatastoreWrite", "ok", "", 5); } @@ -571,7 +572,7 @@ public void testDatatoreWriterFnWithMultipleBatches() throws Exception { * write batch size. */ @Test - public void testDatatoreWriterFnWithBatchesExactMultiple() throws Exception { + public void testDatastoreWriterFnWithBatchesExactMultiple() throws Exception { datastoreWriterFnTest(DatastoreV1.DATASTORE_BATCH_UPDATE_ENTITIES_START * 2); verifyMetricWasSet("BatchDatastoreWrite", "ok", "", 2); } @@ -611,7 +612,7 @@ private void datastoreWriterFnTest(int numMutations) throws Exception { * Tests {@link DatastoreWriterFn} with large entities that need to be split into more batches. 
*/ @Test - public void testDatatoreWriterFnWithLargeEntities() throws Exception { + public void testDatastoreWriterFnWithLargeEntities() throws Exception { List mutations = new ArrayList<>(); int entitySize = 0; for (int i = 0; i < 12; ++i) { @@ -651,9 +652,48 @@ public void testDatatoreWriterFnWithLargeEntities() throws Exception { } } + /** Tests {@link DatastoreWriterFn} correctly flushes batch upon receive same entity keys. */ + @Test + public void testDatastoreWriterFnWithDuplicateEntities() throws Exception { + List mutations = new ArrayList<>(); + for (int i : Arrays.asList(0, 1, 0, 2)) { + // this will generate entities having key 0, 1, 0, 2 and random values + mutations.add( + makeUpsert( + Entity.newBuilder() + .setKey(makeKey("key" + i)) + .putProperties("value", makeValue(UUID.randomUUID().toString()).build()) + .build()) + .build()); + } + + DatastoreWriterFn datastoreWriter = + new DatastoreWriterFn( + StaticValueProvider.of(PROJECT_ID), + null, + mockDatastoreFactory, + new FakeWriteBatcher()); + DoFnTester doFnTester = DoFnTester.of(datastoreWriter); + doFnTester.setCloningBehavior(CloningBehavior.DO_NOT_CLONE); + doFnTester.processBundle(mutations); + + // first invocation has key [0, 1] + CommitRequest.Builder commitRequest = CommitRequest.newBuilder(); + commitRequest.setMode(CommitRequest.Mode.NON_TRANSACTIONAL); + commitRequest.addAllMutations(mutations.subList(0, 2)); + verify(mockDatastore, times(1)).commit(commitRequest.build()); + + // second invocation has key [0, 2] because the second 0 triggered a flush batch + commitRequest = CommitRequest.newBuilder(); + commitRequest.setMode(CommitRequest.Mode.NON_TRANSACTIONAL); + commitRequest.addAllMutations(mutations.subList(2, 4)); + verify(mockDatastore, times(1)).commit(commitRequest.build()); + verifyMetricWasSet("BatchDatastoreWrite", "ok", "", 2); + } + /** Tests {@link DatastoreWriterFn} with a failed request which is retried. 
*/ @Test - public void testDatatoreWriterFnRetriesErrors() throws Exception { + public void testDatastoreWriterFnRetriesErrors() throws Exception { List mutations = new ArrayList<>(); int numRpcs = 2; for (int i = 0; i < DatastoreV1.DATASTORE_BATCH_UPDATE_ENTITIES_START * numRpcs; ++i) { diff --git a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFnTest.java b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFnTest.java index bb1377c83983..9eb3c2b66d34 100644 --- a/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFnTest.java +++ b/sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/datastore/RampupThrottlingFnTest.java @@ -21,6 +21,7 @@ import static org.mockito.Mockito.verify; import java.util.Map; +import java.util.UUID; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.testing.TestPipeline; import org.apache.beam.sdk.transforms.Create; @@ -52,7 +53,7 @@ public class RampupThrottlingFnTest { verify(mockCounter).inc(millis); throw new RampupDelayException(); }; - private DoFnTester rampupThrottlingFnTester; + private DoFnTester rampupThrottlingFnTester; @Before public void setUp() throws Exception { @@ -62,8 +63,8 @@ public void setUp() throws Exception { TestPipeline pipeline = TestPipeline.create(); PCollectionView startTimeView = pipeline.apply(Create.of(Instant.now())).apply(View.asSingleton()); - RampupThrottlingFn rampupThrottlingFn = - new RampupThrottlingFn(1, startTimeView) { + RampupThrottlingFn rampupThrottlingFn = + new RampupThrottlingFn(1, startTimeView) { @Override @Setup public void setup() { @@ -101,7 +102,7 @@ public void testRampupThrottler() throws Exception { for (Map.Entry entry : rampupSchedule.entrySet()) { DateTimeUtils.setCurrentMillisFixed(entry.getKey().getMillis()); for (int i = 0; i < entry.getValue(); i++) { - rampupThrottlingFnTester.processElement(null); + rampupThrottlingFnTester.processElement(UUID.randomUUID().toString()); } assertThrows(RampupDelayException.class, () -> rampupThrottlingFnTester.processElement(null)); } From 8349ee6c12addef56035ae1f12291a239175cc92 Mon Sep 17 00:00:00 2001 From: Yi Hu Date: Wed, 30 Nov 2022 20:18:30 -0500 Subject: [PATCH 036/456] Fix BlobstorageIO.checksum Attribute Error (#24442) --- sdks/python/apache_beam/io/azure/blobstorageio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/io/azure/blobstorageio.py b/sdks/python/apache_beam/io/azure/blobstorageio.py index c614ad64ab37..948dda32b8f3 100644 --- a/sdks/python/apache_beam/io/azure/blobstorageio.py +++ b/sdks/python/apache_beam/io/azure/blobstorageio.py @@ -375,7 +375,7 @@ def checksum(self, path): path: Azure Blob Storage file path pattern in the form azfs:////[name]. """ - return self._blob_properties(path).properties.etag + return self._blob_properties(path).etag def _status(self, path): """For internal use only; no backwards-compatibility guarantees. From 61f966740e90a38b941a719d46f7e6db5225da43 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 30 Nov 2022 21:18:57 -0800 Subject: [PATCH 037/456] Bump github.com/tetratelabs/wazero in /sdks (#24453) Bumps [github.com/tetratelabs/wazero](https://github.com/tetratelabs/wazero) from 1.0.0-pre.3 to 1.0.0-pre.4. 
- [Release notes](https://github.com/tetratelabs/wazero/releases) - [Commits](https://github.com/tetratelabs/wazero/compare/v1.0.0-pre.3...v1.0.0-pre.4) --- updated-dependencies: - dependency-name: github.com/tetratelabs/wazero dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- sdks/go.mod | 2 +- sdks/go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdks/go.mod b/sdks/go.mod index 16c65636209d..a258fe28630e 100644 --- a/sdks/go.mod +++ b/sdks/go.mod @@ -65,7 +65,7 @@ require cloud.google.com/go/spanner v1.40.0 require ( cloud.google.com/go/bigtable v1.18.0 - github.com/tetratelabs/wazero v1.0.0-pre.3 + github.com/tetratelabs/wazero v1.0.0-pre.4 ) require ( diff --git a/sdks/go.sum b/sdks/go.sum index 2126c83bca69..686957229ac0 100644 --- a/sdks/go.sum +++ b/sdks/go.sum @@ -873,8 +873,8 @@ github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG github.com/tchap/go-patricia v2.2.6+incompatible/go.mod h1:bmLyhP68RS6kStMGxByiQ23RP/odRBOTVjwp2cDyi6I= github.com/testcontainers/testcontainers-go v0.15.0 h1:3Ex7PUGFv0b2bBsdOv6R42+SK2qoZnWBd21LvZYhUtQ= github.com/testcontainers/testcontainers-go v0.15.0/go.mod h1:PkohMRH2X8Hib0IWtifVexDfLPVT+tb5E9hsf7cW12w= -github.com/tetratelabs/wazero v1.0.0-pre.3 h1:Z5fbogMUGcERzaQb9mQU8+yJSy0bVvv2ce3dfR4wcZg= -github.com/tetratelabs/wazero v1.0.0-pre.3/go.mod h1:M8UDNECGm/HVjOfq0EOe4QfCY9Les1eq54IChMLETbc= +github.com/tetratelabs/wazero v1.0.0-pre.4 h1:RBJQT5OzmORkSp6MmZDWoFEr0zXjk4pmvMKAdeUnsaI= +github.com/tetratelabs/wazero v1.0.0-pre.4/go.mod h1:u8wrFmpdrykiFK0DFPiFm5a4+0RzsdmXYVtijBKqUVo= github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= From 57e5b69f45bfc11c6060a76888364383ac71d9a4 Mon Sep 17 00:00:00 2001 From: nancyxu123 Date: Wed, 30 Nov 2022 22:34:50 -0800 Subject: [PATCH 038/456] [BEAM-12164] Support querying against Postgres for the SpannerIO change streams connector (#24390) * Initial commit * Added all * SecondCommit * thirdCommit * commitFive * Addresswd Thiago's comments * Disable integration test * Addressed more comments * formatted files * Removed testing changes * Update ChangeStreamRecordMapper.java * Update ChangeStreamDao.java * Update SpannerChangeStreamPostgresIT.java * Update ChangeStreamDao.java * fix failing tests * fixed errors Co-authored-by: Nancy Xu --- .../beam/sdk/io/gcp/spanner/SpannerIO.java | 36 +- .../spanner/changestreams/NameGenerator.java | 21 +- .../action/QueryChangeStreamAction.java | 2 +- .../changestreams/dao/ChangeStreamDao.java | 73 ++- .../dao/ChangeStreamResultSet.java | 18 + .../spanner/changestreams/dao/DaoFactory.java | 22 +- .../dao/PartitionMetadataAdminDao.java | 119 ++-- .../dao/PartitionMetadataDao.java | 228 +++++--- .../mapper/ChangeStreamRecordMapper.java | 283 ++++++++-- .../changestreams/mapper/MapperFactory.java | 9 +- .../changestreams/NameGeneratorTest.java | 7 +- .../SpannerChangeStreamErrorTest.java | 128 +++-- .../action/QueryChangeStreamActionTest.java | 8 +- .../dao/PartitionMetadataAdminDaoTest.java | 29 +- 
.../dao/PartitionMetadataDaoTest.java | 7 +- .../changestreams/it/IntegrationTestEnv.java | 176 ++++-- .../it/SpannerChangeStreamPostgresIT.java | 274 +++++++++ .../mapper/ChangeStreamRecordMapperTest.java | 520 +++++++++++++++--- .../changestreams/util/TestJsonMapper.java | 246 +++++++++ 19 files changed, 1847 insertions(+), 359 deletions(-) create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/it/SpannerChangeStreamPostgresIT.java create mode 100644 sdks/java/io/google-cloud-platform/src/test/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/util/TestJsonMapper.java diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java index d584f3039860..913d98ff8d18 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/SpannerIO.java @@ -35,6 +35,7 @@ import com.google.cloud.ServiceFactory; import com.google.cloud.Timestamp; import com.google.cloud.spanner.AbortedException; +import com.google.cloud.spanner.DatabaseClient; import com.google.cloud.spanner.DatabaseId; import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.ErrorCode; @@ -1598,10 +1599,11 @@ && getInclusiveStartAt().toSqlTimestamp().after(getInclusiveEndAt().toSqlTimesta getMetadataInstance(), changeStreamDatabaseId.getInstanceId().getInstance()); final String partitionMetadataDatabaseId = MoreObjects.firstNonNull(getMetadataDatabase(), changeStreamDatabaseId.getDatabase()); - final String partitionMetadataTableName = - MoreObjects.firstNonNull( - getMetadataTable(), generatePartitionMetadataTableName(partitionMetadataDatabaseId)); - + final DatabaseId fullPartitionMetadataDatabaseId = + DatabaseId.of( + getSpannerConfig().getProjectId().get(), + partitionMetadataInstanceId, + partitionMetadataDatabaseId); SpannerConfig changeStreamSpannerConfig = getSpannerConfig(); // Set default retryable errors for ReadChangeStream if (changeStreamSpannerConfig.getRetryableCodes() == null) { @@ -1628,6 +1630,21 @@ && getInclusiveStartAt().toSqlTimestamp().after(getInclusiveEndAt().toSqlTimesta .setInstanceId(StaticValueProvider.of(partitionMetadataInstanceId)) .setDatabaseId(StaticValueProvider.of(partitionMetadataDatabaseId)) .build(); + Dialect changeStreamDatabaseDialect = getDialect(changeStreamSpannerConfig); + Dialect metadataDatabaseDialect = getDialect(partitionMetadataSpannerConfig); + LOG.info( + "The Spanner database " + + changeStreamDatabaseId + + " has dialect " + + changeStreamDatabaseDialect); + LOG.info( + "The Spanner database " + + fullPartitionMetadataDatabaseId + + " has dialect " + + metadataDatabaseDialect); + final String partitionMetadataTableName = + MoreObjects.firstNonNull( + getMetadataTable(), generatePartitionMetadataTableName(partitionMetadataDatabaseId)); final String changeStreamName = getChangeStreamName(); final Timestamp startTimestamp = getInclusiveStartAt(); // Uses (Timestamp.MAX - 1ns) at max for end timestamp, because we add 1ns to transform the @@ -1636,7 +1653,7 @@ && getInclusiveStartAt().toSqlTimestamp().after(getInclusiveEndAt().toSqlTimesta getInclusiveEndAt().compareTo(MAX_INCLUSIVE_END_AT) > 0 ? 
MAX_INCLUSIVE_END_AT : getInclusiveEndAt(); - final MapperFactory mapperFactory = new MapperFactory(); + final MapperFactory mapperFactory = new MapperFactory(changeStreamDatabaseDialect); final ChangeStreamMetrics metrics = new ChangeStreamMetrics(); final RpcPriority rpcPriority = MoreObjects.firstNonNull(getRpcPriority(), RpcPriority.HIGH); final DaoFactory daoFactory = @@ -1646,7 +1663,9 @@ && getInclusiveStartAt().toSqlTimestamp().after(getInclusiveEndAt().toSqlTimesta partitionMetadataSpannerConfig, partitionMetadataTableName, rpcPriority, - input.getPipeline().getOptions().getJobName()); + input.getPipeline().getOptions().getJobName(), + changeStreamDatabaseDialect, + metadataDatabaseDialect); final ActionFactory actionFactory = new ActionFactory(); final InitializeDoFn initializeDoFn = @@ -1696,6 +1715,11 @@ && getInclusiveStartAt().toSqlTimestamp().after(getInclusiveEndAt().toSqlTimesta } } + private static Dialect getDialect(SpannerConfig spannerConfig) { + DatabaseClient databaseClient = SpannerAccessor.getOrCreate(spannerConfig).getDatabaseClient(); + return databaseClient.getDialect(); + } + /** * Interface to display the name of the metadata table on Dataflow UI. This is only used for * internal purpose. This should not be used to pass the name of the metadata table. diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/NameGenerator.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/NameGenerator.java index 26e036259a63..322e85cb07a2 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/NameGenerator.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/NameGenerator.java @@ -25,23 +25,28 @@ */ public class NameGenerator { - private static final String PARTITION_METADATA_TABLE_NAME_FORMAT = - "CDC_Partitions_Metadata_%s_%s"; + private static final String PARTITION_METADATA_TABLE_NAME_FORMAT = "Metadata_%s_%s"; + private static final int MAX_TABLE_NAME_LENGTH = 63; /** * Generates an unique name for the partition metadata table in the form of {@code - * "CDC_Partitions_Metadata__"}. + * "Metadata__"}. * * @param databaseId The database id where the table will be created * @return the unique generated name of the partition metadata table */ public static String generatePartitionMetadataTableName(String databaseId) { - // Maximum Spanner table name length is 128 characters. - // There are 25 characters in the name format. + // There are 11 characters in the name format. // Maximum Spanner database ID length is 30 characters. // UUID always generates a String with 36 characters. - // 128 - (25 + 30 + 36) = 37 characters short of the limit - return String.format(PARTITION_METADATA_TABLE_NAME_FORMAT, databaseId, UUID.randomUUID()) - .replaceAll("-", "_"); + // Since the Postgres table name length is 63, we may need to truncate the table name depending + // on the database length. 
+ String fullString = + String.format(PARTITION_METADATA_TABLE_NAME_FORMAT, databaseId, UUID.randomUUID()) + .replaceAll("-", "_"); + if (fullString.length() < MAX_TABLE_NAME_LENGTH) { + return fullString; + } + return fullString.substring(0, MAX_TABLE_NAME_LENGTH); } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java index f3b895371a6d..5fd39a6b13fc 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/action/QueryChangeStreamAction.java @@ -172,7 +172,7 @@ public ProcessContinuation run( while (resultSet.next()) { final List records = changeStreamRecordMapper.toChangeStreamRecords( - updatedPartition, resultSet.getCurrentRowAsStruct(), resultSet.getMetadata()); + updatedPartition, resultSet, resultSet.getMetadata()); Optional maybeContinuation; for (final ChangeStreamRecord record : records) { diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamDao.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamDao.java index c08e14c3de50..3ef9c13f4714 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamDao.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamDao.java @@ -19,6 +19,7 @@ import com.google.cloud.Timestamp; import com.google.cloud.spanner.DatabaseClient; +import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Options; import com.google.cloud.spanner.Options.RpcPriority; import com.google.cloud.spanner.ResultSet; @@ -35,6 +36,7 @@ public class ChangeStreamDao { private final DatabaseClient databaseClient; private final RpcPriority rpcPriority; private final String jobName; + private final Dialect dialect; /** * Constructs a change stream dao. All the queries performed by this class will be for the given @@ -50,11 +52,13 @@ public class ChangeStreamDao { String changeStreamName, DatabaseClient databaseClient, RpcPriority rpcPriority, - String jobName) { + String jobName, + Dialect dialect) { this.changeStreamName = changeStreamName; this.databaseClient = databaseClient; this.rpcPriority = rpcPriority; this.jobName = jobName; + this.dialect = dialect; } /** @@ -84,33 +88,54 @@ public ChangeStreamResultSet changeStreamQuery( final String partitionTokenOrNull = InitialPartition.isInitialPartition(partitionToken) ? 
null : partitionToken; - final String query = - "SELECT * FROM READ_" - + changeStreamName - + "(" - + " start_timestamp => @startTimestamp," - + " end_timestamp => @endTimestamp," - + " partition_token => @partitionToken," - + " read_options => null," - + " heartbeat_milliseconds => @heartbeatMillis" - + ")"; + String query = ""; + Statement statement; + if (this.isPostgres()) { + query = + "SELECT * FROM \"spanner\".\"read_json_" + changeStreamName + "\"($1, $2, $3, $4, null)"; + statement = + Statement.newBuilder(query) + .bind("p1") + .to(startTimestamp) + .bind("p2") + .to(endTimestamp) + .bind("p3") + .to(partitionTokenOrNull) + .bind("p4") + .to(heartbeatMillis) + .build(); + } else { + query = + "SELECT * FROM READ_" + + changeStreamName + + "(" + + " start_timestamp => @startTimestamp," + + " end_timestamp => @endTimestamp," + + " partition_token => @partitionToken," + + " read_options => null," + + " heartbeat_milliseconds => @heartbeatMillis" + + ")"; + statement = + Statement.newBuilder(query) + .bind("startTimestamp") + .to(startTimestamp) + .bind("endTimestamp") + .to(endTimestamp) + .bind("partitionToken") + .to(partitionTokenOrNull) + .bind("heartbeatMillis") + .to(heartbeatMillis) + .build(); + } final ResultSet resultSet = databaseClient .singleUse() - .executeQuery( - Statement.newBuilder(query) - .bind("startTimestamp") - .to(startTimestamp) - .bind("endTimestamp") - .to(endTimestamp) - .bind("partitionToken") - .to(partitionTokenOrNull) - .bind("heartbeatMillis") - .to(heartbeatMillis) - .build(), - Options.priority(rpcPriority), - Options.tag("job=" + jobName)); + .executeQuery(statement, Options.priority(rpcPriority), Options.tag("job=" + jobName)); return new ChangeStreamResultSet(resultSet); } + + private boolean isPostgres() { + return this.dialect == Dialect.POSTGRESQL; + } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamResultSet.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamResultSet.java index 0ed690a33b48..f4ffba598a4b 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamResultSet.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/ChangeStreamResultSet.java @@ -99,6 +99,8 @@ public boolean next() { *
    If {@link ChangeStreamResultSet#next()} was not called or if it was called but there are no * more records in the stream, null will be returned. * + *
    Should only be used for GoogleSQL databases. + * * @return a change stream record as a {@link Struct} or null */ public Struct getCurrentRowAsStruct() { @@ -106,6 +108,22 @@ public Struct getCurrentRowAsStruct() { return resultSet.getCurrentRowAsStruct(); } + /** + * Returns the record at the current pointer as {@link JsonB}. It also updates the timestamp at + * which the record was read. + * + *
    If {@link ChangeStreamResultSet#next()} was not called or if it was called but there are no + * more records in the stream, null will be returned. + * + *
    Should only be used for PostgreSQL databases. + * + * @return a change stream record as a {@link Struct} or null + */ + public String getPgJsonb(int index) { + recordReadAt = Timestamp.now(); + return resultSet.getPgJsonb(index); + } + /** * Returns the gathered metadata for the change stream query so far. * diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/DaoFactory.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/DaoFactory.java index 43b581480dc4..0b40ddaccad9 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/DaoFactory.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/DaoFactory.java @@ -18,6 +18,7 @@ package org.apache.beam.sdk.io.gcp.spanner.changestreams.dao; import com.google.cloud.spanner.DatabaseAdminClient; +import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Options.RpcPriority; import java.io.Serializable; import org.apache.beam.sdk.io.gcp.spanner.SpannerAccessor; @@ -46,6 +47,8 @@ public class DaoFactory implements Serializable { private final String partitionMetadataTableName; private final RpcPriority rpcPriority; private final String jobName; + private final Dialect spannerChangeStreamDatabaseDialect; + private final Dialect metadataDatabaseDialect; /** * Constructs a {@link DaoFactory} with the configuration to be used for the underlying instances. @@ -63,7 +66,9 @@ public DaoFactory( SpannerConfig metadataSpannerConfig, String partitionMetadataTableName, RpcPriority rpcPriority, - String jobName) { + String jobName, + Dialect spannerChangeStreamDatabaseDialect, + Dialect metadataDatabaseDialect) { if (metadataSpannerConfig.getInstanceId() == null) { throw new IllegalArgumentException("Metadata instance can not be null"); } @@ -76,6 +81,8 @@ public DaoFactory( this.partitionMetadataTableName = partitionMetadataTableName; this.rpcPriority = rpcPriority; this.jobName = jobName; + this.spannerChangeStreamDatabaseDialect = spannerChangeStreamDatabaseDialect; + this.metadataDatabaseDialect = metadataDatabaseDialect; } /** @@ -95,7 +102,8 @@ public synchronized PartitionMetadataAdminDao getPartitionMetadataAdminDao() { databaseAdminClient, metadataSpannerConfig.getInstanceId().get(), metadataSpannerConfig.getDatabaseId().get(), - partitionMetadataTableName); + partitionMetadataTableName, + this.metadataDatabaseDialect); } return partitionMetadataAdminDao; } @@ -112,7 +120,9 @@ public synchronized PartitionMetadataDao getPartitionMetadataDao() { if (partitionMetadataDaoInstance == null) { partitionMetadataDaoInstance = new PartitionMetadataDao( - this.partitionMetadataTableName, spannerAccessor.getDatabaseClient()); + this.partitionMetadataTableName, + spannerAccessor.getDatabaseClient(), + this.metadataDatabaseDialect); } return partitionMetadataDaoInstance; } @@ -129,7 +139,11 @@ public synchronized ChangeStreamDao getChangeStreamDao() { if (changeStreamDaoInstance == null) { changeStreamDaoInstance = new ChangeStreamDao( - this.changeStreamName, spannerAccessor.getDatabaseClient(), rpcPriority, jobName); + this.changeStreamName, + spannerAccessor.getDatabaseClient(), + rpcPriority, + jobName, + this.spannerChangeStreamDatabaseDialect); } return changeStreamDaoInstance; } diff --git 
a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataAdminDao.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataAdminDao.java index ef3f4416d6f7..8f0951d4ac59 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataAdminDao.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataAdminDao.java @@ -19,6 +19,7 @@ import com.google.api.gax.longrunning.OperationFuture; import com.google.cloud.spanner.DatabaseAdminClient; +import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.SpannerException; import com.google.cloud.spanner.SpannerExceptionFactory; import com.google.spanner.admin.database.v1.UpdateDatabaseDdlMetadata; @@ -84,6 +85,7 @@ public class PartitionMetadataAdminDao { private final String instanceId; private final String databaseId; private final String tableName; + private final Dialect dialect; /** * Constructs the partition metadata admin dao. @@ -98,11 +100,13 @@ public class PartitionMetadataAdminDao { DatabaseAdminClient databaseAdminClient, String instanceId, String databaseId, - String tableName) { + String tableName, + Dialect dialect) { this.databaseAdminClient = databaseAdminClient; this.instanceId = instanceId; this.databaseId = databaseId; this.tableName = tableName; + this.dialect = dialect; } /** @@ -113,38 +117,76 @@ public class PartitionMetadataAdminDao { * PartitionMetadataAdminDao#TTL_AFTER_PARTITION_FINISHED_DAYS} days. */ public void createPartitionMetadataTable() { - final String metadataCreateStmt = - "CREATE TABLE " - + tableName - + " (" - + COLUMN_PARTITION_TOKEN - + " STRING(MAX) NOT NULL," - + COLUMN_PARENT_TOKENS - + " ARRAY NOT NULL," - + COLUMN_START_TIMESTAMP - + " TIMESTAMP NOT NULL," - + COLUMN_END_TIMESTAMP - + " TIMESTAMP NOT NULL," - + COLUMN_HEARTBEAT_MILLIS - + " INT64 NOT NULL," - + COLUMN_STATE - + " STRING(MAX) NOT NULL," - + COLUMN_WATERMARK - + " TIMESTAMP NOT NULL," - + COLUMN_CREATED_AT - + " TIMESTAMP NOT NULL OPTIONS (allow_commit_timestamp=true)," - + COLUMN_SCHEDULED_AT - + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," - + COLUMN_RUNNING_AT - + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," - + COLUMN_FINISHED_AT - + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," - + ") PRIMARY KEY (PartitionToken)," - + " ROW DELETION POLICY (OLDER_THAN(" - + COLUMN_FINISHED_AT - + ", INTERVAL " - + TTL_AFTER_PARTITION_FINISHED_DAYS - + " DAY))"; + String metadataCreateStmt = ""; + if (this.isPostgres()) { + // Literals need be added around literals to preserve casing. 
+ metadataCreateStmt = + "CREATE TABLE \"" + + tableName + + "\"(\"" + + COLUMN_PARTITION_TOKEN + + "\" text NOT NULL,\"" + + COLUMN_PARENT_TOKENS + + "\" text[] NOT NULL,\"" + + COLUMN_START_TIMESTAMP + + "\" timestamptz NOT NULL,\"" + + COLUMN_END_TIMESTAMP + + "\" timestamptz NOT NULL,\"" + + COLUMN_HEARTBEAT_MILLIS + + "\" BIGINT NOT NULL,\"" + + COLUMN_STATE + + "\" text NOT NULL,\"" + + COLUMN_WATERMARK + + "\" timestamptz NOT NULL,\"" + + COLUMN_CREATED_AT + + "\" SPANNER.COMMIT_TIMESTAMP NOT NULL,\"" + + COLUMN_SCHEDULED_AT + + "\" SPANNER.COMMIT_TIMESTAMP,\"" + + COLUMN_RUNNING_AT + + "\" SPANNER.COMMIT_TIMESTAMP,\"" + + COLUMN_FINISHED_AT + + "\" SPANNER.COMMIT_TIMESTAMP," + + " PRIMARY KEY (\"PartitionToken\")" + + ")" + + " TTL INTERVAL '" + + TTL_AFTER_PARTITION_FINISHED_DAYS + + " days' ON \"" + + COLUMN_FINISHED_AT + + "\""; + } else { + metadataCreateStmt = + "CREATE TABLE " + + tableName + + " (" + + COLUMN_PARTITION_TOKEN + + " STRING(MAX) NOT NULL," + + COLUMN_PARENT_TOKENS + + " ARRAY NOT NULL," + + COLUMN_START_TIMESTAMP + + " TIMESTAMP NOT NULL," + + COLUMN_END_TIMESTAMP + + " TIMESTAMP NOT NULL," + + COLUMN_HEARTBEAT_MILLIS + + " INT64 NOT NULL," + + COLUMN_STATE + + " STRING(MAX) NOT NULL," + + COLUMN_WATERMARK + + " TIMESTAMP NOT NULL," + + COLUMN_CREATED_AT + + " TIMESTAMP NOT NULL OPTIONS (allow_commit_timestamp=true)," + + COLUMN_SCHEDULED_AT + + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," + + COLUMN_RUNNING_AT + + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," + + COLUMN_FINISHED_AT + + " TIMESTAMP OPTIONS (allow_commit_timestamp=true)," + + ") PRIMARY KEY (PartitionToken)," + + " ROW DELETION POLICY (OLDER_THAN(" + + COLUMN_FINISHED_AT + + ", INTERVAL " + + TTL_AFTER_PARTITION_FINISHED_DAYS + + " DAY))"; + } OperationFuture op = databaseAdminClient.updateDatabaseDdl( instanceId, databaseId, Collections.singletonList(metadataCreateStmt), null); @@ -170,7 +212,12 @@ public void createPartitionMetadataTable() { * PartitionMetadataAdminDao#TIMEOUT_MINUTES} minutes. 
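To make the two DDL branches above easier to compare, here is a sketch of the rough shape of the statements they build, using a hypothetical table name, only the first column, and a one-day TTL; the literal FinishedAt column name is an assumption based on the constants referenced above:

    // Sketch only; the real statements list every metadata column.
    class MetadataDdlShapes {
      static final String GOOGLE_SQL_DDL =
          "CREATE TABLE Metadata_0001 (PartitionToken STRING(MAX) NOT NULL, ...)"
              + " PRIMARY KEY (PartitionToken),"
              + " ROW DELETION POLICY (OLDER_THAN(FinishedAt, INTERVAL 1 DAY))";
      static final String POSTGRES_DDL =
          "CREATE TABLE \"Metadata_0001\"(\"PartitionToken\" text NOT NULL, ...,"
              + " PRIMARY KEY (\"PartitionToken\"))"
              + " TTL INTERVAL '1 days' ON \"FinishedAt\"";
    }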
*/ public void deletePartitionMetadataTable() { - final String metadataDropStmt = "DROP TABLE " + tableName; + String metadataDropStmt; + if (this.isPostgres()) { + metadataDropStmt = "DROP TABLE \"" + tableName + "\""; + } else { + metadataDropStmt = "DROP TABLE " + tableName; + } OperationFuture op = databaseAdminClient.updateDatabaseDdl( instanceId, databaseId, Collections.singletonList(metadataDropStmt), null); @@ -190,4 +237,8 @@ public void deletePartitionMetadataTable() { throw SpannerExceptionFactory.propagateInterrupt(e); } } + + private boolean isPostgres() { + return this.dialect == Dialect.POSTGRESQL; + } } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataDao.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataDao.java index 0d044b09119d..f6f32c8023f0 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataDao.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/dao/PartitionMetadataDao.java @@ -31,6 +31,7 @@ import com.google.cloud.Timestamp; import com.google.cloud.spanner.DatabaseClient; +import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Mutation; import com.google.cloud.spanner.ResultSet; import com.google.cloud.spanner.Statement; @@ -53,6 +54,7 @@ public class PartitionMetadataDao { private final String metadataTableName; private final DatabaseClient databaseClient; + private final Dialect dialect; /** * Constructs a partition metadata dao object given the generated name of the tables. @@ -60,9 +62,10 @@ public class PartitionMetadataDao { * @param metadataTableName the name of the partition metadata table * @param databaseClient the {@link DatabaseClient} to perform queries */ - PartitionMetadataDao(String metadataTableName, DatabaseClient databaseClient) { + PartitionMetadataDao(String metadataTableName, DatabaseClient databaseClient, Dialect dialect) { this.metadataTableName = metadataTableName; this.databaseClient = databaseClient; + this.dialect = dialect; } /** @@ -94,19 +97,31 @@ public boolean tableExists() { * returns null. */ public @Nullable Struct getPartition(String partitionToken) { - try (ResultSet resultSet = - databaseClient - .singleUse() - .executeQuery( - Statement.newBuilder( - "SELECT * FROM " - + metadataTableName - + " WHERE " - + COLUMN_PARTITION_TOKEN - + " = @partition") - .bind("partition") - .to(partitionToken) - .build())) { + Statement statement; + if (this.isPostgres()) { + statement = + Statement.newBuilder( + "SELECT * FROM \"" + + metadataTableName + + "\" WHERE \"" + + COLUMN_PARTITION_TOKEN + + "\" = $1") + .bind("p1") + .to(partitionToken) + .build(); + } else { + statement = + Statement.newBuilder( + "SELECT * FROM " + + metadataTableName + + " WHERE " + + COLUMN_PARTITION_TOKEN + + " = @partition") + .bind("partition") + .to(partitionToken) + .build(); + } + try (ResultSet resultSet = databaseClient.singleUse().executeQuery(statement)) { if (resultSet.next()) { return resultSet.getCurrentRowAsStruct(); } @@ -121,21 +136,40 @@ public boolean tableExists() { * @return the earliest partition watermark which is not in a {@link State#FINISHED} state. 
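A note on the parameter style used in the PostgreSQL statements above: the Spanner Java client writes positional parameters as $1, $2, ... in the SQL text and binds them under the names "p1", "p2", ... on the statement builder. A minimal sketch, with a placeholder table and column name:

    import com.google.cloud.Timestamp;
    import com.google.cloud.spanner.Statement;

    class PgParameterBindingSketch {
      static Statement createdAfter(String metadataTableName, Timestamp timestamp) {
        // $1 in the SQL text is bound under the name "p1" on the Statement builder.
        return Statement.newBuilder(
                "SELECT * FROM \"" + metadataTableName + "\" WHERE \"CreatedAt\" > $1")
            .bind("p1")
            .to(timestamp)
            .build();
      }
    }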
*/ public @Nullable Timestamp getUnfinishedMinWatermark() { - final Statement statement = - Statement.newBuilder( - "SELECT " - + COLUMN_WATERMARK - + " FROM " - + metadataTableName - + " WHERE " - + COLUMN_STATE - + " != @state" - + " ORDER BY " - + COLUMN_WATERMARK - + " ASC LIMIT 1") - .bind("state") - .to(State.FINISHED.name()) - .build(); + Statement statement; + if (this.isPostgres()) { + statement = + Statement.newBuilder( + "SELECT \"" + + COLUMN_WATERMARK + + "\" FROM \"" + + metadataTableName + + "\" WHERE \"" + + COLUMN_STATE + + "\" != $1" + + " ORDER BY \"" + + COLUMN_WATERMARK + + "\" ASC LIMIT 1") + .bind("p1") + .to(State.FINISHED.name()) + .build(); + } else { + statement = + Statement.newBuilder( + "SELECT " + + COLUMN_WATERMARK + + " FROM " + + metadataTableName + + " WHERE " + + COLUMN_STATE + + " != @state" + + " ORDER BY " + + COLUMN_WATERMARK + + " ASC LIMIT 1") + .bind("state") + .to(State.FINISHED.name()) + .build(); + } try (ResultSet resultSet = databaseClient.singleUse().executeQuery(statement)) { if (resultSet.next()) { return resultSet.getTimestamp(COLUMN_WATERMARK); @@ -151,23 +185,42 @@ public boolean tableExists() { * PartitionMetadataAdminDao#COLUMN_START_TIMESTAMP} columns in ascending order. */ public ResultSet getAllPartitionsCreatedAfter(Timestamp timestamp) { - final Statement statement = - Statement.newBuilder( - "SELECT * FROM " - + metadataTableName - + " WHERE " - + COLUMN_CREATED_AT - + " > @timestamp" - + " ORDER BY " - + COLUMN_CREATED_AT - + " ASC" - + ", " - + COLUMN_START_TIMESTAMP - + " ASC") - .bind("timestamp") - .to(timestamp) - .build(); - + Statement statement; + if (this.isPostgres()) { + statement = + Statement.newBuilder( + "SELECT * FROM \"" + + metadataTableName + + "\" WHERE \"" + + COLUMN_CREATED_AT + + "\" > $1" + + " ORDER BY \"" + + COLUMN_CREATED_AT + + "\" ASC" + + ", \"" + + COLUMN_START_TIMESTAMP + + "\" ASC") + .bind("p1") + .to(timestamp) + .build(); + } else { + statement = + Statement.newBuilder( + "SELECT * FROM " + + metadataTableName + + " WHERE " + + COLUMN_CREATED_AT + + " > @timestamp" + + " ORDER BY " + + COLUMN_CREATED_AT + + " ASC" + + ", " + + COLUMN_START_TIMESTAMP + + " ASC") + .bind("timestamp") + .to(timestamp) + .build(); + } return databaseClient.singleUse().executeQuery(statement); } @@ -176,16 +229,30 @@ public ResultSet getAllPartitionsCreatedAfter(Timestamp timestamp) { * given timestamp. */ public long countPartitionsCreatedAfter(Timestamp timestamp) { - final Statement statement = - Statement.newBuilder( - "SELECT COUNT(*) as count FROM " - + metadataTableName - + " WHERE " - + COLUMN_CREATED_AT - + " > @timestamp") - .bind("timestamp") - .to(timestamp) - .build(); + Statement statement; + if (this.isPostgres()) { + statement = + Statement.newBuilder( + "SELECT COUNT(*) as count FROM \"" + + metadataTableName + + "\" WHERE \"" + + COLUMN_CREATED_AT + + "\" > $1") + .bind("p1") + .to(timestamp) + .build(); + } else { + statement = + Statement.newBuilder( + "SELECT COUNT(*) as count FROM " + + metadataTableName + + " WHERE " + + COLUMN_CREATED_AT + + " > @timestamp") + .bind("timestamp") + .to(timestamp) + .build(); + } try (ResultSet resultSet = databaseClient.singleUse().executeQuery(statement)) { if (resultSet.next()) { @@ -196,6 +263,10 @@ public long countPartitionsCreatedAfter(Timestamp timestamp) { } } + private boolean isPostgres() { + return this.dialect == Dialect.POSTGRESQL; + } + /** * Inserts the partition metadata. 
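The dialect is also threaded through the transactional path, as the runInTransaction hunk just below shows, so callers keep the same shape regardless of the backing database. A sketch, with partitionMetadataDao, partitionToken, and watermark as placeholders:

    // Runs a single watermark update inside a read-write transaction; updateWatermark
    // returns Void, so the transaction result carries no payload.
    TransactionResult<Void> result =
        partitionMetadataDao.runInTransaction(
            transaction -> transaction.updateWatermark(partitionToken, watermark));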
* @@ -270,7 +341,7 @@ public TransactionResult runInTransaction(Function { final InTransactionContext transactionContext = - new InTransactionContext(metadataTableName, transaction); + new InTransactionContext(metadataTableName, transaction, this.dialect); return callable.apply(transactionContext); }); return new TransactionResult<>(result, readWriteTransaction.getCommitTimestamp()); @@ -282,17 +353,21 @@ public static class InTransactionContext { private final String metadataTableName; private final TransactionContext transaction; private final Map stateToTimestampColumn; + private final Dialect dialect; /** * Constructs a context to execute a user defined function transactionally. * * @param metadataTableName the name of the partition metadata table * @param transaction the underlying client library transaction to be executed + * @param dialect the dialect of the database. */ - public InTransactionContext(String metadataTableName, TransactionContext transaction) { + public InTransactionContext( + String metadataTableName, TransactionContext transaction, Dialect dialect) { this.metadataTableName = metadataTableName; this.transaction = transaction; this.stateToTimestampColumn = new HashMap<>(); + this.dialect = dialect; stateToTimestampColumn.put(State.CREATED, COLUMN_CREATED_AT); stateToTimestampColumn.put(State.SCHEDULED, COLUMN_SCHEDULED_AT); stateToTimestampColumn.put(State.RUNNING, COLUMN_RUNNING_AT); @@ -365,17 +440,32 @@ public Void updateWatermark(String partitionToken, Timestamp watermark) { * returns null. */ public @Nullable Struct getPartition(String partitionToken) { - try (ResultSet resultSet = - transaction.executeQuery( - Statement.newBuilder( - "SELECT * FROM " - + metadataTableName - + " WHERE " - + COLUMN_PARTITION_TOKEN - + " = @partition") - .bind("partition") - .to(partitionToken) - .build())) { + Statement statement; + if (this.dialect == Dialect.POSTGRESQL) { + statement = + Statement.newBuilder( + "SELECT * FROM \"" + + metadataTableName + + "\" WHERE \"" + + COLUMN_PARTITION_TOKEN + + "\" = $1") + .bind("p1") + .to(partitionToken) + .build(); + + } else { + statement = + Statement.newBuilder( + "SELECT * FROM " + + metadataTableName + + " WHERE " + + COLUMN_PARTITION_TOKEN + + " = @partition") + .bind("partition") + .to(partitionToken) + .build(); + } + try (ResultSet resultSet = transaction.executeQuery(statement)) { if (resultSet.next()) { return resultSet.getCurrentRowAsStruct(); } diff --git a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/mapper/ChangeStreamRecordMapper.java b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/mapper/ChangeStreamRecordMapper.java index b697efc0e4cc..cfd6d91a65dc 100644 --- a/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/mapper/ChangeStreamRecordMapper.java +++ b/sdks/java/io/google-cloud-platform/src/main/java/org/apache/beam/sdk/io/gcp/spanner/changestreams/mapper/ChangeStreamRecordMapper.java @@ -18,12 +18,19 @@ package org.apache.beam.sdk.io.gcp.spanner.changestreams.mapper; import com.google.cloud.Timestamp; +import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Struct; -import com.google.cloud.spanner.Type; +import com.google.protobuf.InvalidProtocolBufferException; +import com.google.protobuf.Value; +import com.google.protobuf.util.JsonFormat; +import java.util.Collections; import java.util.HashSet; import java.util.List; +import java.util.Map; +import 
java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.beam.sdk.io.gcp.spanner.changestreams.dao.ChangeStreamResultSet; import org.apache.beam.sdk.io.gcp.spanner.changestreams.dao.ChangeStreamResultSetMetadata; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChangeStreamRecord; import org.apache.beam.sdk.io.gcp.spanner.changestreams.model.ChangeStreamRecordMetadata; @@ -83,13 +90,24 @@ public class ChangeStreamRecordMapper { private static final String CHILD_PARTITIONS_COLUMN = "child_partitions"; private static final String PARENT_PARTITION_TOKENS_COLUMN = "parent_partition_tokens"; private static final String TOKEN_COLUMN = "token"; + private final Dialect dialect; + private final JsonFormat.Printer printer; + private final JsonFormat.Parser parser; - ChangeStreamRecordMapper() {} + ChangeStreamRecordMapper(Dialect dialect) { + this.dialect = dialect; + + this.printer = + JsonFormat.printer().preservingProtoFieldNames().omittingInsignificantWhitespace(); + this.parser = JsonFormat.parser().ignoringUnknownFields(); + } /** - * Transforms a {@link Struct} representing a change stream result into a {@link List} of {@link - * ChangeStreamRecord} model. The type of the change stream record will be identified and one of - * the following subclasses can be returned within the resulting {@link List}: + * In GoogleSQL, change stream records are returned as an array of {@link Struct}. In PostgreSQL, + * change stream records are returned as {@link Jsonb}. Transforms a {@link Struct / Jsonb} + * representing a change stream result into a {@link List} of {@link ChangeStreamRecord} model. + * The type of the change stream record will be identified and one of the following subclasses can + * be returned within the resulting {@link List}: * *

   *
   *   • {@link DataChangeRecord}
@@ -97,16 +115,16 @@ public class ChangeStreamRecordMapper {
   *
   *   • {@link ChildPartitionsRecord}
   *
   *
-  * Additionally to the {@link Struct} received, the originating partition of the records (given by
-  * the {@link PartitionMetadata} parameter) and the stream metadata (given by the {@link
+  * Additionally to the {@link Struct / Jsonb} received, the originating partition of the records
+  * (given by the {@link PartitionMetadata} parameter) and the stream metadata (given by the {@link
   * ChangeStreamResultSetMetadata}) are used to populate the {@link ChangeStreamRecordMetadata} for
   * each record mapped.
   *
-  * The {@link Struct} is expected to have the following fields:
+  * The {@link Struct / Jsonb} is expected to have the following fields:
   *