From 30b26171b4038c1c976adbc3be386e118b35153b Mon Sep 17 00:00:00 2001 From: Danny McCormick Date: Fri, 4 Nov 2022 08:22:17 -0400 Subject: [PATCH] Update watermark during periodic sequence/impulse (#23507) * Update watermark during periodic sequence/impulse * Remove extraneous import * Formatting * Linting * Only run on dataflow for guaranteed watermark support * More permissive test to avoid timing issues * Test pipeline options * Fix test * Formatting * Formatting * Apply feedback - cleanup/naming/flink * Format * Unused import --- .../transforms/periodicsequence.py | 9 +- .../transforms/periodicsequence_it_test.py | 89 +++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 sdks/python/apache_beam/transforms/periodicsequence_it_test.py diff --git a/sdks/python/apache_beam/transforms/periodicsequence.py b/sdks/python/apache_beam/transforms/periodicsequence.py index 56075a5acc71..0417224c6479 100644 --- a/sdks/python/apache_beam/transforms/periodicsequence.py +++ b/sdks/python/apache_beam/transforms/periodicsequence.py @@ -21,6 +21,7 @@ import apache_beam as beam from apache_beam.io.restriction_trackers import OffsetRange from apache_beam.io.restriction_trackers import OffsetRestrictionTracker +from apache_beam.io.watermark_estimators import ManualWatermarkEstimator from apache_beam.runners import sdf_utils from apache_beam.transforms import core from apache_beam.transforms import window @@ -75,7 +76,9 @@ def process( self, element, restriction_tracker=beam.DoFn.RestrictionParam( - ImpulseSeqGenRestrictionProvider())): + ImpulseSeqGenRestrictionProvider()), + watermark_estimator=beam.DoFn.WatermarkEstimatorParam( + ManualWatermarkEstimator.default_provider())): ''' :param element: (start_timestamp, end_timestamp, interval) :param restriction_tracker: @@ -92,6 +95,8 @@ def process( current_output_index = restriction_tracker.current_restriction().start current_output_timestamp = start + interval * current_output_index current_time = time.time() + watermark_estimator.set_watermark( + timestamp.Timestamp(current_output_timestamp)) while current_output_timestamp <= current_time: if restriction_tracker.try_claim(current_output_index): @@ -99,6 +104,8 @@ def process( current_output_index += 1 current_output_timestamp = start + interval * current_output_index current_time = time.time() + watermark_estimator.set_watermark( + timestamp.Timestamp(current_output_timestamp)) else: return diff --git a/sdks/python/apache_beam/transforms/periodicsequence_it_test.py b/sdks/python/apache_beam/transforms/periodicsequence_it_test.py new file mode 100644 index 000000000000..e900ba4cd855 --- /dev/null +++ b/sdks/python/apache_beam/transforms/periodicsequence_it_test.py @@ -0,0 +1,89 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Integration tests for cross-language transform expansion.""" + +# pytype: skip-file + +import time +import unittest + +import pytest + +import apache_beam as beam +from apache_beam.options.pipeline_options import StandardOptions +from apache_beam.testing.test_pipeline import TestPipeline +from apache_beam.testing.util import assert_that +from apache_beam.testing.util import is_empty +from apache_beam.transforms import trigger +from apache_beam.transforms import window +from apache_beam.transforms.core import DoFn +from apache_beam.transforms.periodicsequence import PeriodicSequence + + +@unittest.skipIf( + not TestPipeline().get_pipeline_options().view_as( + StandardOptions).streaming, + "Watermark tests are only valid for streaming jobs.") +class PeriodicSequenceIT(unittest.TestCase): + def setUp(self): + self.test_pipeline = TestPipeline(is_integration_test=True) + + @pytest.mark.it_postcommit + @pytest.mark.sickbay_direct + @pytest.mark.sickbay_spark + @pytest.mark.timeout( + 1800) # Timeout after 30 minutes to give Dataflow some extra time + def test_periodicsequence_outputs_valid_watermarks_it(self): + """Tests periodic sequence with watermarks on dataflow. + For testing that watermarks are being correctly emitted, + we make sure that there's not a long gap between an element being + emitted and being correctly aggregated. + """ + class FindLongGaps(DoFn): + def process(self, element): + emitted_at, unused_count = element + processed_at = time.time() + if processed_at - emitted_at > 25: + yield ( + 'Elements emitted took too long to process.', + emitted_at, + processed_at) + + start_time = time.time() + # Run long enough for Dataflow to start up + duration_sec = 540 + end_time = start_time + duration_sec + interval = 1 + + res = ( + self.test_pipeline + | 'ImpulseElement' >> beam.Create([(start_time, end_time, interval)]) + | 'ImpulseSeqGen' >> PeriodicSequence() + | 'MapToCurrentTime' >> beam.Map(lambda element: time.time()) + | 'window_into' >> beam.WindowInto( + window.FixedWindows(2), + accumulation_mode=trigger.AccumulationMode.DISCARDING) + | beam.combiners.Count.PerElement() + | beam.ParDo(FindLongGaps())) + assert_that(res, is_empty()) + + self.test_pipeline.run().wait_until_finish() + + +if __name__ == '__main__': + unittest.main()