Skip to content

Commit

Permalink
Update watermark during periodic sequence/impulse (#23507)
Browse files Browse the repository at this point in the history
* Update watermark during periodic sequence/impulse

* Remove extraneous import

* Formatting

* Linting

* Only run on dataflow for guaranteed watermark support

* More permissive test to avoid timing issues

* Test pipeline options

* Fix test

* Formatting

* Formatting

* Apply feedback - cleanup/naming/flink

* Format

* Unused import
  • Loading branch information
damccorm authored Nov 4, 2022
1 parent 1563b97 commit 30b2617
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 1 deletion.
9 changes: 8 additions & 1 deletion sdks/python/apache_beam/transforms/periodicsequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import apache_beam as beam
from apache_beam.io.restriction_trackers import OffsetRange
from apache_beam.io.restriction_trackers import OffsetRestrictionTracker
from apache_beam.io.watermark_estimators import ManualWatermarkEstimator
from apache_beam.runners import sdf_utils
from apache_beam.transforms import core
from apache_beam.transforms import window
Expand Down Expand Up @@ -75,7 +76,9 @@ def process(
self,
element,
restriction_tracker=beam.DoFn.RestrictionParam(
ImpulseSeqGenRestrictionProvider())):
ImpulseSeqGenRestrictionProvider()),
watermark_estimator=beam.DoFn.WatermarkEstimatorParam(
ManualWatermarkEstimator.default_provider())):
'''
:param element: (start_timestamp, end_timestamp, interval)
:param restriction_tracker:
Expand All @@ -92,13 +95,17 @@ def process(
current_output_index = restriction_tracker.current_restriction().start
current_output_timestamp = start + interval * current_output_index
current_time = time.time()
watermark_estimator.set_watermark(
timestamp.Timestamp(current_output_timestamp))

while current_output_timestamp <= current_time:
if restriction_tracker.try_claim(current_output_index):
yield current_output_timestamp
current_output_index += 1
current_output_timestamp = start + interval * current_output_index
current_time = time.time()
watermark_estimator.set_watermark(
timestamp.Timestamp(current_output_timestamp))
else:
return

Expand Down
89 changes: 89 additions & 0 deletions sdks/python/apache_beam/transforms/periodicsequence_it_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Integration tests for cross-language transform expansion."""

# pytype: skip-file

import time
import unittest

import pytest

import apache_beam as beam
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import is_empty
from apache_beam.transforms import trigger
from apache_beam.transforms import window
from apache_beam.transforms.core import DoFn
from apache_beam.transforms.periodicsequence import PeriodicSequence


@unittest.skipIf(
not TestPipeline().get_pipeline_options().view_as(
StandardOptions).streaming,
"Watermark tests are only valid for streaming jobs.")
class PeriodicSequenceIT(unittest.TestCase):
def setUp(self):
self.test_pipeline = TestPipeline(is_integration_test=True)

@pytest.mark.it_postcommit
@pytest.mark.sickbay_direct
@pytest.mark.sickbay_spark
@pytest.mark.timeout(
1800) # Timeout after 30 minutes to give Dataflow some extra time
def test_periodicsequence_outputs_valid_watermarks_it(self):
"""Tests periodic sequence with watermarks on dataflow.
For testing that watermarks are being correctly emitted,
we make sure that there's not a long gap between an element being
emitted and being correctly aggregated.
"""
class FindLongGaps(DoFn):
def process(self, element):
emitted_at, unused_count = element
processed_at = time.time()
if processed_at - emitted_at > 25:
yield (
'Elements emitted took too long to process.',
emitted_at,
processed_at)

start_time = time.time()
# Run long enough for Dataflow to start up
duration_sec = 540
end_time = start_time + duration_sec
interval = 1

res = (
self.test_pipeline
| 'ImpulseElement' >> beam.Create([(start_time, end_time, interval)])
| 'ImpulseSeqGen' >> PeriodicSequence()
| 'MapToCurrentTime' >> beam.Map(lambda element: time.time())
| 'window_into' >> beam.WindowInto(
window.FixedWindows(2),
accumulation_mode=trigger.AccumulationMode.DISCARDING)
| beam.combiners.Count.PerElement()
| beam.ParDo(FindLongGaps()))
assert_that(res, is_empty())

self.test_pipeline.run().wait_until_finish()


if __name__ == '__main__':
unittest.main()

0 comments on commit 30b2617

Please sign in to comment.