ProcessFunction 提供了 "定时服务(TimerService)",可以访问到流中的 Event、Timestamp、Watermark, 还可以注册 "定时事件", 因为 ProcessFunction 继承了 AbstractRichFunction 抽象类, 所以 ProcessFunction 拥有富函数的所有特性, 可以访问 state、将数据输出到 side output 中。
- ProcessFunction: 最基本的处理函数
- KeyedProcessFunction:Keyby 之后的处理函数
- ProcessWindowFunction:开窗之后的处理函数
- ProcessAllWindowFunction:开窗之后的处理函数,只不过是全窗口 AllWindow.
- CoProcessFunction:合并(connect) 两条六之后的处理函数
- ProcessJoinFunction:interval join 两条流之后的处理函数
- BroadcastProcessFunction:广播连接流处理函数
- KeyedBroadcastProcessFunction:按键分区的广播连接流处理函数
最基本的处理函数
import com.atguigu.chapter05.ClickSource;
import com.atguigu.chapter05.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
public class ProcessFunctionTest {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env
.addSource(new ClickSource())
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event event, long l) {
return event.timestamp;
}
})
)
.process(new ProcessFunction<Event, String>() {
@Override
public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
if (value.user.equals("Mary")) {
out.collect(value.user);
} else if (value.user.equals("Bob")) {
out.collect(value.user);
out.collect(value.user);
}
System.out.println(ctx.timerService().currentWatermark());
}
})
.print();
env.execute();
}
}
继承 ProcessFunction 有两个泛型参数:
- I: Input,输入的数据类型
- O: Output,输出的数据类型
内部定义了两个方法
- onTimer: 不是必须实现的抽象方法,定义定时触发的逻辑,processElement 中定义定时,到时触发这里的逻辑。
- processElement: 必须实现的抽象方法,每条数据进来都会调用一次, 又有三个入参: 输入、上下文ctx、Collector。
- 输入: 每一个输入的数据。
- 上下文ctx: 可以获取当前的时间戳,并且提供了用于查询时间和注册定时器的"定时服务(TimeService)",以及可以将数据发送到测输出流中
- Collector: 返回输出数据
KeyBy 之后的处理函数
import com.kino.process.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
public class EventTimeTimerTest {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> stream = env.addSource(new CustomSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
}));
// 基于KeyedStream定义事件时间定时器
stream.keyBy(data -> true)
.process(new KeyedProcessFunction<Boolean, Event, String>() {
@Override
public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
out.collect("数据到达,时间戳为:" + ctx.timestamp());
out.collect("数据到达,水位线为:" + ctx.timerService().currentWatermark() + "\n -------分割线-------");
// 注册一个10秒后的定时器
ctx.timerService().registerEventTimeTimer(ctx.timestamp() + 10 * 1000L);
// timerService 有以下 6 个方法:
// 1. 获取当前的处理时间
// long currentProcessingTime();
// 获取当前的水位线(事件时间)
// long currentWatermark();
// 注册处理时间定时器,当处理时间超过 time 时触发
// void registerProcessingTimeTimer(long time);
// 注册事件时间定时器,当水位线超过 time 时触发
// void registerEventTimeTimer(long time);
// 删除触发时间为 time 的处理时间定时器
// void deleteProcessingTimeTimer(long time);
// 删除触发时间为 time 的处理时间定时器
// void deleteEventTimeTimer(long time);
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
out.collect("定时器触发,触发时间:" + timestamp);
}
})
.print();
env.execute();
}
// 自定义测试数据源
public static class CustomSource implements SourceFunction<Event> {
@Override
public void run(SourceContext<Event> ctx) throws Exception {
// 直接发出测试数据
ctx.collect(new Event("Mary", "./home", 1000L));
// 为了更加明显,中间停顿5秒钟
Thread.sleep(5000L);
// 发出10秒后的数据
ctx.collect(new Event("Mary", "./home", 11000L));
Thread.sleep(5000L);
// 发出10秒+1ms后的数据
ctx.collect(new Event("Alice", "./cart", 11001L));
Thread.sleep(5000L);
}
@Override
public void cancel() { }
}
}
需要说明的是,所有 ProcessFunction 都可以直接访问 TimeService,但是之后基于 KeyedStream 的 ProcessFunction 才能调用注册和删除定时器的方法;且未做 KeyBy 的 DataStream 不支持定时器操作, 只能获取当前时间。
开窗之后的处理函数,只不过是全窗口 AllWindow.
import com.kino.process.ClickSource;
import com.kino.process.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
public class ProcessAllWindowTopN {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(
WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
})
);
// 只需要url就可以统计数量,所以转换成String直接开窗统计
SingleOutputStreamOperator<String> result = eventStream
.map(new MapFunction<Event, String>() {
@Override
public String map(Event value) throws Exception {
return value.url;
}
})
.windowAll(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5))) // 开滑动窗口
.process(new ProcessAllWindowFunction<String, String, TimeWindow>() {
@Override
public void process(Context context, Iterable<String> elements, Collector<String> out) throws Exception {
HashMap<String, Long> urlCountMap = new HashMap<>();
// 遍历窗口中数据,将浏览量保存到一个 HashMap 中
for (String url : elements) {
if (urlCountMap.containsKey(url)) {
long count = urlCountMap.get(url);
urlCountMap.put(url, count + 1L);
} else {
urlCountMap.put(url, 1L);
}
}
ArrayList<Tuple2<String, Long>> mapList = new ArrayList<Tuple2<String, Long>>();
// 将浏览量数据放入ArrayList,进行排序
for (String key : urlCountMap.keySet()) {
mapList.add(Tuple2.of(key, urlCountMap.get(key)));
}
mapList.sort(new Comparator<Tuple2<String, Long>>() {
@Override
public int compare(Tuple2<String, Long> o1, Tuple2<String, Long> o2) {
return o2.f1.intValue() - o1.f1.intValue();
}
});
// 取排序后的前两名,构建输出结果
StringBuilder result = new StringBuilder();
result.append("========================================\n");
for (int i = 0; i < 2; i++) {
Tuple2<String, Long> temp = mapList.get(i);
String info = "浏览量No." + (i + 1) +
" url:" + temp.f0 +
" 浏览量:" + temp.f1 +
" 窗口结束时间:" + new Timestamp(context.window().getEnd()) + "\n";
result.append(info);
}
result.append("========================================\n");
out.collect(result.toString());
}
});
result.print();
env.execute();
}
}
在上面 ProcessWindowFunction 的使用中,并不是来一条数据出预先处理一条,而是等到窗口关闭时才会做计算,这里使用 KeyedProcessFunction 来一条数据处理一条,等窗口关闭时, 再做关闭时的 ProcessWindowFunction 处理
import com.kino.process.ClickSource;
import com.kino.process.Event;
import com.kino.process.UrlViewCount;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
public class KeyedProcessTopN {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// 从自定义数据源读取数据
SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
.assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
.withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
@Override
public long extractTimestamp(Event element, long recordTimestamp) {
return element.timestamp;
}
}));
// 需要按照url分组,求出每个url的访问量
SingleOutputStreamOperator<UrlViewCount> urlCountStream =
eventStream.keyBy(data -> data.url)
.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
.aggregate(new UrlViewCountAgg(),
new UrlViewCountResult());
// 对结果中同一个窗口的统计数据,进行排序处理
SingleOutputStreamOperator<String> result = urlCountStream.keyBy(data -> data.windowEnd)
.process(new TopN(2));
result.print("result");
env.execute();
}
// 自定义增量聚合
public static class UrlViewCountAgg implements AggregateFunction<Event, Long, Long> {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(Event value, Long accumulator) {
return accumulator + 1;
}
@Override
public Long getResult(Long accumulator) {
return accumulator;
}
@Override
public Long merge(Long a, Long b) {
return null;
}
}
// 自定义全窗口函数,只需要包装窗口信息
public static class UrlViewCountResult extends ProcessWindowFunction<Long, UrlViewCount, String, TimeWindow> {
@Override
public void process(String url, Context context, Iterable<Long> elements, Collector<UrlViewCount> out) throws Exception {
// 结合窗口信息,包装输出内容
Long start = context.window().getStart();
Long end = context.window().getEnd();
out.collect(new UrlViewCount(url, elements.iterator().next(), start, end));
}
}
// 自定义处理函数,排序取top n
public static class TopN extends KeyedProcessFunction<Long, UrlViewCount, String>{
// 将n作为属性
private Integer n;
// 定义一个列表状态
private ListState<UrlViewCount> urlViewCountListState;
public TopN(Integer n) {
this.n = n;
}
@Override
public void open(Configuration parameters) throws Exception {
// 从环境中获取列表状态句柄
urlViewCountListState = getRuntimeContext().getListState(
new ListStateDescriptor<UrlViewCount>("url-view-count-list",
Types.POJO(UrlViewCount.class)));
}
@Override
public void processElement(UrlViewCount value, Context ctx, Collector<String> out) throws Exception {
// 将count数据添加到列表状态中,保存起来
urlViewCountListState.add(value);
// 注册 window end + 1ms后的定时器,等待所有数据到齐开始排序
ctx.timerService().registerEventTimeTimer(ctx.getCurrentKey() + 1);
// 还可以把数据输出到测输出流中
ctx.output(new OutputTag<String>("aaa"){}, "side-output: " + String.valueOf(value));
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
// 将数据从列表状态变量中取出,放入ArrayList,方便排序
ArrayList<UrlViewCount> urlViewCountArrayList = new ArrayList<>();
for (UrlViewCount urlViewCount : urlViewCountListState.get()) {
urlViewCountArrayList.add(urlViewCount);
}
// 清空状态,释放资源
urlViewCountListState.clear();
// 排序
urlViewCountArrayList.sort(new Comparator<UrlViewCount>() {
@Override
public int compare(UrlViewCount o1, UrlViewCount o2) {
return o2.count.intValue() - o1.count.intValue();
}
});
// 取前两名,构建输出结果
StringBuilder result = new StringBuilder();
result.append("========================================\n");
result.append("窗口结束时间:" + new Timestamp(timestamp - 1) + "\n");
for (int i = 0; i < this.n; i++) {
UrlViewCount UrlViewCount = urlViewCountArrayList.get(i);
String info = "No." + (i + 1) + " "
+ "url:" + UrlViewCount.url + " "
+ "浏览量:" + UrlViewCount.count + "\n";
result.append(info);
}
result.append("========================================\n");
out.collect(result.toString());
}
}
}