From 658e58f8822926cdc53cd0af89a84d53257feb22 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Mon, 22 Jul 2024 16:28:51 -0400 Subject: [PATCH] add benchmark to track performance (#6101) --- parquet/benches/arrow_reader.rs | 37 +++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index 927998ac2489..814e75c249bf 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -263,9 +263,10 @@ where InMemoryPageIterator::new(pages) } -fn build_plain_encoded_byte_array_page_iterator( +fn build_plain_encoded_byte_array_page_iterator_inner( column_desc: ColumnDescPtr, null_density: f32, + short_string: bool, ) -> impl PageIterator + Clone { let max_def_level = column_desc.max_def_level(); let max_rep_level = column_desc.max_rep_level(); @@ -285,7 +286,11 @@ fn build_plain_encoded_byte_array_page_iterator( max_def_level }; if def_level == max_def_level { - let string_value = format!("Test value {k}, row group: {i}, page: {j}"); + let string_value = if short_string { + format!("{k}{i}{j}") + } else { + format!("Test value {k}, row group: {i}, page: {j}") + }; values.push(parquet::data_type::ByteArray::from(string_value.as_str())); } def_levels.push(def_level); @@ -303,6 +308,13 @@ fn build_plain_encoded_byte_array_page_iterator( InMemoryPageIterator::new(pages) } +fn build_plain_encoded_byte_array_page_iterator( + column_desc: ColumnDescPtr, + null_density: f32, +) -> impl PageIterator + Clone { + build_plain_encoded_byte_array_page_iterator_inner(column_desc, null_density, false) +} + fn build_dictionary_encoded_string_page_iterator( column_desc: ColumnDescPtr, null_density: f32, @@ -1066,6 +1078,27 @@ fn add_benches(c: &mut Criterion) { let mut group = c.benchmark_group("arrow_array_reader/BinaryViewArray"); + // binary view, plain encoded, no NULLs, short string + let plain_byte_array_no_null_data = 
build_plain_encoded_byte_array_page_iterator_inner( + mandatory_binary_column_desc.clone(), + 0.0, + true, + ); + + // Short strings should not be slower than long strings; however, as discussed in https://github.com/apache/arrow-rs/issues/6034, + // the current implementation is more than 2x slower. + // This benchmark tracks the performance of short strings so that we can optimize it. + group.bench_function("plain encoded, mandatory, no NULLs, short string", |b| { + b.iter(|| { + let array_reader = create_byte_view_array_reader( + plain_byte_array_no_null_data.clone(), + mandatory_binary_column_desc.clone(), + ); + count = bench_array_reader(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary view, plain encoded, no NULLs let plain_byte_array_no_null_data = build_plain_encoded_byte_array_page_iterator(mandatory_binary_column_desc.clone(), 0.0);