diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 688563baecfa..2b3f80fc930b 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -151,3 +151,8 @@ required-features = ["string_expressions"] harness = false name = "pad" required-features = ["unicode_expressions"] + +[[bench]] +harness = false +name = "repeat" +required-features = ["string_expressions"] diff --git a/datafusion/functions/benches/repeat.rs b/datafusion/functions/benches/repeat.rs new file mode 100644 index 000000000000..916c8374e5fb --- /dev/null +++ b/datafusion/functions/benches/repeat.rs @@ -0,0 +1,136 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::string; +use std::sync::Arc; +use std::time::Duration; + +fn create_args( + size: usize, + str_len: usize, + repeat_times: i64, + use_string_view: bool, +) -> Vec { + let number_array = Arc::new(Int64Array::from( + (0..size).map(|_| repeat_times).collect::>(), + )); + + if use_string_view { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(number_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&number_array) as ArrayRef), + ] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let repeat = string::repeat(); + for size in [1024, 4096] { + // REPEAT 3 TIMES + let repeat_times = 3; + let mut group = c.benchmark_group(format!("repeat {} times", repeat_times)); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + let args = create_args::(size, 32, repeat_times, true); + group.bench_function( + &format!( + "repeat_string_view [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + let args = create_args::(size, 32, repeat_times, false); + group.bench_function( + &format!( + "repeat_string [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + let args = create_args::(size, 32, repeat_times, false); + group.bench_function( + &format!( + "repeat_large_string [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + group.finish(); + + // REPEAT 30 TIMES + let repeat_times = 30; + let mut group = c.benchmark_group(format!("repeat {} times", repeat_times)); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + let args = create_args::(size, 32, repeat_times, true); + group.bench_function( + &format!( + "repeat_string_view [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + let args = create_args::(size, 32, repeat_times, false); + group.bench_function( + &format!( + "repeat_string [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + let args = create_args::(size, 32, repeat_times, false); + group.bench_function( + &format!( + "repeat_large_string [size={}, repeat_times={}]", + size, repeat_times + ), + |b| b.iter(|| black_box(repeat.invoke(&args))), + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 7037c1d1c3c3..54aebb039046 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -19,8 +19,9 @@ use std::fmt::{Display, Formatter}; use std::sync::Arc; use arrow::array::{ - new_null_array, Array, ArrayDataBuilder, ArrayRef, GenericStringArray, - GenericStringBuilder, OffsetSizeTrait, StringArray, + new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef, + GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray, + StringViewArray, }; use arrow::buffer::{Buffer, MutableBuffer, NullBuffer}; use arrow::datatypes::DataType; @@ -251,6 +252,22 @@ impl<'a> ColumnarValueRef<'a> { } } +pub trait StringArrayType<'a>: ArrayAccessor + Sized { + fn iter(&self) -> ArrayIter; +} + +impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray { + fn iter(&self) -> ArrayIter { + GenericStringArray::::iter(self) + } +} + +impl<'a> StringArrayType<'a> for &'a StringViewArray { + fn iter(&self) -> ArrayIter { + StringViewArray::iter(self) + } +} + /// Optimized version of the StringBuilder in Arrow that: /// 1. Precalculating the expected length of the result, avoiding reallocations. /// 2. Avoids creating / incrementally creating a `NullBufferBuilder` diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index a377dee06f41..20e4462784b8 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -18,17 +18,20 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; +use arrow::array::{ + ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, + OffsetSizeTrait, StringViewArray, +}; use arrow::datatypes::DataType; +use arrow::datatypes::DataType::{Int64, LargeUtf8, Utf8, Utf8View}; -use datafusion_common::cast::{ - as_generic_string_array, as_int64_array, as_string_view_array, -}; +use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; +use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] @@ -44,7 +47,6 @@ impl Default for RepeatFunc { impl RepeatFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ @@ -79,51 +81,53 @@ impl ScalarUDFImpl for RepeatFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result { - match args[0].data_type() { - DataType::Utf8View => make_scalar_function(repeat_utf8view, vec![])(args), - DataType::Utf8 => make_scalar_function(repeat::, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(repeat::, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function repeat. Expected Utf8, Utf8View or LargeUtf8"), - } + make_scalar_function(repeat, vec![])(args) } } /// Repeats string the specified number of times. /// repeat('Pg', 4) = 'PgPgPgPg' -fn repeat(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; +fn repeat(args: &[ArrayRef]) -> Result { let number_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(number_array.iter()) - .map(|(string, number)| repeat_common(string, number)) - .collect::>(); - - Ok(Arc::new(result) as ArrayRef) + match args[0].data_type() { + Utf8View => { + let string_view_array = args[0].as_string_view(); + repeat_impl::(string_view_array, number_array) + } + Utf8 => { + let string_array = args[0].as_string::(); + repeat_impl::>(string_array, number_array) + } + LargeUtf8 => { + let string_array = args[0].as_string::(); + repeat_impl::>(string_array, number_array) + } + other => exec_err!( + "Unsupported data type {other:?} for function repeat. \ + Expected Utf8, Utf8View or LargeUtf8." + ), + } } -fn repeat_utf8view(args: &[ArrayRef]) -> Result { - let string_view_array = as_string_view_array(&args[0])?; - let number_array = as_int64_array(&args[1])?; - - let result = string_view_array +fn repeat_impl<'a, T, S>(string_array: S, number_array: &Int64Array) -> Result +where + T: OffsetSizeTrait, + S: StringArrayType<'a>, +{ + let mut builder: GenericStringBuilder = GenericStringBuilder::new(); + string_array .iter() .zip(number_array.iter()) - .map(|(string, number)| repeat_common(string, number)) - .collect::(); - - Ok(Arc::new(result) as ArrayRef) -} - -fn repeat_common(string: Option<&str>, number: Option) -> Option { - match (string, number) { - (Some(string), Some(number)) if number >= 0 => { - Some(string.repeat(number as usize)) - } - (Some(_), Some(_)) => Some("".to_string()), - _ => None, - } + .for_each(|(string, number)| match (string, number) { + (Some(string), Some(number)) if number >= 0 => { + builder.append_value(string.repeat(number as usize)) + } + (Some(_), Some(_)) => builder.append_value(""), + _ => builder.append_null(), + }); + let array = builder.finish(); + + Ok(Arc::new(array) as ArrayRef) } #[cfg(test)] diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 521cdc5d0ff0..e102673c4253 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -20,8 +20,8 @@ use std::fmt::Write; use std::sync::Arc; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, - GenericStringBuilder, Int64Array, OffsetSizeTrait, StringViewArray, + Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, + OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; use unicode_segmentation::UnicodeSegmentation; @@ -32,6 +32,7 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] @@ -248,20 +249,6 @@ where Ok(Arc::new(array) as ArrayRef) } -trait StringArrayType<'a>: ArrayAccessor + Sized { - fn iter(&self) -> ArrayIter; -} -impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray { - fn iter(&self) -> ArrayIter { - GenericStringArray::::iter(self) - } -} -impl<'a> StringArrayType<'a> for &'a StringViewArray { - fn iter(&self) -> ArrayIter { - StringViewArray::iter(self) - } -} - #[cfg(test)] mod tests { use crate::unicode::lpad::LPadFunc;