From 97cfe2dfb1ebfe2321a84a61a3548521a672e730 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 19:49:42 -0500 Subject: [PATCH 1/8] Fix: Internal error in regexp_replace() for some StringView input --- .../functions/src/regex/regexpreplace.rs | 113 ++++++++++++++---- .../sqllogictest/test_files/string_view.slt | 46 +++++++ 2 files changed, 135 insertions(+), 24 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index d28c6cd36d65..f033b7a51948 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -16,15 +16,15 @@ // under the License. //! Regx expressions -use arrow::array::new_null_array; use arrow::array::ArrayAccessor; use arrow::array::ArrayDataBuilder; use arrow::array::BufferBuilder; use arrow::array::GenericStringArray; use arrow::array::StringViewBuilder; +use arrow::array::{new_null_array, ArrayIter, AsArray}; use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; -use datafusion_common::cast::as_string_view_array; +use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::exec_err; use datafusion_common::plan_err; use datafusion_common::ScalarValue; @@ -187,27 +187,34 @@ fn regex_replace_posix_groups(replacement: &str) -> String { /// # Ok(()) /// # } /// ``` -pub fn regexp_replace(args: &[ArrayRef]) -> Result { +pub fn regexp_replace<'a, T: OffsetSizeTrait, V, B>( + string_array: V, + pattern_array: B, + replacement_array: B, + flags: Option<&ArrayRef>, +) -> Result +where + V: ArrayAccessor, + B: ArrayAccessor, +{ // Default implementation for regexp_replace, assumes all args are arrays // and args is a sequence of 3 or 4 elements. // creating Regex is expensive so create hashmap for memoization let mut patterns: HashMap = HashMap::new(); - match args.len() { - 3 => { - let string_array = as_generic_string_array::(&args[0])?; - let pattern_array = as_generic_string_array::(&args[1])?; - let replacement_array = as_generic_string_array::(&args[2])?; + let string_array_iter = ArrayIter::new(string_array); + let pattern_array_iter = ArrayIter::new(pattern_array); + let replacement_array_iter = ArrayIter::new(replacement_array); - let result = string_array - .iter() - .zip(pattern_array.iter()) - .zip(replacement_array.iter()) + match flags { + None => { + let result = string_array_iter + .zip(pattern_array_iter) + .zip(replacement_array_iter) .map(|((string, pattern), replacement)| match (string, pattern, replacement) { (Some(string), Some(pattern), Some(replacement)) => { let replacement = regex_replace_posix_groups(replacement); - // if patterns hashmap already has regexp then use else create and return let re = match patterns.get(pattern) { Some(re) => Ok(re), @@ -230,16 +237,12 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(Arc::new(result) as ArrayRef) } - 4 => { - let string_array = as_generic_string_array::(&args[0])?; - let pattern_array = as_generic_string_array::(&args[1])?; - let replacement_array = as_generic_string_array::(&args[2])?; - let flags_array = as_generic_string_array::(&args[3])?; + Some(flags) => { + let flags_array = as_generic_string_array::(flags)?; - let result = string_array - .iter() - .zip(pattern_array.iter()) - .zip(replacement_array.iter()) + let result = string_array_iter + .zip(pattern_array_iter) + .zip(replacement_array_iter) .zip(flags_array.iter()) .map(|(((string, pattern), replacement), flags)| match (string, pattern, replacement, flags) { (Some(string), Some(pattern), Some(replacement), Some(flags)) => { @@ -283,7 +286,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(Arc::new(result) as ArrayRef) } other => exec_err!( - "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4." + "regexp_replace was called with {other:?} arguments. It requires at least 3 and at most 4." ), } } @@ -496,7 +499,69 @@ pub fn specialize_regexp_replace( .iter() .map(|arg| arg.clone().into_array(inferred_length)) .collect::>>()?; - regexp_replace::(&args) + + match args[0].data_type() { + DataType::Utf8View => { + let string_array = args[0].as_string_view(); + let pattern_array = args[1].as_string::(); + let replacement_array = args[2].as_string::(); + let regexp_replace_result = regexp_replace::( + string_array, + pattern_array, + replacement_array, + args.get(3), + )?; + + if regexp_replace_result.data_type() == &DataType::Utf8 { + let string_view_array = + as_string_array(®exp_replace_result)?.to_owned(); + + let mut builder = + StringViewBuilder::with_capacity(string_view_array.len()) + .with_block_size(1024 * 1024 * 2); + + for val in string_view_array.iter() { + if let Some(val) = val { + builder.append_value(val); + } else { + builder.append_null(); + } + } + + let result = builder.finish(); + Ok(Arc::new(result) as ArrayRef) + } else { + Ok(regexp_replace_result) + } + } + DataType::Utf8 => { + let string_array = args[0].as_string::(); + let pattern_array = args[1].as_string::(); + let replacement_array = args[2].as_string::(); + regexp_replace::( + string_array, + pattern_array, + replacement_array, + args.get(3), + ) + } + DataType::LargeUtf8 => { + let string_array = args[0].as_string::(); + let pattern_array = args[1].as_string::(); + let replacement_array = args[2].as_string::(); + regexp_replace::( + string_array, + pattern_array, + replacement_array, + args.get(3), + ) + } + other => { + exec_err!( + "Unsupported data type {other:?} for function regex_replace" + ) + } + } } } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 83c75b8df38c..a50724b36468 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -460,6 +460,52 @@ Xiangpeng Raphael NULL +### Test REGEXP_REPLACE + +# Should run REGEXP_REPLACE with Scalar value for utf8view +query T +SELECT + REGEXP_REPLACE(column1_utf8view, 'e', 'f') AS k +FROM test; +---- +Andrfw +Xiangpfng +Raphafl +NULL + +# Should run REGEXP_REPLACE with Scalar value for utf8 +query T +SELECT + REGEXP_REPLACE(column1_utf8, 'e', 'f') AS k +FROM test; +---- +Andrfw +Xiangpfng +Raphafl +NULL + +# Should run REGEXP_REPLACE with ScalarArray value for utf8view +query T +SELECT + REGEXP_REPLACE(column1_utf8view, lower(column1_utf8view), 'bar') AS k +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + +# Should run REGEXP_REPLACE with ScalarArray value for utf8 +query T +SELECT + REGEXP_REPLACE(column1_utf8, lower(column1_utf8), 'bar') AS k +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + ### Initcap query TT From a704b99d08db4287cd72ea1d803adb94d21d43f2 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 20:03:37 -0500 Subject: [PATCH 2/8] fix regex bench --- datafusion/functions/benches/regx.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index 23d57f38efae..f97296cfb961 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -122,12 +122,11 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( - regexp_replace::(&[ + regexp_replace::( Arc::clone(&data), Arc::clone(®ex), Arc::clone(&replacement), - Arc::clone(&flags), - ]) + Some(&Arc::clone(&flags))) .expect("regexp_replace should work on valid values"), ) }) From 077f7613b53fba39c541130e7c910243d93b7c53 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 20:05:50 -0500 Subject: [PATCH 3/8] fmt --- datafusion/functions/benches/regx.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index f97296cfb961..89c0e08d8f19 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -126,7 +126,8 @@ fn criterion_benchmark(c: &mut Criterion) { Arc::clone(&data), Arc::clone(®ex), Arc::clone(&replacement), - Some(&Arc::clone(&flags))) + Some(&Arc::clone(&flags)), + ) .expect("regexp_replace should work on valid values"), ) }) From 44f28702da28ed71b62ddfe9028b55b8a32de8ca Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 20:27:53 -0500 Subject: [PATCH 4/8] fix bench regx --- datafusion/functions/benches/regx.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/benches/regx.rs b/datafusion/functions/benches/regx.rs index 89c0e08d8f19..45bfa2351128 100644 --- a/datafusion/functions/benches/regx.rs +++ b/datafusion/functions/benches/regx.rs @@ -18,7 +18,7 @@ extern crate criterion; use arrow::array::builder::StringBuilder; -use arrow::array::{ArrayRef, StringArray}; +use arrow::array::{ArrayRef, AsArray, StringArray}; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use datafusion_functions::regex::regexplike::regexp_like; use datafusion_functions::regex::regexpmatch::regexp_match; @@ -123,10 +123,10 @@ fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { black_box( regexp_replace::( - Arc::clone(&data), - Arc::clone(®ex), - Arc::clone(&replacement), - Some(&Arc::clone(&flags)), + data.as_string::(), + regex.as_string::(), + replacement.as_string::(), + Some(&flags), ) .expect("regexp_replace should work on valid values"), ) From 173d80f183c7e1e0fc155c267b4a237b2bba7c24 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 20:38:41 -0500 Subject: [PATCH 5/8] clippy --- datafusion/functions/src/regex/regexpreplace.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index f033b7a51948..aa495cd6e24c 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -285,9 +285,6 @@ where Ok(Arc::new(result) as ArrayRef) } - other => exec_err!( - "regexp_replace was called with {other:?} arguments. It requires at least 3 and at most 4." - ), } } From c6bb1afadc29bad7e68ecd6435b2d3682f577b42 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 27 Aug 2024 20:41:40 -0500 Subject: [PATCH 6/8] fmt --- .../functions/src/regex/regexpreplace.rs | 135 ++++++++++-------- 1 file changed, 74 insertions(+), 61 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index aa495cd6e24c..85d6799ed283 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -210,30 +210,33 @@ where match flags { None => { let result = string_array_iter - .zip(pattern_array_iter) - .zip(replacement_array_iter) - .map(|((string, pattern), replacement)| match (string, pattern, replacement) { - (Some(string), Some(pattern), Some(replacement)) => { - let replacement = regex_replace_posix_groups(replacement); - // if patterns hashmap already has regexp then use else create and return - let re = match patterns.get(pattern) { - Some(re) => Ok(re), - None => { - match Regex::new(pattern) { - Ok(re) => { - patterns.insert(pattern.to_string(), re); - Ok(patterns.get(pattern).unwrap()) + .zip(pattern_array_iter) + .zip(replacement_array_iter) + .map(|((string, pattern), replacement)| { + match (string, pattern, replacement) { + (Some(string), Some(pattern), Some(replacement)) => { + let replacement = regex_replace_posix_groups(replacement); + // if patterns hashmap already has regexp then use else create and return + let re = match patterns.get(pattern) { + Some(re) => Ok(re), + None => match Regex::new(pattern) { + Ok(re) => { + patterns.insert(pattern.to_string(), re); + Ok(patterns.get(pattern).unwrap()) + } + Err(err) => { + Err(DataFusionError::External(Box::new(err))) + } }, - Err(err) => Err(DataFusionError::External(Box::new(err))), - } - } - }; + }; - Some(re.map(|re| re.replace(string, replacement.as_str()))).transpose() - } - _ => Ok(None) - }) - .collect::>>()?; + Some(re.map(|re| re.replace(string, replacement.as_str()))) + .transpose() + } + _ => Ok(None), + } + }) + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } @@ -241,47 +244,57 @@ where let flags_array = as_generic_string_array::(flags)?; let result = string_array_iter - .zip(pattern_array_iter) - .zip(replacement_array_iter) - .zip(flags_array.iter()) - .map(|(((string, pattern), replacement), flags)| match (string, pattern, replacement, flags) { - (Some(string), Some(pattern), Some(replacement), Some(flags)) => { - let replacement = regex_replace_posix_groups(replacement); - - // format flags into rust pattern - let (pattern, replace_all) = if flags == "g" { - (pattern.to_string(), true) - } else if flags.contains('g') { - (format!("(?{}){}", flags.to_string().replace('g', ""), pattern), true) - } else { - (format!("(?{flags}){pattern}"), false) - }; - - // if patterns hashmap already has regexp then use else create and return - let re = match patterns.get(&pattern) { - Some(re) => Ok(re), - None => { - match Regex::new(pattern.as_str()) { - Ok(re) => { - patterns.insert(pattern.clone(), re); - Ok(patterns.get(&pattern).unwrap()) + .zip(pattern_array_iter) + .zip(replacement_array_iter) + .zip(flags_array.iter()) + .map(|(((string, pattern), replacement), flags)| { + match (string, pattern, replacement, flags) { + (Some(string), Some(pattern), Some(replacement), Some(flags)) => { + let replacement = regex_replace_posix_groups(replacement); + + // format flags into rust pattern + let (pattern, replace_all) = if flags == "g" { + (pattern.to_string(), true) + } else if flags.contains('g') { + ( + format!( + "(?{}){}", + flags.to_string().replace('g', ""), + pattern + ), + true, + ) + } else { + (format!("(?{flags}){pattern}"), false) + }; + + // if patterns hashmap already has regexp then use else create and return + let re = match patterns.get(&pattern) { + Some(re) => Ok(re), + None => match Regex::new(pattern.as_str()) { + Ok(re) => { + patterns.insert(pattern.clone(), re); + Ok(patterns.get(&pattern).unwrap()) + } + Err(err) => { + Err(DataFusionError::External(Box::new(err))) + } }, - Err(err) => Err(DataFusionError::External(Box::new(err))), - } + }; + + Some(re.map(|re| { + if replace_all { + re.replace_all(string, replacement.as_str()) + } else { + re.replace(string, replacement.as_str()) + } + })) + .transpose() } - }; - - Some(re.map(|re| { - if replace_all { - re.replace_all(string, replacement.as_str()) - } else { - re.replace(string, replacement.as_str()) - } - })).transpose() - } - _ => Ok(None) - }) - .collect::>>()?; + _ => Ok(None), + } + }) + .collect::>>()?; Ok(Arc::new(result) as ArrayRef) } From 20b829d9d472174ccc84660450c9b3babed6e76a Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 28 Aug 2024 12:46:36 -0500 Subject: [PATCH 7/8] adds tests for flags + includes type signature for utf8view with flag --- .../functions/src/regex/regexpreplace.rs | 1 + .../sqllogictest/test_files/string_view.slt | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 85d6799ed283..650d56941aad 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -59,6 +59,7 @@ impl RegexpReplaceFunc { Exact(vec![Utf8, Utf8, Utf8]), Exact(vec![Utf8View, Utf8, Utf8]), Exact(vec![Utf8, Utf8, Utf8, Utf8]), + Exact(vec![Utf8View, Utf8, Utf8, Utf8]), ], Volatility::Immutable, ), diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index a50724b36468..2a3159bdbff2 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -473,6 +473,17 @@ Xiangpfng Raphafl NULL +# Should run REGEXP_REPLACE with Scalar value for utf8view with flag +query T +SELECT + REGEXP_REPLACE(column1_utf8view, 'e', 'f', 'i') AS k +FROM test; +---- +Andrfw +Xiangpfng +Raphafl +NULL + # Should run REGEXP_REPLACE with Scalar value for utf8 query T SELECT @@ -484,6 +495,17 @@ Xiangpfng Raphafl NULL +# Should run REGEXP_REPLACE with Scalar value for utf8 with flag +query T +SELECT + REGEXP_REPLACE(column1_utf8, 'e', 'f', 'i') AS k +FROM test; +---- +Andrfw +Xiangpfng +Raphafl +NULL + # Should run REGEXP_REPLACE with ScalarArray value for utf8view query T SELECT @@ -495,6 +517,17 @@ Xiangpeng Raphael NULL +# Should run REGEXP_REPLACE with ScalarArray value for utf8view with flag +query T +SELECT + REGEXP_REPLACE(column1_utf8view, lower(column1_utf8view), 'bar', 'g') AS k +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + # Should run REGEXP_REPLACE with ScalarArray value for utf8 query T SELECT @@ -506,6 +539,17 @@ Xiangpeng Raphael NULL +# Should run REGEXP_REPLACE with ScalarArray value for utf8 with flag +query T +SELECT + REGEXP_REPLACE(column1_utf8, lower(column1_utf8), 'bar', 'g') AS k +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + ### Initcap query TT From 0ebf99089451b82722f03f6f3cbff5fea022612c Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 11 Sep 2024 11:25:35 -0500 Subject: [PATCH 8/8] fix: adding collect for string view type --- .../functions/src/regex/regexpreplace.rs | 76 ++++++++++--------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 650d56941aad..9693bb6e8b5e 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -16,15 +16,15 @@ // under the License. //! Regx expressions -use arrow::array::ArrayAccessor; use arrow::array::ArrayDataBuilder; use arrow::array::BufferBuilder; use arrow::array::GenericStringArray; use arrow::array::StringViewBuilder; use arrow::array::{new_null_array, ArrayIter, AsArray}; use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::array::{ArrayAccessor, StringViewArray}; use arrow::datatypes::DataType; -use datafusion_common::cast::{as_string_array, as_string_view_array}; +use datafusion_common::cast::as_string_view_array; use datafusion_common::exec_err; use datafusion_common::plan_err; use datafusion_common::ScalarValue; @@ -204,13 +204,15 @@ where // creating Regex is expensive so create hashmap for memoization let mut patterns: HashMap = HashMap::new(); + let datatype = string_array.data_type().to_owned(); + let string_array_iter = ArrayIter::new(string_array); let pattern_array_iter = ArrayIter::new(pattern_array); let replacement_array_iter = ArrayIter::new(replacement_array); match flags { None => { - let result = string_array_iter + let result_iter = string_array_iter .zip(pattern_array_iter) .zip(replacement_array_iter) .map(|((string, pattern), replacement)| { @@ -236,15 +238,29 @@ where } _ => Ok(None), } - }) - .collect::>>()?; + }); - Ok(Arc::new(result) as ArrayRef) + match datatype { + DataType::Utf8 | DataType::LargeUtf8 => { + let result = + result_iter.collect::>>()?; + Ok(Arc::new(result) as ArrayRef) + } + DataType::Utf8View => { + let result = result_iter.collect::>()?; + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!( + "Unsupported data type {other:?} for function regex_replace" + ) + } + } } Some(flags) => { let flags_array = as_generic_string_array::(flags)?; - let result = string_array_iter + let result_iter = string_array_iter .zip(pattern_array_iter) .zip(replacement_array_iter) .zip(flags_array.iter()) @@ -294,10 +310,24 @@ where } _ => Ok(None), } - }) - .collect::>>()?; + }); - Ok(Arc::new(result) as ArrayRef) + match datatype { + DataType::Utf8 | DataType::LargeUtf8 => { + let result = + result_iter.collect::>>()?; + Ok(Arc::new(result) as ArrayRef) + } + DataType::Utf8View => { + let result = result_iter.collect::>()?; + Ok(Arc::new(result) as ArrayRef) + } + other => { + exec_err!( + "Unsupported data type {other:?} for function regex_replace" + ) + } + } } } } @@ -516,34 +546,12 @@ pub fn specialize_regexp_replace( let string_array = args[0].as_string_view(); let pattern_array = args[1].as_string::(); let replacement_array = args[2].as_string::(); - let regexp_replace_result = regexp_replace::( + regexp_replace::( string_array, pattern_array, replacement_array, args.get(3), - )?; - - if regexp_replace_result.data_type() == &DataType::Utf8 { - let string_view_array = - as_string_array(®exp_replace_result)?.to_owned(); - - let mut builder = - StringViewBuilder::with_capacity(string_view_array.len()) - .with_block_size(1024 * 1024 * 2); - - for val in string_view_array.iter() { - if let Some(val) = val { - builder.append_value(val); - } else { - builder.append_null(); - } - } - - let result = builder.finish(); - Ok(Arc::new(result) as ArrayRef) - } else { - Ok(regexp_replace_result) - } + ) } DataType::Utf8 => { let string_array = args[0].as_string::();