Skip to content

Commit

Permalink
Add utf8 support and bench
Browse files Browse the repository at this point in the history
  • Loading branch information
xinlifoobar committed Aug 22, 2024
1 parent ee14adf commit 08343dd
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 8 deletions.
32 changes: 31 additions & 1 deletion datafusion/functions/benches/regx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
extern crate criterion;

use arrow::array::builder::StringBuilder;
use arrow::array::{ArrayRef, StringArray};
use arrow::array::{ArrayRef, Int64Array, StringArray};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_functions::regex::regexpcount::regexp_count;
use datafusion_functions::regex::regexplike::regexp_like;
use datafusion_functions::regex::regexpmatch::regexp_match;
use datafusion_functions::regex::regexpreplace::regexp_replace;
Expand Down Expand Up @@ -59,6 +60,15 @@ fn regex(rng: &mut ThreadRng) -> StringArray {
StringArray::from(data)
}

/// Builds a 1000-element `Int64Array` of random integers drawn from the
/// half-open range `1..5` (i.e. 1 through 4).
///
/// The benchmark below passes this array as the third (`start`) argument of
/// `regexp_count`.
fn start(rng: &mut ThreadRng) -> Int64Array {
    // One RNG draw per element, in order, exactly as a push loop would do.
    let positions: Vec<i64> = (0..1000).map(|_| rng.gen_range(1..5)).collect();
    Int64Array::from(positions)
}

fn flags(rng: &mut ThreadRng) -> StringArray {
let samples = [Some("i".to_string()), Some("im".to_string()), None];
let mut sb = StringBuilder::new();
Expand All @@ -75,6 +85,26 @@ fn flags(rng: &mut ThreadRng) -> StringArray {
}

fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("regexp_count_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
let regex = Arc::new(regex(&mut rng)) as ArrayRef;
let start = Arc::new(start(&mut rng)) as ArrayRef;
let flags = Arc::new(flags(&mut rng)) as ArrayRef;

b.iter(|| {
black_box(
regexp_count::<i32>(&[
Arc::clone(&data),
Arc::clone(&regex),
Arc::clone(&start),
Arc::clone(&flags),
])
.expect("regexp_count should work on valid values"),
)
})
});

c.bench_function("regexp_like_1000", |b| {
let mut rng = rand::thread_rng();
let data = Arc::new(data(&mut rng)) as ArrayRef;
Expand Down
12 changes: 11 additions & 1 deletion datafusion/functions/src/regex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,18 @@ make_udf_function!(
pub mod expr_fn {
use datafusion_expr::Expr;

pub fn regexp_count(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
/// Returns the number of consecutive occurrences of a regular expression in a string.
pub fn regexp_count(
values: Expr,
regex: Expr,
start: Option<Expr>,
flags: Option<Expr>,
) -> Expr {
let mut args = vec![values, regex];
if let Some(start) = start {
args.push(start);
};

if let Some(flags) = flags {
args.push(flags);
};
Expand Down
11 changes: 5 additions & 6 deletions datafusion/functions/src/regex/regexpcount.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ use arrow::array::{
Array, ArrayRef, AsArray, Datum, GenericStringArray, Int64Array, OffsetSizeTrait,
Scalar,
};
use arrow::datatypes::DataType::{self, Int64, LargeUtf8, Utf8};
use arrow::datatypes::DataType::{self, Int64, LargeUtf8, Utf8, Utf8View};
use arrow::datatypes::Int64Type;
use arrow::error::ArrowError;
use datafusion_common::cast::{as_generic_string_array, as_primitive_array};
use datafusion_common::{
arrow_err, exec_err, internal_err, DataFusionError, Result, ScalarValue,
};
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::TypeSignature::{Exact, Uniform};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
use itertools::izip;
use regex::Regex;
Expand All @@ -51,14 +51,13 @@ impl RegexpCountFunc {
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8, Utf8]),
Uniform(2, vec![Utf8, LargeUtf8, Utf8View]),
Exact(vec![Utf8, Utf8, Int64]),
Exact(vec![Utf8, Utf8, Int64, Utf8]),
Exact(vec![Utf8, Utf8, Int64, LargeUtf8]),
Exact(vec![LargeUtf8, LargeUtf8]),
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
Exact(vec![LargeUtf8, LargeUtf8, Int64, Utf8]),
Exact(vec![LargeUtf8, LargeUtf8, Int64, LargeUtf8]),
Exact(vec![Utf8View, Utf8View, Int64]),
Exact(vec![Utf8View, Utf8View, Int64, Utf8View]),
],
Volatility::Immutable,
),
Expand Down
113 changes: 113 additions & 0 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,119 @@ SELECT regexp_count(str, 'ab', 1, 'i') from t;
0


query I
SELECT regexp_count(str, pattern) from t;
----
1
1
0
0
0
0
1
1
1
1
1

query I
SELECT regexp_count(str, pattern, start) from t;
----
1
1
0
0
0
0
0
1
1
1
1

query I
SELECT regexp_count(str, pattern, start, flags) from t;
----
1
1
1
0
0
0
0
1
1
1
1

# test type coercion
query I
SELECT regexp_count(arrow_cast(str, 'Utf8'), arrow_cast(pattern, 'LargeUtf8'), arrow_cast(start, 'Int32'), flags) from t;
----
1
1
1
0
0
0
0
1
1
1
1

# test string views
#
# t_stringview mirrors table t with str/pattern/flags cast to Utf8View and
# start cast to Int64. The queries below must read from t_stringview (not t),
# otherwise the Utf8View code path is never exercised. The expected results
# are identical to the equivalent queries against t because the data is the same.

statement ok
CREATE TABLE t_stringview AS
SELECT arrow_cast(str, 'Utf8View') as str, arrow_cast(pattern, 'Utf8View') as pattern, arrow_cast(start, 'Int64') as start, arrow_cast(flags, 'Utf8View') as flags FROM t;

# Utf8View column with a literal pattern
query I
SELECT regexp_count(str, '\w') from t_stringview;
----
3
3
3
3
3
4
4
10
6
4
7

# Utf8View column, literal pattern, Int64 start column
query I
SELECT regexp_count(str, '\w{2}', start) from t_stringview;
----
1
1
1
1
0
2
1
4
1
2
3

# Utf8View column with literal pattern, start and flags
query I
SELECT regexp_count(str, 'ab', 1, 'i') from t_stringview;
----
1
1
1
1
1
0
0
0
0
0
0


query I
SELECT regexp_count(str, pattern) from t;
----
Expand Down

0 comments on commit 08343dd

Please sign in to comment.