Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support Utf8View type in starts_with function #11787

Merged
merged 13 commits into from
Aug 6, 2024
1 change: 1 addition & 0 deletions datafusion/expr/src/expr_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ impl ExprSchemable for Expr {
.iter()
.map(|e| e.get_type(schema))
.collect::<Result<Vec<_>>>()?;

// verify that function is invoked with correct number and type of arguments as defined in `TypeSignature`
data_types_with_scalar_udf(&arg_data_types, func).map_err(|err| {
plan_datafusion_err!(
Expand Down
14 changes: 14 additions & 0 deletions datafusion/expr/src/type_coercion/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,8 @@ fn coerced_from<'a>(
(Interval(_), _) if matches!(type_from, Utf8 | LargeUtf8) => {
Some(type_into.clone())
}
// We can go into a Utf8View from a Utf8 or LargeUtf8
(Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into.clone()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LargeUtf8 has 64 bit length, Utf8View only got 32 bit length.
Loos like LargeUtf8 -> Utf8View is not possible?

Copy link
Contributor

@alamb alamb Aug 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The arrow cast kernel supports this conversion

Though I am not sure if we should allow the automatic coercion as it could potentially fail if the strings were over 2GB 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, the semantics is < 32 bit length LargeUtf8 conversion is supported, otherwise, some runtime error will occur. This makes sense.

// Any type can be coerced into strings
(Utf8 | LargeUtf8, _) => Some(type_into.clone()),
(Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()),
Expand Down Expand Up @@ -636,6 +638,18 @@ mod tests {
use super::*;
use arrow::datatypes::Field;

#[test]
fn test_string_conversion() {
let cases = vec![
(DataType::Utf8View, DataType::Utf8, true),
(DataType::Utf8View, DataType::LargeUtf8, true),
];

for case in cases {
assert_eq!(can_coerce_from(&case.0, &case.1), case.2);
}
}

#[test]
fn test_maybe_data_types() {
// this vec contains: arg1, arg2, expected result
Expand Down
89 changes: 69 additions & 20 deletions datafusion/functions/src/string/starts_with.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
use std::any::Any;
use std::sync::Arc;

use arrow::array::{ArrayRef, OffsetSizeTrait};
use arrow::array::ArrayRef;
use arrow::datatypes::DataType;

use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
use datafusion_common::{internal_err, Result};
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
Expand All @@ -30,12 +30,8 @@ use crate::utils::make_scalar_function;

/// Returns true if string starts with prefix.
/// starts_with('alphabet', 'alph') = 't'
pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let left = as_generic_string_array::<T>(&args[0])?;
let right = as_generic_string_array::<T>(&args[1])?;

let result = arrow::compute::kernels::comparison::starts_with(left, right)?;

pub fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> {
let result = arrow::compute::kernels::comparison::starts_with(&args[0], &args[1])?;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Ok(Arc::new(result) as ArrayRef)
}

Expand All @@ -52,14 +48,12 @@ impl Default for StartsWithFunc {

impl StartsWithFunc {
pub fn new() -> Self {
use DataType::*;
Self {
signature: Signature::one_of(
vec![
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8, LargeUtf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![LargeUtf8, LargeUtf8]),
Exact(vec![DataType::Utf8View, DataType::Utf8View]),
tshauck marked this conversation as resolved.
Show resolved Hide resolved
Exact(vec![DataType::Utf8, DataType::Utf8]),
Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]),
],
Volatility::Immutable,
),
Expand All @@ -81,18 +75,73 @@ impl ScalarUDFImpl for StartsWithFunc {
}

fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
use DataType::*;

Ok(Boolean)
Ok(DataType::Boolean)
}

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(starts_with::<i32>, vec![])(args),
DataType::LargeUtf8 => {
return make_scalar_function(starts_with::<i64>, vec![])(args);
DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => {
make_scalar_function(starts_with, vec![])(args)
}
_ => internal_err!("Unsupported data type"),
_ => internal_err!("Unsupported data types for starts_with")?,
tshauck marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

#[cfg(test)]
mod tests {
use crate::utils::test::test_function;
use arrow::array::{Array, BooleanArray};
use arrow::datatypes::DataType::Boolean;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};

use super::*;

#[test]
fn test_functions() -> Result<()> {
// Generate test cases for starts_with
let test_cases = vec![
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this is the best way to test this change. It tests the function but doesn't a view wouldn't be coerced.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tests for the coercion in #11753 -- you beat me to filing the ticket ❤️ -- I plan to make an epic / start filing items for the various other functions

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend using a slt test (example of sql test above)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to throw it out there, if you make the top level epic, I'd be happy to work though that PR and make individual issues for the problematic functions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deal! I filed #11790

What I recommend doing is starting with a single ticket and making it a complete template / description with the idea being that that someone that is not familar DataFusion or Utf8View can work on it

Then when we file tickets for the rest of the functions we can reuse the same description over again. This

  1. Reduces the barrier to contribution
  2. I have found people love working on the code when there is a clear description

I took a shot at updating #11786 with background for use a template. Let me know what you think

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me. I'll have a go at replicating one of the other functions following #11786, and then if we're happy with it, I can grind through the others.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good -- thank you. What I have found works well in the past is to do like 10 at a time (if you make more than that the number of parallel threads / work I found overwhelmed our ability to review them)

(Some("alphabet"), Some("alph"), Some(true)),
(Some("alphabet"), Some("bet"), Some(false)),
(
Some("somewhat large string"),
Some("somewhat large"),
Some(true),
),
(Some("somewhat large string"), Some("large"), Some(false)),
]
.into_iter()
.flat_map(|(a, b, c)| {
let utf_8_args = vec![
ColumnarValue::Scalar(ScalarValue::Utf8(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::Utf8(b.map(|s| s.to_string()))),
];

let large_utf_8_args = vec![
ColumnarValue::Scalar(ScalarValue::LargeUtf8(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::LargeUtf8(b.map(|s| s.to_string()))),
];

let utf_8_view_args = vec![
ColumnarValue::Scalar(ScalarValue::Utf8View(a.map(|s| s.to_string()))),
ColumnarValue::Scalar(ScalarValue::Utf8View(b.map(|s| s.to_string()))),
];

vec![(utf_8_args, c), (large_utf_8_args, c), (utf_8_view_args, c)]
});

for (args, expected) in test_cases {
test_function!(
StartsWithFunc::new(),
&args,
Ok(expected),
bool,
Boolean,
BooleanArray
);
}

Ok(())
}
}
10 changes: 9 additions & 1 deletion datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,15 @@ logical_plan
01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]]
02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict]

query TT
EXPLAIN SELECT
STARTS_WITH(column1_utf8view, 'foo') as c,
STARTS_WITH(column1_utf8view, column2_utf8view) as c2
FROM test;
----
logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

statement ok
drop table test;
Expand All @@ -376,6 +385,5 @@ select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt;
----
2024-01-23


statement ok
drop table dates;