Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support unicode character for initcap function #13752

Merged
merged 6 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -211,4 +211,4 @@ required-features = ["math_expressions"]
[[bench]]
harness = false
name = "initcap"
required-features = ["string_expressions"]
required-features = ["unicode_expressions"]
4 changes: 2 additions & 2 deletions datafusion/functions/benches/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use arrow::util::bench_util::{
};
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string;
use datafusion_functions::unicode;
use std::sync::Arc;

fn create_args<O: OffsetSizeTrait>(
Expand All @@ -46,7 +46,7 @@ fn create_args<O: OffsetSizeTrait>(
}

fn criterion_benchmark(c: &mut Criterion) {
let initcap = string::initcap();
let initcap = unicode::initcap();
for size in [1024, 4096] {
let args = create_args::<i32>(size, 8, true);
c.bench_function(
Expand Down
7 changes: 0 additions & 7 deletions datafusion/functions/src/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ pub mod concat;
pub mod concat_ws;
pub mod contains;
pub mod ends_with;
pub mod initcap;
pub mod levenshtein;
pub mod lower;
pub mod ltrim;
Expand All @@ -52,7 +51,6 @@ make_udf_function!(chr::ChrFunc, chr);
make_udf_function!(concat::ConcatFunc, concat);
make_udf_function!(concat_ws::ConcatWsFunc, concat_ws);
make_udf_function!(ends_with::EndsWithFunc, ends_with);
make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
make_udf_function!(ltrim::LtrimFunc, ltrim);
make_udf_function!(lower::LowerFunc, lower);
Expand Down Expand Up @@ -94,10 +92,6 @@ pub mod expr_fn {
ends_with,
"Returns true if the `string` ends with the `suffix`, false otherwise.",
string suffix
),(
initcap,
"Converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
string
),(
levenshtein,
"Returns the Levenshtein distance between the two given strings",
Expand Down Expand Up @@ -177,7 +171,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
concat(),
concat_ws(),
ends_with(),
initcap(),
levenshtein(),
lower(),
ltrim(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
use std::any::Any;
use std::sync::{Arc, OnceLock};

use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
use arrow::array::{
Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
};
use arrow::datatypes::DataType;

use crate::utils::{make_scalar_function, utf8_to_str_type};
Expand Down Expand Up @@ -74,7 +76,7 @@ impl ScalarUDFImpl for InitcapFunc {
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
other => {
exec_err!("Unsupported data type {other:?} for function initcap")
exec_err!("Unsupported data type {other:?} for function `initcap`")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
exec_err!("Unsupported data type {other:?} for function `initcap`")
internal_err!("Unsupported data type {other:?} for function `initcap`")

Incompatible argument types should already be rejected during planning, so this branch is unreachable; we can use internal_err to indicate a potential bug if it is ever executed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@2010YOUY01 Thanks for reviewing,

Incompatible argument types should already be rejected during planning, so this branch is unreachable; we can use internal_err to indicate a potential bug if it is ever executed.

Makes sense to me 👍

}
}
}
Expand All @@ -90,9 +92,8 @@ fn get_initcap_doc() -> &'static Documentation {
DOCUMENTATION.get_or_init(|| {
Documentation::builder(
DOC_SECTION_STRING,
"Capitalizes the first character in each word in the ASCII input string. \
Words are delimited by non-alphanumeric characters.\n\n\
Note this function does not support UTF-8 characters.",
"Capitalizes the first character in each word in the input string. \
Words are delimited by non-alphanumeric characters.",
"initcap(str)",
)
.with_sql_example(
Expand Down Expand Up @@ -123,50 +124,70 @@ fn get_initcap_doc() -> &'static Documentation {
fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;

// first map is the iterator, second is for the `Option<_>`
let result = string_array
.iter()
.map(initcap_string)
.collect::<GenericStringArray<T>>();
let mut builder = GenericStringBuilder::<T>::with_capacity(
string_array.len(),
string_array.value_data().len(),
);

Ok(Arc::new(result) as ArrayRef)
string_array.iter().for_each(|str| match str {
Some(s) => {
let initcap_str = initcap_string(s);
builder.append_value(initcap_str);
}
None => builder.append_null(),
});

Ok(Arc::new(builder.finish()) as ArrayRef)
}

fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;

let result = string_view_array
.iter()
.map(initcap_string)
.collect::<StringArray>();
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());

string_view_array.iter().for_each(|str| match str {
Some(s) => {
let initcap_str = initcap_string(s);
builder.append_value(initcap_str);
}
None => builder.append_null(),
});

Ok(Arc::new(result) as ArrayRef)
Ok(Arc::new(builder.finish()) as ArrayRef)
}

fn initcap_string(input: Option<&str>) -> Option<String> {
input.map(|s| {
let mut result = String::with_capacity(s.len());
let mut prev_is_alphanumeric = false;
fn initcap_string(input: &str) -> String {
let mut result = String::with_capacity(input.len());
let mut prev_is_alphanumeric = false;

for c in s.chars() {
let transformed = if prev_is_alphanumeric {
c.to_ascii_lowercase()
if input.is_ascii() {
for c in input.chars() {
if prev_is_alphanumeric {
result.push(c.to_ascii_lowercase());
} else {
c.to_ascii_uppercase()
result.push(c.to_ascii_uppercase());
};
result.push(transformed);
prev_is_alphanumeric = c.is_ascii_alphanumeric();
}
} else {
for c in input.chars() {
if prev_is_alphanumeric {
result.extend(c.to_lowercase());
} else {
result.extend(c.to_uppercase());
}
prev_is_alphanumeric = c.is_alphanumeric();
}
}

result
})
result
}

#[cfg(test)]
mod tests {
use crate::string::initcap::InitcapFunc;
use crate::unicode::initcap::InitcapFunc;
use crate::utils::test::test_function;
use arrow::array::{Array, StringArray};
use arrow::array::{Array, StringArray, StringViewArray};
use arrow::datatypes::DataType::Utf8;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
Expand All @@ -181,6 +202,19 @@ mod tests {
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
"êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
.to_string()
)))],
Ok(Some(
"Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
)),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::from(""))],
Expand All @@ -205,6 +239,7 @@ mod tests {
Utf8,
StringArray
);

test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
Expand All @@ -213,7 +248,7 @@ mod tests {
Ok(Some("Hi Thomas")),
&str,
Utf8,
StringArray
StringViewArray
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

);
test_function!(
InitcapFunc::new(),
Expand All @@ -223,7 +258,20 @@ mod tests {
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
&str,
Utf8,
StringArray
StringViewArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"đẸp đẼ êM ả ñAnDÚ ÁrBOL ОлЕГ ИвАНОВИч ÍslENsku ÞjóðaRiNNaR εΛλΗΝΙκΉ"
.to_string()
)))],
Ok(Some(
"Đẹp Đẽ Êm Ả Ñandú Árbol Олег Иванович Íslensku Þjóðarinnar Ελληνική"
)),
&str,
Utf8,
StringViewArray
);
test_function!(
InitcapFunc::new(),
Expand All @@ -233,15 +281,15 @@ mod tests {
Ok(Some("")),
&str,
Utf8,
StringArray
StringViewArray
);
test_function!(
InitcapFunc::new(),
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
Ok(None),
&str,
Utf8,
StringArray
StringViewArray
);

Ok(())
Expand Down
7 changes: 7 additions & 0 deletions datafusion/functions/src/unicode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use datafusion_expr::ScalarUDF;

pub mod character_length;
pub mod find_in_set;
pub mod initcap;
pub mod left;
pub mod lpad;
pub mod reverse;
Expand All @@ -36,6 +37,7 @@ pub mod translate;
// create UDFs
make_udf_function!(character_length::CharacterLengthFunc, character_length);
make_udf_function!(find_in_set::FindInSetFunc, find_in_set);
make_udf_function!(initcap::InitcapFunc, initcap);
make_udf_function!(left::LeftFunc, left);
make_udf_function!(lpad::LPadFunc, lpad);
make_udf_function!(right::RightFunc, right);
Expand Down Expand Up @@ -94,6 +96,10 @@ pub mod expr_fn {
left,
"returns the first `n` characters in the `string`",
string n
),(
initcap,
"converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase",
string
),(
find_in_set,
"Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings",
Expand Down Expand Up @@ -126,6 +132,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec![
character_length(),
find_in_set(),
initcap(),
left(),
lpad(),
reverse(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ Andrew Datafusion📊🔥
Xiangpeng Datafusion数据融合
Raphael Datafusionдатафусион
Under_Score Un Iść Core
Percent Pan Tadeusz Ma Iść W KąT
Percent Pan Tadeusz Ma Iść W Kąt
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've checked and this result is compatible with PostgreSQL

(empty) (empty)
(empty) (empty)
% (empty)
Expand Down
4 changes: 1 addition & 3 deletions docs/source/user-guide/sql/scalar_functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -1046,9 +1046,7 @@ find_in_set(str, strlist)

### `initcap`

Capitalizes the first character in each word in the ASCII input string. Words are delimited by non-alphanumeric characters.

Note this function does not support UTF-8 characters.
Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.

```
initcap(str)
Expand Down
Loading