Skip to content

Commit

Permalink
Update INITCAP scalar function to support Utf8View
Browse files Browse the repository at this point in the history
  • Loading branch information
xinlifoobar committed Aug 8, 2024
1 parent 2521043 commit 5adad9a
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 19 deletions.
81 changes: 62 additions & 19 deletions datafusion/functions/src/string/initcap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use std::sync::Arc;
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::datatypes::DataType;

use datafusion_common::cast::as_generic_string_array;
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
Expand All @@ -45,7 +45,7 @@ impl InitcapFunc {
Self {
signature: Signature::uniform(
1,
vec![Utf8, LargeUtf8],
vec![Utf8, LargeUtf8, Utf8View],
Volatility::Immutable,
),
}
Expand Down Expand Up @@ -73,6 +73,9 @@ impl ScalarUDFImpl for InitcapFunc {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
DataType::Utf8View => {
make_scalar_function(initcap_utf8view::<i32>, vec![])(args)
}
other => {
exec_err!("Unsupported data type {other:?} for function initcap")
}
Expand All @@ -88,28 +91,40 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
// map over the array iterator; each element is an `Option<_>`, so nulls pass through unchanged
let result = string_array
.iter()
.map(|string| {
string.map(|string: &str| {
let mut char_vector = Vec::<char>::new();
let mut previous_character_letter_or_number = false;
for c in string.chars() {
if previous_character_letter_or_number {
char_vector.push(c.to_ascii_lowercase());
} else {
char_vector.push(c.to_ascii_uppercase());
}
previous_character_letter_or_number = c.is_ascii_uppercase()
|| c.is_ascii_lowercase()
|| c.is_ascii_digit();
}
char_vector.iter().collect::<String>()
})
})
.map(initcap_string)
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}

fn initcap_utf8view<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;

let result = string_view_array
.iter()
.map(initcap_string)
.collect::<GenericStringArray<T>>();

Ok(Arc::new(result) as ArrayRef)
}

/// Capitalizes the first letter of every word in `string` and lowercases the
/// remaining letters of that word.
///
/// A word is a maximal run of ASCII letters and digits; every other character
/// (whitespace, punctuation, any non-ASCII character) acts as a separator and
/// is copied through unchanged. Only ASCII letters ever change case.
///
/// Returns `None` when `string` is `None`, so null inputs stay null.
fn initcap_string(string: Option<&str>) -> Option<String> {
    string.map(|string| {
        // ASCII case conversion never changes a character's UTF-8 width, so
        // the output occupies exactly as many bytes as the input. Writing
        // straight into a pre-sized `String` avoids an intermediate
        // `Vec<char>` and the second pass needed to collect it.
        let mut result = String::with_capacity(string.len());
        let mut previous_character_letter_or_number = false;
        for c in string.chars() {
            result.push(if previous_character_letter_or_number {
                c.to_ascii_lowercase()
            } else {
                c.to_ascii_uppercase()
            });
            // A digit continues a word just like a letter does, so the
            // character following it is lowercased rather than capitalized.
            previous_character_letter_or_number = c.is_ascii_alphanumeric();
        }
        result
    })
}

#[cfg(test)]
mod tests {
use crate::string::initcap::InitcapFunc;
Expand Down Expand Up @@ -153,6 +168,34 @@ mod tests {
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"hi THOMAS".to_string()
)))],
Ok(Some("Hi Thomas")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"".to_string()
)))],
Ok(Some("")),
&str,
Utf8,
StringArray
);
test_function!(
InitcapFunc::new(),
&[ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
Ok(None),
&str,
Utf8,
StringArray
);

Ok(())
}
Expand Down
35 changes: 35 additions & 0 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,41 @@ logical_plan
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
02)--TableScan: test projection=[column1_utf8view]

### Initcap

# Create a table with lowercase strings
statement ok
CREATE TABLE test_lowercase AS SELECT
lower(column1_utf8) as column1_utf8_lower,
lower(column1_large_utf8) as column1_large_utf8_lower,
lower(column1_utf8view) as column1_utf8view_lower
FROM test;

# Test INITCAP with utf8view, utf8, and largeutf8
# Should not cast anything
query TT
EXPLAIN SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
logical_plan
01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3
02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower]

query TTT
SELECT
INITCAP(column1_utf8view_lower) as c1,
INITCAP(column1_utf8_lower) as c2,
INITCAP(column1_large_utf8_lower) as c3
FROM test_lowercase;
----
Andrew Andrew Andrew
Xiangpeng Xiangpeng Xiangpeng
Raphael Raphael Raphael
NULL NULL NULL

statement ok
drop table test;

Expand Down

0 comments on commit 5adad9a

Please sign in to comment.