Skip to content

Commit

Permalink
Support dictionary data type in array_to_string (apache#10908)
Browse files: browse the repository at this point in the history
* Support dictionary data type in array_to_string

* Fix import

* Some tests

* Update datafusion/functions-array/src/string.rs

Co-authored-by: Alex Huang <[email protected]>

* Add some tests showing incorrect results

* Get logical array

* Apply rustfmt

* Simplify implementation, avoid panics

---------

Co-authored-by: Alex Huang <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
3 people authored and findepi committed Jul 16, 2024
1 parent da2d371 commit 87e69b9
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 4 deletions.
29 changes: 25 additions & 4 deletions datafusion/functions-array/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@ use arrow::array::{
use arrow::datatypes::{DataType, Field};
use datafusion_expr::TypeSignature;

use datafusion_common::{plan_err, DataFusionError, Result};
use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result};

use std::any::{type_name, Any};

use crate::utils::{downcast_arg, make_scalar_function};
use arrow_schema::DataType::{FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8};
use arrow::compute::cast;
use arrow_schema::DataType::{
Dictionary, FixedSizeList, LargeList, LargeUtf8, List, Null, Utf8,
};
use datafusion_common::cast::{
as_generic_string_array, as_large_list_array, as_list_array, as_string_array,
};
Expand Down Expand Up @@ -76,7 +79,7 @@ macro_rules! call_array_function {
DataType::UInt16 => array_function!(UInt16Array),
DataType::UInt32 => array_function!(UInt32Array),
DataType::UInt64 => array_function!(UInt64Array),
_ => unreachable!(),
dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
}
};
($DATATYPE:expr, $INCLUDE_LIST:expr) => {{
Expand All @@ -95,7 +98,7 @@ macro_rules! call_array_function {
DataType::UInt16 => array_function!(UInt16Array),
DataType::UInt32 => array_function!(UInt32Array),
DataType::UInt64 => array_function!(UInt64Array),
_ => unreachable!(),
dt => not_impl_err!("Unsupported data type in array_to_string: {dt}"),
}
}};
}
Expand Down Expand Up @@ -245,6 +248,8 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
with_null_string = true;
}

/// Creates a single string from a single element of a ListArray (which is
/// itself another Array)
fn compute_array_to_string(
arg: &mut String,
arr: ArrayRef,
Expand Down Expand Up @@ -281,6 +286,22 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> {

Ok(arg)
}
Dictionary(_key_type, value_type) => {
// Cast to the dictionary's value type to unwrap the dictionary. This could be
// optimized by handling dictionaries directly, at the cost of extra code
let values = cast(&arr, value_type.as_ref()).map_err(|e| {
DataFusionError::from(e).context(
"Casting dictionary to values in compute_array_to_string",
)
})?;
compute_array_to_string(
arg,
values,
delimiter,
null_string,
with_null_string,
)
}
Null => Ok(arg),
data_type => {
macro_rules! array_function {
Expand Down
48 changes: 48 additions & 0 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -3769,6 +3769,54 @@ select array_to_string(make_array(), ',')
----
(empty)

# array to string dictionary
statement ok
CREATE TABLE table1 AS VALUES
(1, 'foo'),
(3, 'bar'),
(1, 'foo'),
(2, NULL),
(NULL, 'baz')
;

# expect 1-3-1-2 (dictionary values should be repeated)
query T
SELECT array_to_string(array_agg(column1),'-')
FROM (
SELECT arrow_cast(column1, 'Dictionary(Int32, Int32)') as column1
FROM table1
);
----
1-3-1-2

# expect foo,bar,foo,baz (dictionary values should be repeated)
query T
SELECT array_to_string(array_agg(column2),',')
FROM (
SELECT arrow_cast(column2, 'Dictionary(Int64, Utf8)') as column2
FROM table1
);
----
foo,bar,foo,baz

# Expect only values that are in the group
query I?T
SELECT column1, array_agg(column2), array_to_string(array_agg(column2),',')
FROM (
SELECT column1, arrow_cast(column2, 'Dictionary(Int32, Utf8)') as column2
FROM table1
)
GROUP BY column1
ORDER BY column1;
----
1 [foo, foo] foo,foo
2 [] (empty)
3 [bar] bar
NULL [baz] baz

statement ok
drop table table1;


## array_union (aliases: `list_union`)

Expand Down

0 comments on commit 87e69b9

Please sign in to comment.