Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement min max support for string/binary view types #6053

Merged
merged 3 commits into from
Jul 15, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 142 additions & 46 deletions arrow-arith/src/aggregate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -415,21 +415,41 @@ pub fn max_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&
min_max_helper::<&[u8], _, _>(array, |a, b| *a < *b)
}

/// Finds the largest value stored in a [`BinaryViewArray`].
///
/// Values are ordered by comparing their byte slices with `<`. Null entries
/// do not take part in the comparison; `None` is returned when every entry
/// is null.
pub fn max_binary_view(array: &BinaryViewArray) -> Option<&[u8]> {
    min_max_helper::<&[u8], _, _>(
        array,
        // Holds when the first value orders strictly before the second.
        |lhs, rhs| lhs < rhs,
    )
}

/// Finds the smallest value stored in a [`GenericBinaryArray`].
///
/// Values are ordered by comparing their byte slices with `>`. Null entries
/// do not take part in the comparison; `None` is returned when every entry
/// is null.
pub fn min_binary<T: OffsetSizeTrait>(array: &GenericBinaryArray<T>) -> Option<&[u8]> {
    min_max_helper::<&[u8], _, _>(
        array,
        // Holds when the first value orders strictly after the second.
        |lhs, rhs| lhs > rhs,
    )
}

/// Finds the smallest value stored in a [`BinaryViewArray`].
///
/// Values are ordered by comparing their byte slices with `>`. Null entries
/// do not take part in the comparison; `None` is returned when every entry
/// is null.
pub fn min_binary_view(array: &BinaryViewArray) -> Option<&[u8]> {
    min_max_helper::<&[u8], _, _>(
        array,
        // Holds when the first value orders strictly after the second.
        |lhs, rhs| lhs > rhs,
    )
}

/// Finds the largest value stored in a [`GenericStringArray`].
///
/// Values are ordered by comparing them as `&str` with `<`. Null entries do
/// not take part in the comparison; `None` is returned when the array is
/// empty or every entry is null.
pub fn max_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
    min_max_helper::<&str, _, _>(
        array,
        // Holds when the first value orders strictly before the second.
        |lhs, rhs| lhs < rhs,
    )
}

/// Finds the largest value stored in a [`StringViewArray`].
///
/// Values are ordered by comparing them as `&str` with `<`. Null entries do
/// not take part in the comparison; `None` is returned when the array is
/// empty or every entry is null.
pub fn max_string_view(array: &StringViewArray) -> Option<&str> {
    min_max_helper::<&str, _, _>(
        array,
        // Holds when the first value orders strictly before the second.
        |lhs, rhs| lhs < rhs,
    )
}

/// Finds the smallest value stored in a [`GenericStringArray`].
///
/// Values are ordered by comparing them as `&str` with `>`. Null entries do
/// not take part in the comparison; `None` is returned when the array is
/// empty or every entry is null.
pub fn min_string<T: OffsetSizeTrait>(array: &GenericStringArray<T>) -> Option<&str> {
    min_max_helper::<&str, _, _>(
        array,
        // Holds when the first value orders strictly after the second.
        |lhs, rhs| lhs > rhs,
    )
}

/// Finds the smallest value stored in a [`StringViewArray`].
///
/// Values are ordered by comparing them as `&str` with `>`. Null entries do
/// not take part in the comparison; `None` is returned when the array is
/// empty or every entry is null.
pub fn min_string_view(array: &StringViewArray) -> Option<&str> {
    min_max_helper::<&str, _, _>(
        array,
        // Holds when the first value orders strictly after the second.
        |lhs, rhs| lhs > rhs,
    )
}

/// Returns the sum of values in the array.
///
/// This doesn't detect overflow. Once overflowing, the result will wrap around.
Expand Down Expand Up @@ -1132,61 +1152,137 @@ mod tests {
assert!(max(&a).unwrap().is_nan());
}

#[test]
fn test_binary_min_max_with_nulls() {
let a = BinaryArray::from(vec![
Some("b".as_bytes()),
// Expands to one `#[test]` function named `$test_name` that builds a
// `BinaryArray`, a `LargeBinaryArray` and a `BinaryViewArray` from the same
// `$values` expression and checks that each of them reports `$want_min` and
// `$want_max` as its minimum and maximum.
//
// `$values` is pasted once per array type, so it is evaluated three times.
macro_rules! test_binary {
    ($test_name:ident, $values:expr, $want_min:expr, $want_max:expr) => {
        #[test]
        fn $test_name() {
            // Offset-based binary array.
            let plain = BinaryArray::from($values);
            assert_eq!($want_min, min_binary(&plain));
            assert_eq!($want_max, max_binary(&plain));

            // Large-offset variant, served by the same generic functions.
            let large = LargeBinaryArray::from($values);
            assert_eq!($want_min, min_binary(&large));
            assert_eq!($want_max, max_binary(&large));

            // View variant, served by the dedicated `*_binary_view` functions.
            let view = BinaryViewArray::from($values);
            assert_eq!($want_min, min_binary_view(&view));
            assert_eq!($want_max, max_binary_view(&view));
        }
    };
}

test_binary!(
test_binary_min_max_with_nulls,
vec![
Some("b01234567890123".as_bytes()), // long bytes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please add coverage here so it compares more than one large string (and maybe update the comments to reflect why this is important)?

Perhaps something like

            Some("b01234XXXXXXXX".as_bytes()), // longer than 12 bytes, handled differently than StringView
            Some("b01234567890123".as_bytes()), 

Likewise for the string cases too

None,
None,
Some(b"a"),
Some(b"c"),
]);
assert_eq!(Some("a".as_bytes()), min_binary(&a));
assert_eq!(Some("c".as_bytes()), max_binary(&a));
}

#[test]
fn test_binary_min_max_no_null() {
let a = BinaryArray::from(vec![Some("b".as_bytes()), Some(b"a"), Some(b"c")]);
assert_eq!(Some("a".as_bytes()), min_binary(&a));
assert_eq!(Some("c".as_bytes()), max_binary(&a));
}
Some(b"abcdedfg0123456"),
],
Some("a".as_bytes()),
Some("c".as_bytes())
);

// No-null input mixing one-byte values with long ones. The expected minimum
// is itself a long value ("abcdefghijklmnopqrst"), while the expected maximum
// is the single byte "c", which orders after every value starting with "a" or "b".
test_binary!(
test_binary_min_max_no_null,
vec![
Some("b".as_bytes()),
Some(b"abcdefghijklmnopqrst"), // long bytes
Some(b"c"),
Some(b"b01234567890123"), // long bytes for view types
],
Some("abcdefghijklmnopqrst".as_bytes()),
Some("c".as_bytes())
);

#[test]
fn test_binary_min_max_all_nulls() {
let a = BinaryArray::from(vec![None, None]);
assert_eq!(None, min_binary(&a));
assert_eq!(None, max_binary(&a));
}
// All-null input: both the minimum and the maximum are expected to be `None`.
test_binary!(test_binary_min_max_all_nulls, vec![None, None], None, None);

#[test]
fn test_binary_min_max_1() {
let a = BinaryArray::from(vec![None, None, Some("b".as_bytes()), Some(b"a")]);
assert_eq!(Some("a".as_bytes()), min_binary(&a));
assert_eq!(Some("b".as_bytes()), max_binary(&a));
}

#[test]
fn test_string_min_max_with_nulls() {
let a = StringArray::from(vec![Some("b"), None, None, Some("a"), Some("c")]);
assert_eq!(Some("a"), min_string(&a));
assert_eq!(Some("c"), max_string(&a));
// Nulls interleaved with two long values that share the prefix "b0123". The
// expected maximum is decided at the first differing byte ('x' orders after
// '4'); the expected minimum is the short value "a".
test_binary!(
test_binary_min_max_1,
vec![
None,
Some("b01234567890123435".as_bytes()), // long bytes for view types
None,
Some(b"b0123xxxxxxxxxxx"),
Some(b"a")
],
Some("a".as_bytes()),
Some("b0123xxxxxxxxxxx".as_bytes())
);

// Expands to one `#[test]` function named `$test_name` that builds a
// `StringArray`, a `LargeStringArray` and a `StringViewArray` from the same
// `$values` expression and checks that each of them reports `$want_min` and
// `$want_max` as its minimum and maximum.
//
// `$values` is pasted once per array type, so it is evaluated three times.
macro_rules! test_string {
    ($test_name:ident, $values:expr, $want_min:expr, $want_max:expr) => {
        #[test]
        fn $test_name() {
            // Offset-based string array.
            let plain = StringArray::from($values);
            assert_eq!($want_min, min_string(&plain));
            assert_eq!($want_max, max_string(&plain));

            // Large-offset variant, served by the same generic functions.
            let large = LargeStringArray::from($values);
            assert_eq!($want_min, min_string(&large));
            assert_eq!($want_max, max_string(&large));

            // View variant, served by the dedicated `*_string_view` functions.
            let view = StringViewArray::from($values);
            assert_eq!($want_min, min_string_view(&view));
            assert_eq!($want_max, max_string_view(&view));
        }
    };
}

#[test]
fn test_string_min_max_all_nulls() {
let v: Vec<Option<&str>> = vec![None, None];
let a = StringArray::from(v);
assert_eq!(None, min_string(&a));
assert_eq!(None, max_string(&a));
}
// Nulls mixed with short and long values. The expected extremes are the short
// values "a" and "c", so both long "b..." values must lose every comparison.
test_string!(
test_string_min_max_with_nulls,
vec![
Some("b012345678901234"), // long bytes for view types
None,
None,
Some("a"),
Some("c"),
Some("b0123xxxxxxxxxxx")
],
Some("a"),
Some("c")
);

// No-null input where the expected maximum is a long value: "b012xxxxxxxxxxxx"
// beats "b012345678901234" at the first differing byte ('x' orders after '3').
// The expected minimum is the short value "a".
test_string!(
test_string_min_max_no_null,
vec![
Some("b"),
Some("b012345678901234"), // long bytes for view types
Some("a"),
Some("b012xxxxxxxxxxxx")
],
Some("a"),
Some("b012xxxxxxxxxxxx")
);

// All-null input: both the minimum and the maximum are expected to be `None`.
// The element type is spelled out explicitly; presumably bare `None`s alone
// do not let the compiler pick the array conversion (TODO confirm).
test_string!(
test_string_min_max_all_nulls,
Vec::<Option<&str>>::from_iter([None, None]),
None,
None
);

#[test]
fn test_string_min_max_1() {
let a = StringArray::from(vec![None, None, Some("b"), Some("a")]);
assert_eq!(Some("a"), min_string(&a));
assert_eq!(Some("b"), max_string(&a));
}
// Nulls interleaved with two long values that share the prefix "c1234". The
// expected maximum is decided at the first differing byte ('x' orders after
// '5'); the expected minimum is the short value "b".
test_string!(
test_string_min_max_1,
vec![
None,
Some("c12345678901234"), // long bytes for view types
None,
Some("b"),
Some("c1234xxxxxxxxxx")
],
Some("b"),
Some("c1234xxxxxxxxxx")
);

// Empty input (zero elements): both the minimum and the maximum are expected
// to be `None`.
test_string!(
test_string_min_max_empty,
Vec::<Option<&str>>::new(),
None,
None
);

#[test]
fn test_boolean_min_max_empty() {
Expand Down
Loading