From 5e27cb18322359eed9512d29d50dbc9c7cfc87d7 Mon Sep 17 00:00:00 2001
From: kf zheng <100595273+kev1n8@users.noreply.github.com>
Date: Fri, 2 Aug 2024 15:55:12 +0800
Subject: [PATCH 1/6] Add StringView and BinaryView support for the macro
`get_statistics`
---
parquet/src/arrow/arrow_reader/statistics.rs | 114 +++++++++++++++++--
1 file changed, 106 insertions(+), 8 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs
index 6a1434bce906..f7af5e86fbda 100644
--- a/parquet/src/arrow/arrow_reader/statistics.rs
+++ b/parquet/src/arrow/arrow_reader/statistics.rs
@@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
use crate::file::statistics::Statistics as ParquetStatistics;
use crate::schema::types::SchemaDescriptor;
use arrow_array::builder::{
- BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
+ BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
+ StringViewBuilder,
};
use arrow_array::{
new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
@@ -446,14 +447,43 @@ macro_rules! get_statistics {
},
DataType::Dictionary(_, value_type) => {
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
+ },
+ DataType::Utf8View => {
+ let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
+ let mut builder = StringViewBuilder::new();
+ for x in iterator {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ let Ok(x) = std::str::from_utf8(x) else {
+ builder.append_null();
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ Ok(Arc::new(builder.finish()))
+ },
+ DataType::BinaryView => {
+ let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
+ let mut builder = BinaryViewBuilder::new();
+ for x in iterator {
+ let Some(x) = x else {
+ builder.append_null(); // no statistics value
+ continue;
+ };
+
+ builder.append_value(x);
+ }
+ Ok(Arc::new(builder.finish()))
}
DataType::Map(_,_) |
DataType::Duration(_) |
DataType::Interval(_) |
DataType::Null |
- DataType::BinaryView |
- DataType::Utf8View |
DataType::List(_) |
DataType::ListView(_) |
DataType::FixedSizeList(_, _) |
@@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
}
})
},
- Some(DataType::FixedSizeBinary(size)) => {
+ Some(DataType::FixedSizeBinary(size)) => {
let mut builder = FixedSizeBinaryBuilder::new(*size);
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
for x in iterator {
@@ -1499,10 +1529,10 @@ mod test {
use arrow::datatypes::{i256, Date32Type, Date64Type};
use arrow::util::test_util::parquet_test_data;
use arrow_array::{
- new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
- Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array,
- Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray,
- TimestampNanosecondArray,
+ new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
+ BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array,
+ Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch,
+ StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
};
use arrow_schema::{DataType, Field, SchemaRef};
use bytes::Bytes;
@@ -1916,6 +1946,59 @@ mod test {
.run()
}
+ #[test]
+ fn roundtrip_string_view() {
+ Test {
+ input: string_view_array([
+ // row group 1: non-null values "A" and "Q" -> expected min "A", max "Q"
+ Some("A"),
+ None,
+ Some("Q"),
+ // row group 2: non-null values "ZZ" and "AA" -> expected min "AA", max "ZZ"
+ Some("ZZ"),
+ Some("AA"),
+ None,
+ // row group 3: all nulls -> expected min and max are both null
+ None,
+ None,
+ None,
+ ]),
+ expected_min: string_view_array([Some("A"), Some("AA"), None]),
+ expected_max: string_view_array([Some("Q"), Some("ZZ"), None]),
+ }
+ .run()
+ }
+
+ #[test]
+ fn roundtrip_binary_view() {
+ let input: Vec