Skip to content

Commit

Permalink
Boolean parquet get datapage stat (apache#11054)
Browse files Browse the repository at this point in the history
* test and implement boolean data page statistics

* left out a collect & forgot to change the Check to Both

* Update datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
2 people authored and findepi committed Jul 16, 2024
1 parent 3a8e406 commit 9265317
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
26 changes: 26 additions & 0 deletions datafusion/core/src/datasource/physical_plan/parquet/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,18 @@ macro_rules! make_data_page_stats_iterator {
};
}

// Invokes `make_data_page_stats_iterator!` for the boolean *minimum* statistic.
// Arguments, in order:
//   1. `MinBooleanDataPageStatsIterator` - the identifier handed to the macro
//      (a type of this name is later constructed via `::new(..)` when boolean
//      data page statistics are extracted).
//   2. A closure that reads the `min` field of a `PageIndex<bool>`.
//   3. `Index::BOOLEAN` - the page index variant this invocation is tied to.
//   4. `bool` - the statistic value type.
// NOTE(review): the macro body is not visible in this hunk; presumably the
// generated iterator yields one `Option<bool>` minimum per data page - confirm
// against the macro definition.
make_data_page_stats_iterator!(
MinBooleanDataPageStatsIterator,
|x: &PageIndex<bool>| { x.min },
Index::BOOLEAN,
bool
);
// Invokes `make_data_page_stats_iterator!` for the boolean *maximum* statistic.
// Arguments, in order:
//   1. `MaxBooleanDataPageStatsIterator` - the identifier handed to the macro
//      (a type of this name is later constructed via `::new(..)` when boolean
//      data page statistics are extracted).
//   2. A closure that reads the `max` field of a `PageIndex<bool>`.
//   3. `Index::BOOLEAN` - the page index variant this invocation is tied to.
//   4. `bool` - the statistic value type.
// NOTE(review): the macro body is not visible in this hunk; presumably the
// generated iterator yields one `Option<bool>` maximum per data page - confirm
// against the macro definition.
make_data_page_stats_iterator!(
MaxBooleanDataPageStatsIterator,
|x: &PageIndex<bool>| { x.max },
Index::BOOLEAN,
bool
);
make_data_page_stats_iterator!(
MinInt32DataPageStatsIterator,
|x: &PageIndex<i32>| { x.min },
Expand Down Expand Up @@ -613,6 +625,15 @@ macro_rules! get_data_page_statistics {
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
paste! {
match $data_type {
Some(DataType::Boolean) => Ok(Arc::new(
BooleanArray::from_iter(
[<$stat_type_prefix BooleanDataPageStatsIterator>]::new($iterator)
.flatten()
// BooleanArray::from_iter requires a sized iterator, so collect into a Vec first
.collect::<Vec<_>>()
.into_iter()
)
)),
Some(DataType::UInt8) => Ok(Arc::new(
UInt8Array::from_iter(
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
Expand Down Expand Up @@ -778,6 +799,11 @@ where
{
let iter = iterator.flat_map(|(len, index)| match index {
Index::NONE => vec![None; len],
Index::BOOLEAN(native_index) => native_index
.indexes
.iter()
.map(|x| x.null_count.map(|x| x as u64))
.collect::<Vec<_>>(),
Index::INT32(native_index) => native_index
.indexes
.iter()
Expand Down
2 changes: 1 addition & 1 deletion datafusion/core/tests/parquet/arrow_statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1953,7 +1953,7 @@ async fn test_boolean() {
expected_null_counts: UInt64Array::from(vec![1, 0]),
expected_row_counts: Some(UInt64Array::from(vec![5, 5])),
column_name: "bool",
check: Check::RowGroup,
check: Check::Both,
}
.run();
}
Expand Down

0 comments on commit 9265317

Please sign in to comment.