Skip to content

Commit

Permalink
Revert changes to parquet writing, update comments
Browse files Browse the repository at this point in the history
  • Loading branch information
alamb committed Aug 15, 2024
1 parent a23cf8b commit 7d4e650
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 27 deletions.
19 changes: 18 additions & 1 deletion parquet/src/file/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ pub fn from_thrift(
Ok(match thrift_stats {
Some(stats) => {
// Number of nulls recorded, when it is not available, we just mark it as 0.
// TODO this should be `None` if there is no information about NULLS.
// see https://github.com/apache/arrow-rs/pull/6216/files
let null_count = stats.null_count.unwrap_or(0);

if null_count < 0 {
Expand Down Expand Up @@ -242,10 +244,19 @@ pub fn from_thrift(
pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
let stats = stats?;

// record null counts if greater than zero.
//
// TODO: This should be Some(0) if there are no nulls.
// see https://github.com/apache/arrow-rs/pull/6216/files
let null_count = stats
.null_count_opt()
.map(|value| value as i64)
.filter(|&x| x > 0);

let mut thrift_stats = TStatistics {
max: None,
min: None,
null_count: stats.null_count_opt().map(|value| value as i64),
null_count,
distinct_count: stats.distinct_count().map(|value| value as i64),
max_value: None,
min_value: None,
Expand Down Expand Up @@ -375,6 +386,8 @@ impl Statistics {

/// Returns number of null values for the column.
/// Note that this includes all nulls when column is part of the complex type.
///
/// Note this API returns 0 if the null count is not available.
#[deprecated(since = "53.0.0", note = "Use `null_count_opt` method instead")]
pub fn null_count(&self) -> u64 {
// 0 to remain consistent behavior prior to `null_count_opt`
Expand All @@ -390,6 +403,10 @@ impl Statistics {

/// Returns number of null values for the column, if known.
/// Note that this includes all nulls when column is part of the complex type.
///
/// Note this API returns Some(0) even if the null count was not present
/// in the statistics.
/// See <https://github.com/apache/arrow-rs/pull/6216/files>
pub fn null_count_opt(&self) -> Option<u64> {
statistics_enum_func![self, null_count_opt]
}
Expand Down
52 changes: 26 additions & 26 deletions parquet/tests/arrow_writer_layout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ fn test_primitive() {
pages: (0..8)
.map(|_| Page {
rows: 250,
page_header_size: 38,
page_header_size: 36,
compressed_size: 1000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
Expand Down Expand Up @@ -218,22 +218,22 @@ fn test_primitive() {
pages: vec![
Page {
rows: 250,
page_header_size: 38,
page_header_size: 36,
compressed_size: 258,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 1750,
page_header_size: 38,
page_header_size: 36,
compressed_size: 7000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
},
],
dictionary_page: Some(Page {
rows: 250,
page_header_size: 38,
page_header_size: 36,
compressed_size: 1000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
Expand All @@ -260,50 +260,50 @@ fn test_primitive() {
pages: vec![
Page {
rows: 400,
page_header_size: 38,
page_header_size: 36,
compressed_size: 452,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 370,
page_header_size: 38,
page_header_size: 36,
compressed_size: 472,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 240,
page_header_size: 38,
page_header_size: 36,
compressed_size: 332,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
],
dictionary_page: Some(Page {
rows: 2000,
page_header_size: 38,
page_header_size: 36,
compressed_size: 8000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
Expand All @@ -329,7 +329,7 @@ fn test_primitive() {
pages: (0..20)
.map(|_| Page {
rows: 100,
page_header_size: 38,
page_header_size: 36,
compressed_size: 400,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
Expand Down Expand Up @@ -364,14 +364,14 @@ fn test_string() {
pages: (0..15)
.map(|_| Page {
rows: 130,
page_header_size: 38,
page_header_size: 36,
compressed_size: 1040,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
})
.chain(std::iter::once(Page {
rows: 50,
page_header_size: 37,
page_header_size: 35,
compressed_size: 400,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
Expand Down Expand Up @@ -400,29 +400,29 @@ fn test_string() {
pages: vec![
Page {
rows: 130,
page_header_size: 38,
page_header_size: 36,
compressed_size: 138,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 1250,
page_header_size: 40,
page_header_size: 38,
compressed_size: 10000,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 620,
page_header_size: 38,
page_header_size: 36,
compressed_size: 4960,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
},
],
dictionary_page: Some(Page {
rows: 130,
page_header_size: 38,
page_header_size: 36,
compressed_size: 1040,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
Expand All @@ -449,50 +449,50 @@ fn test_string() {
pages: vec![
Page {
rows: 400,
page_header_size: 38,
page_header_size: 36,
compressed_size: 452,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 370,
page_header_size: 38,
page_header_size: 36,
compressed_size: 472,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 330,
page_header_size: 38,
page_header_size: 36,
compressed_size: 464,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
Page {
rows: 240,
page_header_size: 38,
page_header_size: 36,
compressed_size: 332,
encoding: Encoding::RLE_DICTIONARY,
page_type: PageType::DATA_PAGE,
},
],
dictionary_page: Some(Page {
rows: 2000,
page_header_size: 38,
page_header_size: 36,
compressed_size: 16000,
encoding: Encoding::PLAIN,
page_type: PageType::DICTIONARY_PAGE,
Expand Down Expand Up @@ -532,7 +532,7 @@ fn test_list() {
pages: (0..10)
.map(|_| Page {
rows: 20,
page_header_size: 38,
page_header_size: 36,
compressed_size: 672,
encoding: Encoding::PLAIN,
page_type: PageType::DATA_PAGE,
Expand Down

0 comments on commit 7d4e650

Please sign in to comment.