From a598739ad7ab9d135272d9633c562589b4ef7083 Mon Sep 17 00:00:00 2001
From: wiedld
Date: Sat, 27 Jul 2024 04:18:58 -0700
Subject: [PATCH] Ensure statistic defaults in parquet writers are in sync (#11656)

* test(11367): update tests to indicate that the different compression setting is expected
* chore(11367): update default settings for statistics_enabled
* test(11367): fix test cases, as we set the same variable for from_datafusion_defaults twice (in both the original DF options and in the builder too); it should only be set once
* test(11367): fix bug in the test case; should have set the arrow-rs setting to true (default is false)
* test(11367): fix test for fpp and ndv as defaults when the bloom filter is turned on
* test(11367): update readme and sqllogictests for updated default config
---
 datafusion/common/src/config.rs               |   2 +-
 .../common/src/file_options/parquet_writer.rs | 118 +++++-------------
 .../test_files/information_schema.slt         |   4 +-
 docs/source/user-guide/configs.md             |   2 +-
 4 files changed, 35 insertions(+), 91 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index c3082546b497..8af71d5abbb3 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -400,7 +400,7 @@ config_namespace! {
     /// Valid values are: "none", "chunk", and "page"
     /// These values are not case sensitive. If NULL, uses
     /// default parquet writer setting
-    pub statistics_enabled: Option<String>, default = None
+    pub statistics_enabled: Option<String>, default = Some("page".into())

     /// (writing) Sets max statistics size for any column. If NULL, uses
     /// default parquet writer setting
diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs
index e63a7e5ef78d..34b7379823f8 100644
--- a/datafusion/common/src/file_options/parquet_writer.rs
+++ b/datafusion/common/src/file_options/parquet_writer.rs
@@ -392,7 +392,7 @@ mod tests {
         ColumnOptions {
             compression: Some("zstd(22)".into()),
             dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
-            statistics_enabled: Some("page".into()),
+            statistics_enabled: Some("none".into()),
             max_statistics_size: Some(72),
             encoding: Some("RLE".into()),
             bloom_filter_enabled: Some(true),
@@ -614,23 +614,7 @@ mod tests {
             "should indicate that table_parquet_opts defaults came from datafusion",
         );

-        // Expected: the remaining should match
-        let same_created_by = default_table_writer_opts.global.created_by.clone();
-        let mut from_extern_parquet =
-            session_config_from_writer_props(&default_writer_props);
-        from_extern_parquet.global.created_by = same_created_by;
-        // TODO: the remaining defaults do not match!
-        // refer to https://github.com/apache/datafusion/issues/11367
-        assert_ne!(
-            default_table_writer_opts,
-            from_extern_parquet,
-            "the default writer_props should have the same configuration as the session's default TableParquetOptions",
-        );
-
-        // Below here itemizes how the defaults **should** match, but do not.
-
-        // TODO: compression defaults do not match
-        // refer to https://github.com/apache/datafusion/issues/11367
+        // Expected: the datafusion default compression is different from arrow-rs's parquet
         assert_eq!(
             default_writer_props.compression(&"default".into()),
             Compression::UNCOMPRESSED,
@@ -644,35 +628,13 @@ mod tests {
             "datafusion's default is zstd"
         );

-        // datafusion's `None` for Option<String> => becomes parquet's EnabledStatistics::Page
-        // TODO: should this be changed?
- // refer to https://github.com/apache/datafusion/issues/11367 - assert_eq!( - default_writer_props.statistics_enabled(&"default".into()), - EnabledStatistics::Page, - "extern parquet's default is page" - ); - assert_eq!( - default_table_writer_opts.global.statistics_enabled, None, - "datafusion's has no default" - ); - assert_eq!( - from_datafusion_defaults.statistics_enabled(&"default".into()), - EnabledStatistics::Page, - "should see the extern parquet's default over-riding datafusion's None", - ); - - // Confirm all other settings are equal. - // First resolve the known discrepancies, (set as the same). - // TODO: once we fix the above mis-matches, we should be able to remove this. + // Expected: the remaining should match + let same_created_by = default_table_writer_opts.global.created_by.clone(); let mut from_extern_parquet = session_config_from_writer_props(&default_writer_props); + from_extern_parquet.global.created_by = same_created_by; from_extern_parquet.global.compression = Some("zstd(3)".into()); - from_extern_parquet.global.statistics_enabled = None; - // Expected: the remaining should match - let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different - from_extern_parquet.global.created_by = same_created_by; // we expect these to be different assert_eq!( default_table_writer_opts, from_extern_parquet, @@ -685,31 +647,25 @@ mod tests { // the TableParquetOptions::default, with only the bloom filter turned on let mut default_table_writer_opts = TableParquetOptions::default(); default_table_writer_opts.global.bloom_filter_on_write = true; - - // the WriterProperties::default, with only the bloom filter turned on - let default_writer_props = WriterProperties::new(); let from_datafusion_defaults = WriterPropertiesBuilder::try_from(&default_table_writer_opts) .unwrap() - .set_bloom_filter_enabled(true) .build(); - // TODO: should have same behavior in either. - // refer to https://github.com/apache/datafusion/issues/11367 - assert_ne!( + // the WriterProperties::default, with only the bloom filter turned on + let default_writer_props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .build(); + + assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), from_datafusion_defaults.bloom_filter_properties(&"default".into()), - "parquet and datafusion props, will not have the same bloom filter props", + "parquet and datafusion props, should have the same bloom filter props", ); assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), - None, - "extern parquet's default remains None" - ); - assert_eq!( - from_datafusion_defaults.bloom_filter_properties(&"default".into()), Some(&BloomFilterProperties::default()), - "datafusion's has BloomFilterProperties::default", + "should use the default bloom filter props" ); } @@ -719,35 +675,29 @@ mod tests { let mut default_table_writer_opts = TableParquetOptions::default(); default_table_writer_opts.global.bloom_filter_on_write = true; default_table_writer_opts.global.bloom_filter_fpp = Some(0.42); - - // the WriterProperties::default, with only fpp set - let default_writer_props = WriterProperties::new(); let from_datafusion_defaults = WriterPropertiesBuilder::try_from(&default_table_writer_opts) .unwrap() - .set_bloom_filter_enabled(true) - .set_bloom_filter_fpp(0.42) .build(); - // TODO: should have same behavior in either. 
- // refer to https://github.com/apache/datafusion/issues/11367 - assert_ne!( + // the WriterProperties::default, with only fpp set + let default_writer_props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .set_bloom_filter_fpp(0.42) + .build(); + + assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), from_datafusion_defaults.bloom_filter_properties(&"default".into()), - "parquet and datafusion props, will not have the same bloom filter props", + "parquet and datafusion props, should have the same bloom filter props", ); assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), - None, - "extern parquet's default remains None" - ); - assert_eq!( - from_datafusion_defaults.bloom_filter_properties(&"default".into()), Some(&BloomFilterProperties { fpp: 0.42, ndv: DEFAULT_BLOOM_FILTER_NDV }), - "datafusion's has BloomFilterProperties", + "should have only the fpp set, and the ndv at default", ); } @@ -757,35 +707,29 @@ mod tests { let mut default_table_writer_opts = TableParquetOptions::default(); default_table_writer_opts.global.bloom_filter_on_write = true; default_table_writer_opts.global.bloom_filter_ndv = Some(42); - - // the WriterProperties::default, with only ndv set - let default_writer_props = WriterProperties::new(); let from_datafusion_defaults = WriterPropertiesBuilder::try_from(&default_table_writer_opts) .unwrap() - .set_bloom_filter_enabled(true) - .set_bloom_filter_ndv(42) .build(); - // TODO: should have same behavior in either. - // refer to https://github.com/apache/datafusion/issues/11367 - assert_ne!( + // the WriterProperties::default, with only ndv set + let default_writer_props = WriterProperties::builder() + .set_bloom_filter_enabled(true) + .set_bloom_filter_ndv(42) + .build(); + + assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), from_datafusion_defaults.bloom_filter_properties(&"default".into()), - "parquet and datafusion props, will not have the same bloom filter props", + "parquet and datafusion props, should have the same bloom filter props", ); assert_eq!( default_writer_props.bloom_filter_properties(&"default".into()), - None, - "extern parquet's default remains None" - ); - assert_eq!( - from_datafusion_defaults.bloom_filter_properties(&"default".into()), Some(&BloomFilterProperties { fpp: DEFAULT_BLOOM_FILTER_FPP, ndv: 42 }), - "datafusion's has BloomFilterProperties", + "should have only the ndv set, and the fpp at default", ); } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 431060a1f6f8..e85159fd137a 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -202,7 +202,7 @@ datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false datafusion.execution.parquet.reorder_filters false datafusion.execution.parquet.skip_metadata true -datafusion.execution.parquet.statistics_enabled NULL +datafusion.execution.parquet.statistics_enabled page datafusion.execution.parquet.write_batch_size 1024 datafusion.execution.parquet.writer_version 1.0 datafusion.execution.planning_concurrency 13 @@ -288,7 +288,7 @@ datafusion.execution.parquet.pruning true (reading) If true, the parquet reader datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. 
This optimization is sometimes called "late materialization". datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata -datafusion.execution.parquet.statistics_enabled NULL (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting +datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0" datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index e992361755d3..5814d88c7dd8 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -62,7 +62,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. | | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | -| datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 40.0.0 | (writing) Sets "created by" property |
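
Reviewer note (not part of the patch): a minimal standalone sketch of what the new default means in practice, assuming `datafusion-common` and `parquet` (arrow-rs) as dependencies at a version that includes this change; the column name "col" below is arbitrary.

    // Sketch only: with statistics_enabled now defaulting to Some("page"),
    // converting DataFusion's TableParquetOptions into parquet-rs WriterProperties
    // should land on the same statistics level that parquet-rs itself uses by
    // default (EnabledStatistics::Page).
    use datafusion_common::config::TableParquetOptions;
    use parquet::file::properties::{
        EnabledStatistics, WriterProperties, WriterPropertiesBuilder,
    };

    fn main() {
        let df_opts = TableParquetOptions::default();
        assert_eq!(df_opts.global.statistics_enabled, Some("page".to_string()));

        // DataFusion defaults, converted via the TryFrom impl exercised in the tests above.
        let from_df = WriterPropertiesBuilder::try_from(&df_opts).unwrap().build();
        // parquet-rs defaults, untouched.
        let parquet_defaults = WriterProperties::builder().build();

        assert_eq!(
            from_df.statistics_enabled(&"col".into()),
            parquet_defaults.statistics_enabled(&"col".into()),
        );
        assert_eq!(
            from_df.statistics_enabled(&"col".into()),
            EnabledStatistics::Page,
        );
    }

The compression default intentionally remains different (DataFusion keeps zstd(3) while parquet-rs stays uncompressed), which is why the updated test above still overrides `compression` before comparing the two sets of defaults.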