diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index a14cbdecf601..dd4bb8ce505e 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -369,7 +369,13 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result => becomes parquet's true + // TODO: should this be changed? + // refer to https://github.com/apache/datafusion/issues/11367 + assert!( + default_writer_props.dictionary_enabled(&"default".into()), + "extern parquet's default is true" + ); + assert_eq!( + default_table_writer_opts.global.dictionary_enabled, None, + "datafusion's has no default" + ); + assert!( + from_datafusion_defaults.dictionary_enabled(&"default".into()), + "should see the extern parquet's default over-riding datafusion's None", + ); + + // datafusion's `None` for Option => becomes parquet's EnabledStatistics::Page + // TODO: should this be changed? + // refer to https://github.com/apache/datafusion/issues/11367 + assert_eq!( + default_writer_props.statistics_enabled(&"default".into()), + EnabledStatistics::Page, + "extern parquet's default is page" + ); + assert_eq!( + default_table_writer_opts.global.statistics_enabled, None, + "datafusion's has no default" + ); + assert_eq!( + from_datafusion_defaults.statistics_enabled(&"default".into()), + EnabledStatistics::Page, + "should see the extern parquet's default over-riding datafusion's None", + ); + + // datafusion's `None` for Option => becomes parquet's 4096 + // TODO: should this be changed? + // refer to https://github.com/apache/datafusion/issues/11367 + assert_eq!( + default_writer_props.max_statistics_size(&"default".into()), + 4096, + "extern parquet's default is 4096" + ); + assert_eq!( + default_table_writer_opts.global.max_statistics_size, None, + "datafusion's has no default" + ); + assert_eq!( + default_writer_props.max_statistics_size(&"default".into()), + 4096, + "should see the extern parquet's default over-riding datafusion's None", + ); + + // Confirm all other settings are equal. + // First resolve the known discrepancies, (set as the same). + // TODO: once we fix the above mis-matches, we should be able to remove this. + let mut from_extern_parquet = + session_config_from_writer_props(&default_writer_props); + from_extern_parquet.global.compression = Some("zstd(3)".into()); + from_extern_parquet.global.data_page_row_count_limit = usize::MAX; + from_extern_parquet.global.column_index_truncate_length = None; + from_extern_parquet.global.dictionary_enabled = None; + from_extern_parquet.global.statistics_enabled = None; + from_extern_parquet.global.max_statistics_size = None; + + // Expected: the remaining should match + let same_created_by = default_table_writer_opts.global.created_by.clone(); // we expect these to be different + from_extern_parquet.global.created_by = same_created_by; // we expect these to be different + assert_eq!( + default_table_writer_opts, + from_extern_parquet, + "the default writer_props should have the same configuration as the session's default TableParquetOptions", + ); + } + + #[test] + fn test_bloom_filter_defaults() { + // the TableParquetOptions::default, with only the bloom filter turned on + let mut default_table_writer_opts = TableParquetOptions::default(); + default_table_writer_opts.global.bloom_filter_on_write = true; + + // the WriterProperties::default, with only the bloom filter turned on + let default_writer_props = WriterProperties::new(); + let from_datafusion_defaults = + WriterPropertiesBuilder::try_from(&default_table_writer_opts) + .unwrap() + .set_bloom_filter_enabled(true) + .build(); + + // TODO: should have same behavior in either. + // refer to https://github.com/apache/datafusion/issues/11367 + assert_ne!( + default_writer_props.bloom_filter_properties(&"default".into()), + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + "parquet and datafusion props, will not have the same bloom filter props", + ); + assert_eq!( + default_writer_props.bloom_filter_properties(&"default".into()), + None, + "extern parquet's default remains None" + ); + assert_eq!( + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + Some(&BloomFilterProperties::default()), + "datafusion's has BloomFilterProperties::default", + ); + } + + #[test] + fn test_bloom_filter_set_fpp_only() { + // the TableParquetOptions::default, with only fpp set + let mut default_table_writer_opts = TableParquetOptions::default(); + default_table_writer_opts.global.bloom_filter_on_write = true; + default_table_writer_opts.global.bloom_filter_fpp = Some(0.42); + + // the WriterProperties::default, with only fpp set + let default_writer_props = WriterProperties::new(); + let from_datafusion_defaults = + WriterPropertiesBuilder::try_from(&default_table_writer_opts) + .unwrap() + .set_bloom_filter_enabled(true) + .set_bloom_filter_fpp(0.42) + .build(); + + // TODO: should have same behavior in either. + // refer to https://github.com/apache/datafusion/issues/11367 + assert_ne!( + default_writer_props.bloom_filter_properties(&"default".into()), + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + "parquet and datafusion props, will not have the same bloom filter props", + ); + assert_eq!( + default_writer_props.bloom_filter_properties(&"default".into()), + None, + "extern parquet's default remains None" + ); + assert_eq!( + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + Some(&BloomFilterProperties { + fpp: 0.42, + ndv: DEFAULT_BLOOM_FILTER_NDV + }), + "datafusion's has BloomFilterProperties", + ); + } + + #[test] + fn test_bloom_filter_set_ndv_only() { + // the TableParquetOptions::default, with only ndv set + let mut default_table_writer_opts = TableParquetOptions::default(); + default_table_writer_opts.global.bloom_filter_on_write = true; + default_table_writer_opts.global.bloom_filter_ndv = Some(42); + + // the WriterProperties::default, with only ndv set + let default_writer_props = WriterProperties::new(); + let from_datafusion_defaults = + WriterPropertiesBuilder::try_from(&default_table_writer_opts) + .unwrap() + .set_bloom_filter_enabled(true) + .set_bloom_filter_ndv(42) + .build(); + + // TODO: should have same behavior in either. + // refer to https://github.com/apache/datafusion/issues/11367 + assert_ne!( + default_writer_props.bloom_filter_properties(&"default".into()), + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + "parquet and datafusion props, will not have the same bloom filter props", + ); + assert_eq!( + default_writer_props.bloom_filter_properties(&"default".into()), + None, + "extern parquet's default remains None" + ); + assert_eq!( + from_datafusion_defaults.bloom_filter_properties(&"default".into()), + Some(&BloomFilterProperties { + fpp: DEFAULT_BLOOM_FILTER_FPP, + ndv: 42 + }), + "datafusion's has BloomFilterProperties", + ); + } }