From f2de2cd3c8a8496e1f96e58b50786227bfe915f7 Mon Sep 17 00:00:00 2001
From: Ed Seidl
Date: Fri, 2 Aug 2024 11:05:01 -0700
Subject: [PATCH] No longer write Parquet column metadata after column chunks
 *and* in the footer (#6117)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight` (#6041)

* bump `tonic` to 0.12 and `prost` to 0.13 for `arrow-flight`

Signed-off-by: Bugen Zhao

* fix example tests

Signed-off-by: Bugen Zhao

---------

Signed-off-by: Bugen Zhao

* Remove `impl<T: AsRef<[u8]>> From<T> for Buffer` that easily accidentally copies data (#6043)

* deprecate auto copy, ask explicit reference

* update comments

* make cargo doc happy

* Make display of interval types more pretty (#6006)

* improve display for interval.

* update test in pretty, and fix display problem.

* tmp

* fix tests in arrow-cast.

* fix tests in pretty.

* fix style.

* Update snafu (#5930)

* Update Parquet thrift generated structures (#6045)

* update to latest thrift (as of 11 Jul 2024) from parquet-format

* pass None for optional size statistics

* escape HTML tags

* don't need to escape brackets in arrays

* Revert "Revert "Write Bloom filters between row groups instead of the end (#…" (#5933)

This reverts commit 22e0b4432c9838f2536284015271d3de9a165135.

* Revert "Update snafu (#5930)" (#6069)

This reverts commit 756b1fb26d1702f36f446faf9bb40a4869c3e840.

* Update pyo3 requirement from 0.21.1 to 0.22.1 (fixed) (#6075)

* Update pyo3 requirement from 0.21.1 to 0.22.1

Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version.
- [Release notes](https://github.com/pyo3/pyo3/releases)
- [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md)
- [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.1)

---
updated-dependencies:
- dependency-name: pyo3
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot]

* refactor: remove deprecated `FromPyArrow::from_pyarrow`

"GIL Refs" are being phased out.

* chore: update `pyo3` in integration tests

---------

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* remove repeated code to make the code more concise. (#6080)

* Add `unencoded_byte_array_data_bytes` to `ParquetMetaData` (#6068)

* update to latest thrift (as of 11 Jul 2024) from parquet-format

* pass None for optional size statistics

* escape HTML tags

* don't need to escape brackets in arrays

* add support for unencoded_byte_array_data_bytes

* add comments

* change sig of ColumnMetrics::update_variable_length_bytes()

* rename ParquetOffsetIndex to OffsetSizeIndex

* rename some functions

* suggestion from review

Co-authored-by: Andrew Lamb

* add Default trait to ColumnMetrics as suggested in review

* rename OffsetSizeIndex to OffsetIndexMetaData

---------

Co-authored-by: Andrew Lamb

* Update pyo3 requirement from 0.21.1 to 0.22.2 (#6085)

Updates the requirements on [pyo3](https://github.com/pyo3/pyo3) to permit the latest version.
- [Release notes](https://github.com/pyo3/pyo3/releases)
- [Changelog](https://github.com/PyO3/pyo3/blob/v0.22.2/CHANGELOG.md)
- [Commits](https://github.com/pyo3/pyo3/compare/v0.21.1...v0.22.2)

---
updated-dependencies:
- dependency-name: pyo3
  dependency-type: direct:production
...
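Background for the diff that follows: the Parquet `ColumnChunk` thrift struct permits `ColumnMetaData` to be serialized twice, once trailing the chunk's pages (located via `file_offset`) and once embedded in the footer's row group list. Readers in practice only consult the footer copy, so the writer now emits only that copy and leaves the deprecated `file_offset` at 0. A minimal roundtrip sketch against the post-patch public API (the in-memory buffer and column name `v` are illustrative, not from the patch):

```rust
use std::sync::Arc;

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use bytes::Bytes;
use parquet::arrow::ArrowWriter;
use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Write a tiny file entirely in memory.
    let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_from_iter([("v", col)])?;
    let mut buf = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut buf, batch.schema(), None)?;
    writer.write(&batch)?;
    writer.close()?;

    // Read it back: all ColumnChunkMetaData is served from the footer.
    let reader = SerializedFileReader::new(Bytes::from(buf))?;
    let meta = reader.metadata();
    let chunk = meta.row_group(0).column(0);
    // With this change the writer leaves the deprecated field at 0.
    assert_eq!(chunk.file_offset(), 0);
    Ok(())
}
```

Files shrink slightly, and readers should be unaffected since they never dereferenced the trailing copy.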
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

* Deprecate read_page_locations() and simplify offset index in `ParquetMetaData` (#6095)

* deprecate read_page_locations

* add to_thrift() to OffsetIndexMetaData

* no longer write inline column metadata

* Update parquet/src/column/writer/mod.rs

Co-authored-by: Ed Seidl

* suggestion from review

Co-authored-by: Andrew Lamb

* add some more documentation

* remove write_metadata from PageWriter

---------

Signed-off-by: Bugen Zhao
Signed-off-by: dependabot[bot]
Co-authored-by: Bugen Zhao
Co-authored-by: Xiangpeng Hao
Co-authored-by: kamille
Co-authored-by: Jesse
Co-authored-by: Andrew Lamb
Co-authored-by: Marco Neumann
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 parquet/src/arrow/arrow_writer/mod.rs |  7 +------
 parquet/src/column/page.rs            |  8 +-------
 parquet/src/column/writer/mod.rs      | 18 +++---------------
 parquet/src/file/metadata/mod.rs      | 16 ++++++++++++++--
 parquet/src/file/writer.rs            | 12 ------------
 5 files changed, 19 insertions(+), 42 deletions(-)

diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 8f7b514ccf71..db5d4502cfec 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -43,7 +43,7 @@ use crate::column::writer::{
 };
 use crate::data_type::{ByteArray, FixedLenByteArray};
 use crate::errors::{ParquetError, Result};
-use crate::file::metadata::{ColumnChunkMetaData, KeyValue, RowGroupMetaData};
+use crate::file::metadata::{KeyValue, RowGroupMetaData};
 use crate::file::properties::{WriterProperties, WriterPropertiesPtr};
 use crate::file::reader::{ChunkReader, Length};
 use crate::file::writer::{SerializedFileWriter, SerializedRowGroupWriter};
@@ -489,11 +489,6 @@ impl PageWriter for ArrowPageWriter {
         Ok(spec)
     }
 
-    fn write_metadata(&mut self, _metadata: &ColumnChunkMetaData) -> Result<()> {
-        // Skip writing metadata as won't be copied anyway
-        Ok(())
-    }
-
     fn close(&mut self) -> Result<()> {
         Ok(())
     }
diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs
index 947a633f48a2..585d1951c14a 100644
--- a/parquet/src/column/page.rs
+++ b/parquet/src/column/page.rs
@@ -21,7 +21,7 @@ use bytes::Bytes;
 
 use crate::basic::{Encoding, PageType};
 use crate::errors::{ParquetError, Result};
-use crate::file::{metadata::ColumnChunkMetaData, statistics::Statistics};
+use crate::file::statistics::Statistics;
 use crate::format::PageHeader;
 
 /// Parquet Page definition.
@@ -350,12 +350,6 @@ pub trait PageWriter: Send {
     /// either data page or dictionary page.
     fn write_page(&mut self, page: CompressedPage) -> Result<PageWriteSpec>;
 
-    /// Writes column chunk metadata into the output stream/sink.
-    ///
-    /// This method is called once before page writer is closed, normally when writes are
-    /// finalised in column writer.
-    fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()>;
-
     /// Closes resources and flushes underlying sink.
     /// Page writer should not be used after this method is called.
     fn close(&mut self) -> Result<()>;
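With `write_metadata` gone from the trait above, a custom `PageWriter` now only supplies `write_page` and `close`. A sketch of a post-patch implementation, patterned after the crate's in-tree test writer; `DiscardingPageWriter` is a hypothetical sink that records page sizes and drops the bytes:

```rust
use parquet::column::page::{CompressedPage, PageWriteSpec, PageWriter};
use parquet::errors::Result;

/// Hypothetical sink: measures encoded pages without storing them.
struct DiscardingPageWriter {
    bytes_written: u64,
}

impl PageWriter for DiscardingPageWriter {
    fn write_page(&mut self, page: CompressedPage) -> Result<PageWriteSpec> {
        let mut spec = PageWriteSpec::new();
        spec.page_type = page.page_type();
        spec.uncompressed_size = page.uncompressed_size();
        spec.compressed_size = page.compressed_size();
        spec.num_values = page.num_values();
        spec.offset = 0;
        spec.bytes_written = page.data().len() as u64;
        self.bytes_written += spec.bytes_written;
        Ok(spec)
    }

    // No `write_metadata` any more: column chunk metadata is serialized
    // once, into the file footer, by the file writer itself.
    fn close(&mut self) -> Result<()> {
        Ok(())
    }
}
```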
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 54d8fd3cc13e..519a219f943d 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -579,7 +579,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             self.write_dictionary_page()?;
         }
         self.flush_data_pages()?;
-        let metadata = self.write_column_metadata()?;
+        let metadata = self.build_column_metadata()?;
         self.page_writer.close()?;
 
         let boundary_order = match (
@@ -1041,8 +1041,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         Ok(())
     }
 
-    /// Assembles and writes column chunk metadata.
-    fn write_column_metadata(&mut self) -> Result<ColumnChunkMetaData> {
+    /// Assembles column chunk metadata.
+    fn build_column_metadata(&mut self) -> Result<ColumnChunkMetaData> {
         let total_compressed_size = self.column_metrics.total_compressed_size as i64;
         let total_uncompressed_size = self.column_metrics.total_uncompressed_size as i64;
         let num_values = self.column_metrics.total_num_values as i64;
@@ -1050,15 +1050,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         // If data page offset is not set, then no pages have been written
         let data_page_offset = self.column_metrics.data_page_offset.unwrap_or(0) as i64;
 
-        let file_offset = match dict_page_offset {
-            Some(dict_offset) => dict_offset + total_compressed_size,
-            None => data_page_offset + total_compressed_size,
-        };
-
         let mut builder = ColumnChunkMetaData::builder(self.descr.clone())
             .set_compression(self.codec)
             .set_encodings(self.encodings.iter().cloned().collect())
-            .set_file_offset(file_offset)
             .set_total_compressed_size(total_compressed_size)
             .set_total_uncompressed_size(total_uncompressed_size)
             .set_num_values(num_values)
@@ -1138,8 +1132,6 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
         }
 
         let metadata = builder.build()?;
 
-        self.page_writer.write_metadata(&metadata)?;
-
         Ok(metadata)
     }
@@ -3589,10 +3581,6 @@ mod tests {
             Ok(res)
         }
 
-        fn write_metadata(&mut self, _metadata: &ColumnChunkMetaData) -> Result<()> {
-            Ok(())
-        }
-
        fn close(&mut self) -> Result<()> {
            Ok(())
        }
diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs
index d99cd951037c..86c673bbdbc6 100644
--- a/parquet/src/file/metadata/mod.rs
+++ b/parquet/src/file/metadata/mod.rs
@@ -682,7 +682,12 @@ impl ColumnChunkMetaData {
         self.file_path.as_deref()
     }
 
-    /// Byte offset in `file_path()`.
+    /// Byte offset of `ColumnMetaData` in `file_path()`.
+    ///
+    /// Note that the meaning of this field has been inconsistent between implementations,
+    /// so its use has since been deprecated in the Parquet specification. Modern
+    /// implementations will set this to `0` to indicate that the `ColumnMetaData` is
+    /// solely contained in the `ColumnChunk` struct.
     pub fn file_offset(&self) -> i64 {
         self.file_offset
     }
@@ -1040,6 +1045,14 @@ impl ColumnChunkMetaDataBuilder {
     }
 
     /// Sets file offset in bytes.
+    ///
+    /// This field was meant to provide an alternative to storing `ColumnMetadata` directly in
+    /// the `ColumnChunkMetadata`. However, most Parquet readers assume the `ColumnMetadata` is
+    /// stored inline and ignore this field.
+    #[deprecated(
+        since = "53.0.0",
+        note = "The Parquet specification requires this field to be 0"
+    )]
     pub fn set_file_offset(mut self, value: i64) -> Self {
         self.0.file_offset = value;
         self
@@ -1453,7 +1466,6 @@ mod tests {
         let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
             .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
             .set_file_path("file_path".to_owned())
-            .set_file_offset(100)
             .set_num_values(1000)
             .set_compression(Compression::SNAPPY)
             .set_total_compressed_size(2000)
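For code that assembles `ColumnChunkMetaData` by hand, the migration is simply to drop the deprecated call: the field defaults to 0, which is what the specification now requires. A minimal sketch against the builder API shown above; the setter values mirror the adjacent test, and `descr` is assumed to be supplied from the file's schema descriptor:

```rust
use parquet::basic::{Compression, Encoding};
use parquet::errors::Result;
use parquet::file::metadata::ColumnChunkMetaData;
use parquet::schema::types::ColumnDescPtr;

fn build_chunk_meta(descr: ColumnDescPtr) -> Result<ColumnChunkMetaData> {
    ColumnChunkMetaData::builder(descr)
        .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
        .set_compression(Compression::SNAPPY)
        .set_num_values(1000)
        .set_total_compressed_size(2000)
        .set_total_uncompressed_size(3000)
        // .set_file_offset(100) // now #[deprecated]; readers ignore it
        .build()
}
```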
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 89aaf028d1b9..bcf8449b4420 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -649,13 +649,10 @@ impl<'a, W: Write + Send> SerializedRowGroupWriter<'a, W> {
             ));
         }
 
-        let file_offset = self.buf.bytes_written() as i64;
-
         let map_offset = |x| x - src_offset + write_offset as i64;
         let mut builder = ColumnChunkMetaData::builder(metadata.column_descr_ptr())
             .set_compression(metadata.compression())
             .set_encodings(metadata.encodings().clone())
-            .set_file_offset(file_offset)
             .set_total_compressed_size(metadata.compressed_size())
             .set_total_uncompressed_size(metadata.uncompressed_size())
             .set_num_values(metadata.num_values())
@@ -680,7 +677,6 @@
             }
         }
 
-        SerializedPageWriter::new(self.buf).write_metadata(&metadata)?;
         let (_, on_close) = self.get_on_close();
         on_close(close)
     }
@@ -808,14 +804,6 @@ impl<'a, W: Write + Send> PageWriter for SerializedPageWriter<'a, W> {
         Ok(spec)
     }
 
-    fn write_metadata(&mut self, metadata: &ColumnChunkMetaData) -> Result<()> {
-        let mut protocol = TCompactOutputProtocol::new(&mut self.sink);
-        metadata
-            .to_column_metadata_thrift()
-            .write_to_out_protocol(&mut protocol)?;
-        Ok(())
-    }
-
     fn close(&mut self) -> Result<()> {
         self.sink.flush()?;
         Ok(())
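With the inline copy gone, a column chunk's bytes are located purely from the footer metadata: start at the dictionary page when present, otherwise at the first data page, and extend by the total compressed size. This is what the crate's existing `ColumnChunkMetaData::byte_range` computes, and why dropping `file_offset` loses no information. A sketch of that derivation (the helper name `chunk_byte_range` is illustrative):

```rust
use parquet::file::metadata::ColumnChunkMetaData;

/// Returns (start, length) of a chunk's bytes, mirroring
/// `ColumnChunkMetaData::byte_range`, with no use of `file_offset`.
fn chunk_byte_range(col: &ColumnChunkMetaData) -> (u64, u64) {
    let start = match col.dictionary_page_offset() {
        Some(dict_offset) => dict_offset as u64,
        None => col.data_page_offset() as u64,
    };
    (start, col.compressed_size() as u64)
}
```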