From 7b44d31367675b2168c9e6420d2104583b414cd2 Mon Sep 17 00:00:00 2001
From: Christian Thiel
Date: Wed, 2 Oct 2024 08:46:13 +0200
Subject: [PATCH] Merge Builder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 2c9d7e4855ad6ab7269b2ff5b3b3b180f7dcbd2d
Author: Christian Thiel
Date: Wed Oct 2 07:51:50 2024 +0200
    New builder

commit fea1817c45e4da4c7d4a84bb2370aee6c0de1185
Author: Christian Thiel
Date: Tue Oct 1 12:56:10 2024 +0200
    Merge branch 'feat/safe-partition-spec'

commit 1f49c957ab75e7667222d1b9c0d582c020af4014
Author: Christian Thiel
Date: Tue Oct 1 12:55:02 2024 +0200
    Merge branch 'feat/schema-reassign-field-ids'

commit cda4a0c595af2606e2f4076e9ef81d79d4428f4b
Author: congyi wang <58715567+wcy-fdu@users.noreply.github.com>
Date: Mon Sep 30 11:07:36 2024 +0800
    chore: fix typo in FileIO Schemes (#653)
    * fix typo
    * fix typo

commit af9609d12010295299868a6b7137441b9b5ad40f
Author: Scott Donnelly
Date: Mon Sep 30 04:06:14 2024 +0100
    fix: page index evaluator min/max args inverted (#648)
    * fix: page index evaluator min/max args inverted
    * style: fix clippy lint in test

commit a6a3fd779dfe895e02885803f4f82f6b39407732
Author: Alon Agmon <54080741+a-agmon@users.noreply.github.com>
Date: Sat Sep 28 10:10:08 2024 +0300
    test (datafusion): add test for table provider creation (#651)
    * add test for table provider creation
    * fix formatting
    * fixing yet another formatting issue
    * testing schema using data fusion
    ---------
    Co-authored-by: Alon Agmon

commit 87483b40a284b2775d61b7712e4bee92d24a2a96
Author: Alon Agmon <54080741+a-agmon@users.noreply.github.com>
Date: Fri Sep 27 04:40:08 2024 +0300
    making table provider pub (#650)
    Co-authored-by: Alon Agmon

commit 984c91e2f82cdc2fc43434e400e3d4ae9c3bf000
Author: ZENOTME <43447882+ZENOTME@users.noreply.github.com>
Date: Thu Sep 26 17:56:02 2024 +0800
    avoid creating memory schema operator every time (#635)
    Co-authored-by: ZENOTME

commit 4171275a23aa3f3c9e8f8c24d433e4a0eeadc392
Author: Matheus Alcantara
Date: Wed Sep 25 08:28:42 2024 -0300
    scan: change ErrorKind when table doesn't have snapshots (#608)

commit ab51355c6b01b4f64b7c0439bb705d42e60d2338
Author: xxchan
Date: Tue Sep 24 21:25:45 2024 +0800
    fix: compile error due to merge stale PR (#646)
    Signed-off-by: xxchan

commit 420b4e2fb476b621862327f7eb02031fe292d755
Author: Scott Donnelly
Date: Tue Sep 24 08:20:23 2024 +0100
    Table Scan: Add Row Selection Filtering (#565)
    * feat(scan): add row selection capability via PageIndexEvaluator
    * test(row-selection): add first few row selection tests
    * feat(scan): add more tests, fix bug where min/max args swapped
    * fix: add test and fix for logic bug in PageIndexEvaluator in-clause handler
    * feat: changes suggested from PR review

commit b3709baaabf1260571f0b7d3e54ea258f4baa357
Author: Christian
Date: Tue Sep 24 04:47:04 2024 +0200
    feat: Add NamespaceIdent.parent() (#641)
    * Add NamespaceIdent.parent()
    * Use split_last

commit 1533c4376e751c82097134857c551bd78b8f7b64
Author: Alon Agmon <54080741+a-agmon@users.noreply.github.com>
Date: Mon Sep 23 13:39:46 2024 +0300
    feat (datafusion integration): convert datafusion expr filters to Iceberg Predicate (#588)
    * adding main function and tests
    * adding tests, removing integration test for now
    * fixing typos and lints
    * fixing typing issue
    * - added support in schema to convert Date32 to correct arrow type
      - refactored scan to use new predicate converter as visitor and separated it to a new mod
      - added support for simple predicates with column cast expressions
      - added testing, mostly around date functions
    * fixing format and lic
    * reducing number of tests (17 -> 7)
    * fix formats
    * fix naming
    * refactoring to use TreeNodeVisitor
    * fixing fmt
    * small refactor
    * adding swapped op and fixing CR comments
    ---------
    Co-authored-by: Alon Agmon

commit e967deb333a2c666218c48cd6f4870fbe7e535bd
Author: xxchan
Date: Mon Sep 23 18:34:59 2024 +0800
    feat: expose remove_all in FileIO (#643)
    Signed-off-by: xxchan

commit d03c4f865cc9775d1dc96fabf32c19a5b8c72d7b
Author: Scott Donnelly
Date: Mon Sep 23 08:28:52 2024 +0100
    Migrate to arrow-* v53 (#626)
    * chore: migrate to arrow-* v53
    * chore: update datafusion to 42
    * test: fix incorrect test assertion
    * chore: update python bindings to arrow 53

commit 88e5e4aa3bd88d516ecc63606a7e6d4823284e27
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon Sep 23 15:26:18 2024 +0800
    chore(deps): Bump crate-ci/typos from 1.24.5 to 1.24.6 (#640)
    Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.24.5 to 1.24.6.
    - [Release notes](https://github.com/crate-ci/typos/releases)
    - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md)
    - [Commits](https://github.com/crate-ci/typos/compare/v1.24.5...v1.24.6)
    ---
    updated-dependencies:
    - dependency-name: crate-ci/typos
      dependency-type: direct:production
      update-type: version-update:semver-patch
    ...
    Signed-off-by: dependabot[bot]
    Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

commit c3549836796f93aa3ad22276af788aa3d92533a1
Author: xxchan
Date: Mon Sep 23 14:50:18 2024 +0800
    doc: improve FileIO doc (#642)
    Signed-off-by: xxchan

commit 12e12e274571d7aa75c023f6bcd5164984041881
Author: xxchan
Date: Fri Sep 20 19:59:55 2024 +0800
    feat: expose arrow type <-> iceberg type (#637)
    * feat: expose arrow type <-> iceberg type
      Previously we only exposed the schema conversion.
      Signed-off-by: xxchan
    * add tests
      Signed-off-by: xxchan
    ---------
    Signed-off-by: xxchan

commit 3b27c9e4da74eda8a47bdbd65d55f1a034911ce2
Author: xxchan
Date: Fri Sep 20 18:32:31 2024 +0800
    feat: add Sync to TransformFunction (#638)
    Signed-off-by: xxchan

commit 34cb81c3f963e185d5af5f7f49e198237efc9ca5
Author: Xuanwo
Date: Wed Sep 18 20:18:40 2024 +0800
    chore: Bump opendal to 0.50 (#634)

commit cde35ab0eefffae88c521d4e897ba86ee754861c
Author: FANNG
Date: Fri Sep 13 10:01:16 2024 +0800
    feat: support projection pushdown for datafusion iceberg (#594)
    * support projection pushdown for datafusion iceberg
    * support projection pushdown for datafusion iceberg
    * fix ci
    * fix field id
    * remove dependencies
    * remove dependencies

commit eae946437e7fa34e95fd604ac3431c8f86b70597
Author: Xuanwo
Date: Thu Sep 12 02:06:31 2024 +0800
    refactor(python): Expose transform as a submodule for pyiceberg_core (#628)

commit 8a3de4e66fd9672c216985f11044168a325a9177
Author: Christian
Date: Mon Sep 9 14:45:16 2024 +0200
    Feat: Normalize TableMetadata (#611)
    * Normalize Table Metadata
    * Improve readability & comments

commit e08c0e51f80b7fc2103962da29e4ede2a4a5aad7
Author: Renjie Liu
Date: Mon Sep 9 11:57:22 2024 +0800
    fix: Correctly calculate highest_field_id in schema (#590)

commit f78c59b19bbcf4f8c2f2be500e0eb8d2883600ca
Author: Jack <56563911+jdockerty@users.noreply.github.com>
Date: Mon Sep 9 03:35:16 2024 +0100
    feat: add `client.region` (#623)

commit a5aba9a02c06494d9614a5a556bdcde5871829ef
Author: Christian
Date: Sun Sep 8 18:36:05 2024 +0200
    feat: SortOrder methods should take schema ref if possible (#613)
    * SortOrder methods should take schema ref if possible
    * Fix test type
    * with_order_id should not take reference

commit 58123990c304b72139651a16f45f6505bc868cc3
Author: Christian
Date: Sun Sep 8 18:18:41 2024 +0200
    feat: partition compatibility (#612)
    * Partition compatibility
    * Partition compatibility
    * Rename compatible_with -> is_compatible_with

commit ede472051aa67cd3f7d8712a5c8291dca1a91fae
Author: Christian
Date: Sun Sep 8 16:49:39 2024 +0200
    fix: Less Panics for Snapshot timestamps (#614)

commit ced661f72532a9fe9ce8284461aadef1128cfbb8
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun Sep 8 22:43:38 2024 +0800
    chore(deps): Bump crate-ci/typos from 1.24.3 to 1.24.5 (#616)
    Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.24.3 to 1.24.5.
    - [Release notes](https://github.com/crate-ci/typos/releases)
    - [Changelog](https://github.com/crate-ci/typos/blob/master/CHANGELOG.md)
    - [Commits](https://github.com/crate-ci/typos/compare/v1.24.3...v1.24.5)
    ---
    updated-dependencies:
    - dependency-name: crate-ci/typos
      dependency-type: direct:production
      update-type: version-update:semver-patch
    ...
    Signed-off-by: dependabot[bot]
    Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

commit cbbd086c92ef5a93d7dc98bbb1b26b9cf044910a
Author: Xuanwo
Date: Sun Sep 8 10:29:31 2024 +0800
    feat: Add more fields in FileScanTask (#609)
    Signed-off-by: Xuanwo

commit 620d58e81f85858e01b0a2f81656425269061dbe
Author: Callum Ryan <19956159+callum-ryan@users.noreply.github.com>
Date: Thu Sep 5 03:44:55 2024 +0100
    feat: SQL Catalog - namespaces (#534)
    * feat: SQL Catalog - namespaces Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * feat: use transaction for updates and creates Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: pull out query param builder to fn Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * feat: add drop and tests Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: String to str, remove pub and optimise query builder Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: nested match, remove ok() Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: remove pub, add set, add comments Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: refactor list_namespaces slightly Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: add default properties to all new namespaces Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: remove check for nested namespace Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * chore: add more comments to the CatalogConfig to explain bind styles Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    * fix: edit test for nested namespaces Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>
    ---------
    Signed-off-by: callum-ryan <19956159+callum-ryan@users.noreply.github.com>

commit ae75f964ec5aa7d85b0a35b530b676f7e4f588fc
Author: Søren Dalby Larsen
Date: Tue Sep 3 13:46:48 2024 +0200
    chore: bump crate-ci/typos to 1.24.3 (#598)

commit 7aa8bddeb09420b2f81a50112603de28aeaf3be7
Author: Scott Donnelly
Date: Thu Aug 29 04:37:48 2024 +0100
    Table Scan: Add Row Group Skipping (#558)
    * feat(scan): add row group and page index row selection filtering
    * fix(row selection): off-by-one error
    * feat: remove row selection to defer to a second PR
    * feat: better min/max val conversion in RowGroupMetricsEvaluator
    * test(row_group_filtering): first three tests
    * test(row_group_filtering): next few tests
    * test: add more tests for RowGroupMetricsEvaluator
    * chore: refactor test assertions to silence clippy lints
    * refactor: consolidate parquet stat min/max parsing in one place

commit da08e8d9793942c3864f9f4ca04df5ab8ff3efc5
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed Aug 28 14:35:55 2024 +0800
    chore(deps): Bump crate-ci/typos from 1.23.6 to 1.24.1 (#583)

commit ecbb4c310e6dc479bfee881d3cea95ad62e6b704
Author: Sung Yun <107272191+sungwy@users.noreply.github.com>
Date: Mon Aug 26 23:57:01 2024 -0400
    Expose Transforms to Python Binding (#556)
    * bucket transform rust binding
    * format
    * poetry x maturin
    * ignore poetry.lock in license check
    * update bindings_python_ci to use makefile
    * newline
    * https://github.com/python-poetry/poetry/pull/9135
    * use hatch instead of poetry
    * refactor
    * revert licenserc change
    * adopt review feedback
    * comments
    * unused dependency
    * adopt review comment
    * newline
    * I like this approach a lot better
    * more tests

commit 905ebd2f332cd37fc86a3ddf845c335229d0abcd
Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon Aug 26 20:49:07 2024 +0800
    chore(deps): Update typed-builder requirement from 0.19 to 0.20 (#582)
    ---
    updated-dependencies:
    - dependency-name: typed-builder
      dependency-type: direct:production
    ...
    Signed-off-by: dependabot[bot]
    Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>

commit f9c92b7a3cd7b1f682949a20bcfaaf3fdbda9feb
Author: FANNG
Date: Sun Aug 25 22:31:36 2024 +0800
    fix: Update sqlx from 0.8.0 to 0.8.1 (#584)

commit ba66665cbb325f82bbf82fa8640f39964dabcba6
Author: FANNG
Date: Sat Aug 24 12:35:36 2024 +0800
    fix: correct partition-id to field-id in UnboundPartitionField (#576)
    * correct partition-id to field id in PartitionSpec
    * correct partition-id to field id in PartitionSpec
    * correct partition-id to field id in PartitionSpec
    * xx
---
 crates/catalog/glue/src/catalog.rs                 |    4 +-
 crates/catalog/glue/src/schema.rs                  |    4 +-
 crates/catalog/glue/src/utils.rs                   |    4 +-
 crates/catalog/hms/src/catalog.rs                  |    4 +-
 crates/catalog/memory/src/catalog.rs               |    8 +-
 crates/iceberg/src/arrow/schema.rs                 |   20 +-
 crates/iceberg/src/catalog/mod.rs                  |   53 +-
 .../src/expr/visitors/expression_evaluator.rs      |  131 +-
 .../visitors/inclusive_metrics_evaluator.rs        |   11 +-
 .../src/expr/visitors/inclusive_projection.rs      |  155 +-
 crates/iceberg/src/io/object_cache.rs              |    2 +-
 crates/iceberg/src/scan.rs                         |    8 +-
 crates/iceberg/src/spec/datatypes.rs               |    6 +
 crates/iceberg/src/spec/manifest.rs                |  512 +++--
 crates/iceberg/src/spec/mod.rs                     |    1 +
 crates/iceberg/src/spec/partition.rs               |  555 ++++-
 crates/iceberg/src/spec/schema.rs                  |  413 +++-
 crates/iceberg/src/spec/table_metadata.rs          |  414 ++--
 .../src/spec/table_metadata_builder.rs             | 1909 +++++++++++++++++
 crates/iceberg/src/spec/values.rs                  |   22 +-
 .../writer/file_writer/location_generator.rs       |    5 +-
 21 files changed, 3496 insertions(+), 745 deletions(-)
 create mode 100644 crates/iceberg/src/spec/table_metadata_builder.rs

diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs
index 18e30f3d0..ad216a3a7 100644
--- a/crates/catalog/glue/src/catalog.rs
+++ b/crates/catalog/glue/src/catalog.rs
@@ -355,7 +355,9 @@ impl Catalog for GlueCatalog {
             }
         };

-        let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?;
+        let metadata = TableMetadataBuilder::from_table_creation(creation)?
+            .build()?
+            .metadata;
         let metadata_location = create_metadata_location(&location, 0)?;

         self.file_io
diff --git a/crates/catalog/glue/src/schema.rs b/crates/catalog/glue/src/schema.rs
index bb676e36e..1b490d13d 100644
--- a/crates/catalog/glue/src/schema.rs
+++ b/crates/catalog/glue/src/schema.rs
@@ -198,7 +198,9 @@ mod tests {
             .location("my_location".to_string())
             .schema(schema)
             .build();
-        let metadata = TableMetadataBuilder::from_table_creation(table_creation)?.build()?;
+        let metadata = TableMetadataBuilder::from_table_creation(table_creation)?
+            .build()?
+            .metadata;

         Ok(metadata)
     }
diff --git a/crates/catalog/glue/src/utils.rs b/crates/catalog/glue/src/utils.rs
index a99fb19c7..c43af500b 100644
--- a/crates/catalog/glue/src/utils.rs
+++ b/crates/catalog/glue/src/utils.rs
@@ -299,7 +299,9 @@ mod tests {
             .location("my_location".to_string())
             .schema(schema)
             .build();
-        let metadata = TableMetadataBuilder::from_table_creation(table_creation)?.build()?;
+        let metadata = TableMetadataBuilder::from_table_creation(table_creation)?
+            .build()?
+ .metadata; Ok(metadata) } diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index 6e5db1968..e19cf4ce5 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -346,7 +346,9 @@ impl Catalog for HmsCatalog { } }; - let metadata = TableMetadataBuilder::from_table_creation(creation)?.build()?; + let metadata = TableMetadataBuilder::from_table_creation(creation)? + .build()? + .metadata; let metadata_location = create_metadata_location(&location, 0)?; self.file_io diff --git a/crates/catalog/memory/src/catalog.rs b/crates/catalog/memory/src/catalog.rs index 1da044821..eaae0e615 100644 --- a/crates/catalog/memory/src/catalog.rs +++ b/crates/catalog/memory/src/catalog.rs @@ -194,7 +194,9 @@ impl Catalog for MemoryCatalog { } }; - let metadata = TableMetadataBuilder::from_table_creation(table_creation)?.build()?; + let metadata = TableMetadataBuilder::from_table_creation(table_creation)? + .build()? + .metadata; let metadata_location = format!( "{}/metadata/{}-{}.metadata.json", &location, @@ -355,7 +357,7 @@ mod tests { assert_eq!(metadata.current_schema().as_ref(), expected_schema); - let expected_partition_spec = PartitionSpec::builder(expected_schema) + let expected_partition_spec = PartitionSpec::builder((*expected_schema).clone()) .with_spec_id(0) .build() .unwrap(); @@ -365,7 +367,7 @@ mod tests { .partition_specs_iter() .map(|p| p.as_ref()) .collect_vec(), - vec![&expected_partition_spec] + vec![&expected_partition_spec.into_schemaless()] ); let expected_sorted_order = SortOrder::builder() diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index a32c10a22..e73b409c6 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -826,8 +826,8 @@ mod tests { fn arrow_schema_for_arrow_schema_to_schema_test() -> ArrowSchema { let fields = Fields::from(vec![ - simple_field("key", DataType::Int32, false, "17"), - simple_field("value", DataType::Utf8, true, "18"), + simple_field("key", DataType::Int32, false, "28"), + simple_field("value", DataType::Utf8, true, "29"), ]); let r#struct = DataType::Struct(fields); @@ -1057,9 +1057,9 @@ mod tests { "required": true, "type": { "type": "map", - "key-id": 17, + "key-id": 28, "key": "int", - "value-id": 18, + "value-id": 29, "value-required": false, "value": "string" } @@ -1110,8 +1110,8 @@ mod tests { fn arrow_schema_for_schema_to_arrow_schema_test() -> ArrowSchema { let fields = Fields::from(vec![ - simple_field("key", DataType::Int32, false, "17"), - simple_field("value", DataType::Utf8, true, "18"), + simple_field("key", DataType::Int32, false, "28"), + simple_field("value", DataType::Utf8, true, "29"), ]); let r#struct = DataType::Struct(fields); @@ -1200,7 +1200,7 @@ mod tests { ), simple_field("map", map, false, "16"), simple_field("struct", r#struct, false, "17"), - simple_field("uuid", DataType::FixedSizeBinary(16), false, "26"), + simple_field("uuid", DataType::FixedSizeBinary(16), false, "30"), ]) } @@ -1344,9 +1344,9 @@ mod tests { "required": true, "type": { "type": "map", - "key-id": 17, + "key-id": 28, "key": "int", - "value-id": 18, + "value-id": 29, "value-required": false, "value": "string" } @@ -1380,7 +1380,7 @@ mod tests { } }, { - "id":26, + "id":30, "name":"uuid", "required":true, "type":"uuid" diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index 212648878..9556da435 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -445,8 
+445,46 @@ impl TableUpdate { /// Applies the update to the table metadata builder. pub fn apply(self, builder: TableMetadataBuilder) -> Result { match self { - TableUpdate::AssignUuid { uuid } => builder.assign_uuid(uuid), - _ => unimplemented!(), + TableUpdate::AssignUuid { uuid } => Ok(builder.assign_uuid(uuid)), + TableUpdate::AddSchema { + schema, + last_column_id, + } => { + if let Some(last_column_id) = last_column_id { + if builder.last_column_id() < last_column_id { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid last column ID: {last_column_id} < {} (previous last column ID)", + builder.last_column_id() + ), + )); + } + }; + Ok(builder.add_schema(schema)) + } + TableUpdate::SetCurrentSchema { schema_id } => builder.set_current_schema(schema_id), + TableUpdate::AddSpec { spec } => builder.add_partition_spec(spec), + TableUpdate::SetDefaultSpec { spec_id } => builder.set_default_partition_spec(spec_id), + TableUpdate::AddSortOrder { sort_order } => builder.add_sort_order(sort_order), + TableUpdate::SetDefaultSortOrder { sort_order_id } => { + builder.set_default_sort_order(sort_order_id) + } + TableUpdate::AddSnapshot { snapshot } => builder.add_snapshot(snapshot), + TableUpdate::SetSnapshotRef { + ref_name, + reference, + } => builder.set_ref(&ref_name, reference), + TableUpdate::RemoveSnapshots { snapshot_ids } => { + Ok(builder.remove_snapshots(&snapshot_ids)) + } + TableUpdate::RemoveSnapshotRef { ref_name } => Ok(builder.remove_ref(&ref_name)), + TableUpdate::SetLocation { location } => Ok(builder.set_location(location)), + TableUpdate::SetProperties { updates } => builder.set_properties(updates), + TableUpdate::RemoveProperties { removals } => Ok(builder.remove_properties(&removals)), + TableUpdate::UpgradeFormatVersion { format_version } => { + builder.upgrade_format_version(format_version) + } } } } @@ -1183,8 +1221,12 @@ mod tests { let table_metadata = TableMetadataBuilder::from_table_creation(table_creation) .unwrap() .build() - .unwrap(); - let table_metadata_builder = TableMetadataBuilder::new(table_metadata); + .unwrap() + .metadata; + let table_metadata_builder = TableMetadataBuilder::new_from_metadata( + table_metadata, + "s3://db/table/metadata/metadata1.gz.json", + ); let uuid = uuid::Uuid::new_v4(); let update = TableUpdate::AssignUuid { uuid }; @@ -1192,7 +1234,8 @@ mod tests { .apply(table_metadata_builder) .unwrap() .build() - .unwrap(); + .unwrap() + .metadata; assert_eq!(updated_metadata.uuid(), uuid); } } diff --git a/crates/iceberg/src/expr/visitors/expression_evaluator.rs b/crates/iceberg/src/expr/visitors/expression_evaluator.rs index 8f3c2971c..94a003ef6 100644 --- a/crates/iceberg/src/expr/visitors/expression_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/expression_evaluator.rs @@ -259,14 +259,11 @@ mod tests { }; use crate::spec::{ DataContentType, DataFile, DataFileFormat, Datum, Literal, NestedField, PartitionSpec, - PartitionSpecRef, PrimitiveType, Schema, SchemaRef, Struct, Transform, Type, - UnboundPartitionField, + PartitionSpecRef, PrimitiveType, Schema, Struct, Transform, Type, UnboundPartitionField, }; use crate::Result; - fn create_schema_and_partition_spec( - r#type: PrimitiveType, - ) -> Result<(SchemaRef, PartitionSpecRef)> { + fn create_partition_spec(r#type: PrimitiveType) -> Result { let schema = Schema::builder() .with_fields(vec![Arc::new(NestedField::optional( 1, @@ -275,7 +272,7 @@ mod tests { ))]) .build()?; - let spec = PartitionSpec::builder(&schema) + let spec = 
PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_fields(vec![UnboundPartitionField::builder() .source_id(1) @@ -287,16 +284,15 @@ mod tests { .build() .unwrap(); - Ok((Arc::new(schema), Arc::new(spec))) + Ok(Arc::new(spec)) } fn create_partition_filter( - schema: &Schema, partition_spec: PartitionSpecRef, predicate: &BoundPredicate, case_sensitive: bool, ) -> Result { - let partition_type = partition_spec.partition_type(schema)?; + let partition_type = partition_spec.partition_type(); let partition_fields = partition_type.fields().to_owned(); let partition_schema = Schema::builder() @@ -304,7 +300,8 @@ mod tests { .with_fields(partition_fields) .build()?; - let mut inclusive_projection = InclusiveProjection::new(partition_spec); + let mut inclusive_projection = + InclusiveProjection::new((*partition_spec).clone().into_schemaless().into()); let partition_filter = inclusive_projection .project(predicate)? @@ -315,13 +312,11 @@ mod tests { } fn create_expression_evaluator( - schema: &Schema, partition_spec: PartitionSpecRef, predicate: &BoundPredicate, case_sensitive: bool, ) -> Result { - let partition_filter = - create_partition_filter(schema, partition_spec, predicate, case_sensitive)?; + let partition_filter = create_partition_filter(partition_spec, predicate, case_sensitive)?; Ok(ExpressionEvaluator::new(partition_filter)) } @@ -375,7 +370,7 @@ mod tests { #[test] fn test_expr_or() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::LessThan, @@ -387,10 +382,10 @@ mod tests { Reference::new("a"), Datum::float(0.4), ))) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -404,7 +399,7 @@ mod tests { #[test] fn test_expr_and() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::LessThan, @@ -416,10 +411,10 @@ mod tests { Reference::new("a"), Datum::float(0.4), ))) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -433,17 +428,17 @@ mod tests { #[test] fn test_expr_not_in() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Set(SetExpression::new( PredicateOperator::NotIn, Reference::new("a"), FnvHashSet::from_iter([Datum::float(0.9), Datum::float(1.2), Datum::float(2.4)]), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; 
+ create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -457,17 +452,17 @@ mod tests { #[test] fn test_expr_in() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Set(SetExpression::new( PredicateOperator::In, Reference::new("a"), FnvHashSet::from_iter([Datum::float(1.0), Datum::float(1.2), Datum::float(2.4)]), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -481,17 +476,17 @@ mod tests { #[test] fn test_expr_not_starts_with() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::String)?; + let partition_spec = create_partition_spec(PrimitiveType::String)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::NotStartsWith, Reference::new("a"), Datum::string("not"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_string(); @@ -505,17 +500,17 @@ mod tests { #[test] fn test_expr_starts_with() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::String)?; + let partition_spec = create_partition_spec(PrimitiveType::String)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::StartsWith, Reference::new("a"), Datum::string("test"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_string(); @@ -529,17 +524,17 @@ mod tests { #[test] fn test_expr_not_eq() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::NotEq, Reference::new("a"), Datum::float(0.9), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -553,17 +548,17 @@ mod tests { #[test] fn test_expr_eq() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::Eq, Reference::new("a"), Datum::float(1.0), )) - .bind(schema.clone(), case_sensitive)?; + 
.bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -577,17 +572,17 @@ mod tests { #[test] fn test_expr_greater_than_or_eq() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::GreaterThanOrEq, Reference::new("a"), Datum::float(1.0), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -601,17 +596,17 @@ mod tests { #[test] fn test_expr_greater_than() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::GreaterThan, Reference::new("a"), Datum::float(0.9), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -625,17 +620,17 @@ mod tests { #[test] fn test_expr_less_than_or_eq() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::LessThanOrEq, Reference::new("a"), Datum::float(1.0), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -649,17 +644,17 @@ mod tests { #[test] fn test_expr_less_than() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Binary(BinaryExpression::new( PredicateOperator::LessThan, Reference::new("a"), Datum::float(1.1), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -673,15 +668,15 @@ mod tests { #[test] fn test_expr_is_not_nan() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = 
Predicate::Unary(UnaryExpression::new( PredicateOperator::NotNan, Reference::new("a"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -695,15 +690,15 @@ mod tests { #[test] fn test_expr_is_nan() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Unary(UnaryExpression::new( PredicateOperator::IsNan, Reference::new("a"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -717,15 +712,15 @@ mod tests { #[test] fn test_expr_is_not_null() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Unary(UnaryExpression::new( PredicateOperator::NotNull, Reference::new("a"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -739,15 +734,15 @@ mod tests { #[test] fn test_expr_is_null() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; let predicate = Predicate::Unary(UnaryExpression::new( PredicateOperator::IsNull, Reference::new("a"), )) - .bind(schema.clone(), case_sensitive)?; + .bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -761,11 +756,12 @@ mod tests { #[test] fn test_expr_always_false() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; - let predicate = Predicate::AlwaysFalse.bind(schema.clone(), case_sensitive)?; + let partition_spec = create_partition_spec(PrimitiveType::Float)?; + let predicate = + Predicate::AlwaysFalse.bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); @@ -779,11 +775,12 @@ mod tests { #[test] fn test_expr_always_true() -> Result<()> { let case_sensitive = true; - let (schema, partition_spec) = create_schema_and_partition_spec(PrimitiveType::Float)?; - let predicate = Predicate::AlwaysTrue.bind(schema.clone(), case_sensitive)?; + let partition_spec = 
create_partition_spec(PrimitiveType::Float)?; + let predicate = + Predicate::AlwaysTrue.bind(partition_spec.schema_ref().clone(), case_sensitive)?; let expression_evaluator = - create_expression_evaluator(&schema, partition_spec, &predicate, case_sensitive)?; + create_expression_evaluator(partition_spec, &predicate, case_sensitive)?; let data_file = create_data_file_float(); diff --git a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs index a2ee4722f..89bbd6f56 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_metrics_evaluator.rs @@ -504,10 +504,10 @@ mod test { #[test] fn test_data_file_no_partitions() { - let (table_schema_ref, _partition_spec_ref) = create_test_schema_and_partition_spec(); + let partition_spec_ref = create_test_partition_spec(); let partition_filter = Predicate::AlwaysTrue - .bind(table_schema_ref.clone(), false) + .bind(partition_spec_ref.schema_ref().clone(), false) .unwrap(); let case_sensitive = false; @@ -1645,7 +1645,7 @@ mod test { assert!(result, "Should read: NotIn on no nulls column"); } - fn create_test_schema_and_partition_spec() -> (Arc, Arc) { + fn create_test_partition_spec() -> Arc { let table_schema = Schema::builder() .with_fields(vec![Arc::new(NestedField::optional( 1, @@ -1656,7 +1656,7 @@ mod test { .unwrap(); let table_schema_ref = Arc::new(table_schema); - let partition_spec = PartitionSpec::builder(&table_schema_ref) + let partition_spec = PartitionSpec::builder(table_schema_ref.clone()) .with_spec_id(1) .add_unbound_fields(vec![UnboundPartitionField::builder() .source_id(1) @@ -1667,8 +1667,7 @@ mod test { .unwrap() .build() .unwrap(); - let partition_spec_ref = Arc::new(partition_spec); - (table_schema_ref, partition_spec_ref) + Arc::new(partition_spec) } fn not_null(reference: &str) -> BoundPredicate { diff --git a/crates/iceberg/src/expr/visitors/inclusive_projection.rs b/crates/iceberg/src/expr/visitors/inclusive_projection.rs index 2087207ea..7594e6458 100644 --- a/crates/iceberg/src/expr/visitors/inclusive_projection.rs +++ b/crates/iceberg/src/expr/visitors/inclusive_projection.rs @@ -21,16 +21,16 @@ use fnv::FnvHashSet; use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor}; use crate::expr::{BoundPredicate, BoundReference, Predicate}; -use crate::spec::{Datum, PartitionField, PartitionSpecRef}; +use crate::spec::{Datum, PartitionField, SchemalessPartitionSpecRef}; use crate::Error; pub(crate) struct InclusiveProjection { - partition_spec: PartitionSpecRef, + partition_spec: SchemalessPartitionSpecRef, cached_parts: HashMap>, } impl InclusiveProjection { - pub(crate) fn new(partition_spec: PartitionSpecRef) -> Self { + pub(crate) fn new(partition_spec: SchemalessPartitionSpecRef) -> Self { Self { partition_spec, cached_parts: HashMap::new(), @@ -235,7 +235,7 @@ mod tests { use crate::expr::visitors::inclusive_projection::InclusiveProjection; use crate::expr::{Bind, Predicate, Reference}; use crate::spec::{ - Datum, NestedField, PartitionField, PartitionSpec, PrimitiveType, Schema, Transform, Type, + Datum, NestedField, PartitionSpec, PrimitiveType, Schema, Transform, Type, UnboundPartitionField, }; @@ -265,13 +265,14 @@ mod tests { #[test] fn test_inclusive_projection_logic_ops() { let schema = build_test_schema(); + let arc_schema = Arc::new(schema); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = 
PartitionSpec::builder(arc_schema.clone()) .with_spec_id(1) .build() - .unwrap(); + .unwrap() + .into_schemaless(); - let arc_schema = Arc::new(schema); let arc_partition_spec = Arc::new(partition_spec); // this predicate contains only logic operators, @@ -295,8 +296,9 @@ mod tests { #[test] fn test_inclusive_projection_identity_transform() { let schema = build_test_schema(); + let arc_schema = Arc::new(schema); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(arc_schema.clone()) .with_spec_id(1) .add_unbound_field( UnboundPartitionField::builder() @@ -308,9 +310,9 @@ mod tests { ) .unwrap() .build() - .unwrap(); + .unwrap() + .into_schemaless(); - let arc_schema = Arc::new(schema); let arc_partition_spec = Arc::new(partition_spec); let unbound_predicate = Reference::new("a").less_than(Datum::int(10)); @@ -321,7 +323,7 @@ mod tests { // should result in the same Predicate as the original // `unbound_predicate`, since we have just a single partition field, // and it has an Identity transform - let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec.clone()); + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); let result = inclusive_projection.project(&bound_predicate).unwrap(); let expected = "a < 10".to_string(); @@ -330,34 +332,95 @@ mod tests { } #[test] - fn test_inclusive_projection_date_transforms() { + fn test_inclusive_projection_date_year_transform() { let schema = build_test_schema(); + let arc_schema = Arc::new(schema); + + let partition_spec = PartitionSpec::builder(arc_schema.clone()) + .with_spec_id(1) + .add_unbound_fields(vec![UnboundPartitionField { + source_id: 2, + name: "year".to_string(), + field_id: Some(1000), + transform: Transform::Year, + }]) + .unwrap() + .build() + .unwrap() + .into_schemaless(); + + let arc_partition_spec = Arc::new(partition_spec); + + let unbound_predicate = + Reference::new("date").less_than(Datum::date_from_str("2024-01-01").unwrap()); + + let bound_predicate = unbound_predicate.bind(arc_schema.clone(), false).unwrap(); + + // applying InclusiveProjection to bound_predicate + // should result in a predicate that correctly handles + // year, month and date + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); + let result = inclusive_projection.project(&bound_predicate).unwrap(); + + let expected = "year <= 53".to_string(); + + assert_eq!(result.to_string(), expected); + } + + #[test] + fn test_inclusive_projection_date_month_transform() { + let schema = build_test_schema(); + let arc_schema = Arc::new(schema); + + let partition_spec = PartitionSpec::builder(arc_schema.clone()) + .with_spec_id(1) + .add_unbound_fields(vec![UnboundPartitionField { + source_id: 2, + name: "month".to_string(), + field_id: Some(1000), + transform: Transform::Month, + }]) + .unwrap() + .build() + .unwrap() + .into_schemaless(); + + let arc_partition_spec = Arc::new(partition_spec); + + let unbound_predicate = + Reference::new("date").less_than(Datum::date_from_str("2024-01-01").unwrap()); + + let bound_predicate = unbound_predicate.bind(arc_schema.clone(), false).unwrap(); - let partition_spec = PartitionSpec { - spec_id: 1, - fields: vec![ - PartitionField { - source_id: 2, - name: "year".to_string(), - field_id: 1000, - transform: Transform::Year, - }, - PartitionField { - source_id: 2, - name: "month".to_string(), - field_id: 1001, - transform: Transform::Month, - }, - PartitionField { - source_id: 2, - name: "day".to_string(), - field_id: 
1002, - transform: Transform::Day, - }, - ], - }; + // applying InclusiveProjection to bound_predicate + // should result in a predicate that correctly handles + // year, month and date + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); + let result = inclusive_projection.project(&bound_predicate).unwrap(); + + let expected = "month <= 647".to_string(); + + assert_eq!(result.to_string(), expected); + } + #[test] + fn test_inclusive_projection_date_day_transform() { + let schema = build_test_schema(); let arc_schema = Arc::new(schema); + + let partition_spec = PartitionSpec::builder(arc_schema.clone()) + .with_spec_id(1) + .add_unbound_fields(vec![UnboundPartitionField { + source_id: 2, + name: "day".to_string(), + field_id: Some(1000), + transform: Transform::Day, + }]) + .unwrap() + .build() + .unwrap() + .into_schemaless(); + let arc_partition_spec = Arc::new(partition_spec); let unbound_predicate = @@ -368,10 +431,10 @@ mod tests { // applying InclusiveProjection to bound_predicate // should result in a predicate that correctly handles // year, month and date - let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec.clone()); + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); let result = inclusive_projection.project(&bound_predicate).unwrap(); - let expected = "((year <= 53) AND (month <= 647)) AND (day <= 19722)".to_string(); + let expected = "day <= 19722".to_string(); assert_eq!(result.to_string(), expected); } @@ -379,8 +442,9 @@ mod tests { #[test] fn test_inclusive_projection_truncate_transform() { let schema = build_test_schema(); + let arc_schema = Arc::new(schema); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(arc_schema.clone()) .with_spec_id(1) .add_unbound_field( UnboundPartitionField::builder() @@ -392,9 +456,9 @@ mod tests { ) .unwrap() .build() - .unwrap(); + .unwrap() + .into_schemaless(); - let arc_schema = Arc::new(schema); let arc_partition_spec = Arc::new(partition_spec); let unbound_predicate = Reference::new("name").starts_with(Datum::string("Testy McTest")); @@ -408,7 +472,7 @@ mod tests { // name that start with "Testy McTest" into a partition // for values of name that start with the first four letters // of that, ie "Test". 
- let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec.clone()); + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); let result = inclusive_projection.project(&bound_predicate).unwrap(); let expected = "name_truncate STARTS WITH \"Test\"".to_string(); @@ -419,8 +483,9 @@ mod tests { #[test] fn test_inclusive_projection_bucket_transform() { let schema = build_test_schema(); + let arc_schema = Arc::new(schema); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(arc_schema.clone()) .with_spec_id(1) .add_unbound_field( UnboundPartitionField::builder() @@ -432,9 +497,9 @@ mod tests { ) .unwrap() .build() - .unwrap(); + .unwrap() + .into_schemaless(); - let arc_schema = Arc::new(schema); let arc_partition_spec = Arc::new(partition_spec); let unbound_predicate = Reference::new("a").equal_to(Datum::int(10)); @@ -445,7 +510,7 @@ mod tests { // should result in the "a = 10" predicate being // transformed into "a = 2", since 10 gets bucketed // to 2 with a Bucket(7) partition - let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec.clone()); + let mut inclusive_projection = InclusiveProjection::new(arc_partition_spec); let result = inclusive_projection.project(&bound_predicate).unwrap(); let expected = "a_bucket[7] = 2".to_string(); diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index 731072a5a..35b6a2c94 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -262,7 +262,7 @@ mod tests { ) .write(Manifest::new( ManifestMetadata::builder() - .schema((*current_schema).clone()) + .schema(current_schema.clone()) .content(ManifestContentType::Data) .format_version(FormatVersion::V2) .partition_spec((**current_partition_spec).clone()) diff --git a/crates/iceberg/src/scan.rs b/crates/iceberg/src/scan.rs index f5cbbcf06..ab00b1eec 100644 --- a/crates/iceberg/src/scan.rs +++ b/crates/iceberg/src/scan.rs @@ -688,7 +688,7 @@ impl PartitionFilterCache { &self, spec_id: i32, table_metadata: &TableMetadataRef, - schema: &SchemaRef, + schema: &Schema, case_sensitive: bool, filter: BoundPredicate, ) -> Result> { @@ -714,11 +714,11 @@ impl PartitionFilterCache { format!("Could not find partition spec for id {}", spec_id), ))?; - let partition_type = partition_spec.partition_type(schema.as_ref())?; + let partition_type = partition_spec.partition_type(schema)?; let partition_fields = partition_type.fields().to_owned(); let partition_schema = Arc::new( Schema::builder() - .with_schema_id(partition_spec.spec_id) + .with_schema_id(partition_spec.spec_id()) .with_fields(partition_fields) .build()?, ); @@ -1012,7 +1012,7 @@ mod tests { ) .write(Manifest::new( ManifestMetadata::builder() - .schema((*current_schema).clone()) + .schema(current_schema.clone()) .content(ManifestContentType::Data) .format_version(FormatVersion::V2) .partition_spec((**current_partition_spec).clone()) diff --git a/crates/iceberg/src/spec/datatypes.rs b/crates/iceberg/src/spec/datatypes.rs index d38245960..bce10ad5f 100644 --- a/crates/iceberg/src/spec/datatypes.rs +++ b/crates/iceberg/src/spec/datatypes.rs @@ -668,6 +668,12 @@ impl NestedField { self.write_default = Some(value); self } + + /// Set the id of the field. 
+ pub(crate) fn with_id(mut self, id: i32) -> Self { + self.id = id; + self + } } impl fmt::Display for NestedField { diff --git a/crates/iceberg/src/spec/manifest.rs b/crates/iceberg/src/spec/manifest.rs index f0dfdf47c..12d52e494 100644 --- a/crates/iceberg/src/spec/manifest.rs +++ b/crates/iceberg/src/spec/manifest.rs @@ -31,7 +31,7 @@ use typed_builder::TypedBuilder; use self::_const_schema::{manifest_schema_v1, manifest_schema_v2}; use super::{ Datum, FieldSummary, FormatVersion, ManifestContentType, ManifestFile, PartitionSpec, Schema, - SchemaId, Struct, INITIAL_SEQUENCE_NUMBER, UNASSIGNED_SEQUENCE_NUMBER, + SchemaId, SchemaRef, Struct, INITIAL_SEQUENCE_NUMBER, UNASSIGNED_SEQUENCE_NUMBER, }; use crate::error::Result; use crate::io::OutputFile; @@ -55,7 +55,7 @@ impl Manifest { let metadata = ManifestMetadata::parse(meta)?; // Parse manifest entries - let partition_type = metadata.partition_spec.partition_type(&metadata.schema)?; + let partition_type = metadata.partition_spec.partition_type(); let entries = match metadata.format_version { FormatVersion::V1 => { @@ -65,7 +65,7 @@ impl Manifest { .into_iter() .map(|value| { from_value::<_serde::ManifestEntryV1>(&value?)? - .try_into(&partition_type, &metadata.schema) + .try_into(partition_type, &metadata.schema) }) .collect::>>()? } @@ -76,7 +76,7 @@ impl Manifest { .into_iter() .map(|value| { from_value::<_serde::ManifestEntryV2>(&value?)? - .try_into(&partition_type, &metadata.schema) + .try_into(partition_type, &metadata.schema) }) .collect::>>()? } @@ -206,10 +206,7 @@ impl ManifestWriter { /// Write a manifest. pub async fn write(mut self, manifest: Manifest) -> Result { // Create the avro writer - let partition_type = manifest - .metadata - .partition_spec - .partition_type(&manifest.metadata.schema)?; + let partition_type = manifest.metadata.partition_spec.partition_type(); let table_schema = &manifest.metadata.schema; let avro_schema = match manifest.metadata.format_version { FormatVersion::V1 => manifest_schema_v1(partition_type.clone())?, @@ -284,12 +281,12 @@ impl ManifestWriter { let value = match manifest.metadata.format_version { FormatVersion::V1 => to_value(_serde::ManifestEntryV1::try_from( (*entry).clone(), - &partition_type, + partition_type, )?)? .resolve(&avro_schema)?, FormatVersion::V2 => to_value(_serde::ManifestEntryV2::try_from( (*entry).clone(), - &partition_type, + partition_type, )?)? .resolve(&avro_schema)?, }; @@ -705,7 +702,7 @@ mod _const_schema { pub struct ManifestMetadata { /// The table schema at the time the manifest /// was written - schema: Schema, + schema: SchemaRef, /// ID of the schema used to write the manifest as a string schema_id: SchemaId, /// The partition spec used to write the manifest @@ -719,7 +716,7 @@ pub struct ManifestMetadata { impl ManifestMetadata { /// Parse from metadata in avro file. pub fn parse(meta: &HashMap>) -> Result { - let schema = { + let schema = Arc::new({ let bs = meta.get("schema").ok_or_else(|| { Error::new( ErrorKind::DataInvalid, @@ -733,7 +730,7 @@ impl ManifestMetadata { ) .with_source(err) })? - }; + }); let schema_id: i32 = meta .get("schema-id") .map(|bs| { @@ -776,7 +773,10 @@ impl ManifestMetadata { }) .transpose()? .unwrap_or(0); - PartitionSpec { spec_id, fields } + PartitionSpec::builder(schema.clone()) + .with_spec_id(spec_id) + .add_unbound_fields(fields.into_iter().map(|f| f.into_unbound()))? + .build()? 
}; let format_version = if let Some(bs) = meta.get("format-version") { serde_json::from_slice::(bs).map_err(|err| { @@ -1514,82 +1514,82 @@ mod tests { #[tokio::test] async fn test_parse_manifest_v2_unpartition() { + let schema = Arc::new( + Schema::builder() + .with_fields(vec![ + // id v_int v_long v_float v_double v_varchar v_bool v_date v_timestamp v_decimal v_ts_ntz + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "v_int", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 3, + "v_long", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "v_float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 5, + "v_double", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 6, + "v_varchar", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 7, + "v_bool", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 8, + "v_date", + Type::Primitive(PrimitiveType::Date), + )), + Arc::new(NestedField::optional( + 9, + "v_timestamp", + Type::Primitive(PrimitiveType::Timestamptz), + )), + Arc::new(NestedField::optional( + 10, + "v_decimal", + Type::Primitive(PrimitiveType::Decimal { + precision: 36, + scale: 10, + }), + )), + Arc::new(NestedField::optional( + 11, + "v_ts_ntz", + Type::Primitive(PrimitiveType::Timestamp), + )), + Arc::new(NestedField::optional( + 12, + "v_ts_ns_ntz", + Type::Primitive(PrimitiveType::TimestampNs), + )), + ]) + .build() + .unwrap(), + ); let manifest = Manifest { metadata: ManifestMetadata { schema_id: 0, - schema: Schema::builder() - .with_fields(vec![ - // id v_int v_long v_float v_double v_varchar v_bool v_date v_timestamp v_decimal v_ts_ntz - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 2, - "v_int", - Type::Primitive(PrimitiveType::Int), - )), - Arc::new(NestedField::optional( - 3, - "v_long", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 4, - "v_float", - Type::Primitive(PrimitiveType::Float), - )), - Arc::new(NestedField::optional( - 5, - "v_double", - Type::Primitive(PrimitiveType::Double), - )), - Arc::new(NestedField::optional( - 6, - "v_varchar", - Type::Primitive(PrimitiveType::String), - )), - Arc::new(NestedField::optional( - 7, - "v_bool", - Type::Primitive(PrimitiveType::Boolean), - )), - Arc::new(NestedField::optional( - 8, - "v_date", - Type::Primitive(PrimitiveType::Date), - )), - Arc::new(NestedField::optional( - 9, - "v_timestamp", - Type::Primitive(PrimitiveType::Timestamptz), - )), - Arc::new(NestedField::optional( - 10, - "v_decimal", - Type::Primitive(PrimitiveType::Decimal { - precision: 36, - scale: 10, - }), - )), - Arc::new(NestedField::optional( - 11, - "v_ts_ntz", - Type::Primitive(PrimitiveType::Timestamp), - )), - Arc::new(NestedField::optional( - 12, - "v_ts_ns_ntz", - Type::Primitive(PrimitiveType::TimestampNs - ))), - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V2, }, @@ -1628,94 +1628,83 @@ mod tests { #[tokio::test] async fn test_parse_manifest_v2_partition() { + let schema = Arc::new( + Schema::builder() + 
.with_fields(vec![ + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "v_int", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 3, + "v_long", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 4, + "v_float", + Type::Primitive(PrimitiveType::Float), + )), + Arc::new(NestedField::optional( + 5, + "v_double", + Type::Primitive(PrimitiveType::Double), + )), + Arc::new(NestedField::optional( + 6, + "v_varchar", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 7, + "v_bool", + Type::Primitive(PrimitiveType::Boolean), + )), + Arc::new(NestedField::optional( + 8, + "v_date", + Type::Primitive(PrimitiveType::Date), + )), + Arc::new(NestedField::optional( + 9, + "v_timestamp", + Type::Primitive(PrimitiveType::Timestamptz), + )), + Arc::new(NestedField::optional( + 10, + "v_decimal", + Type::Primitive(PrimitiveType::Decimal { + precision: 36, + scale: 10, + }), + )), + Arc::new(NestedField::optional( + 11, + "v_ts_ntz", + Type::Primitive(PrimitiveType::Timestamp), + )), + Arc::new(NestedField::optional( + 12, + "v_ts_ns_ntz", + Type::Primitive(PrimitiveType::TimestampNs), + )), + ]) + .build() + .unwrap(), + ); let manifest = Manifest { metadata: ManifestMetadata { schema_id: 0, - schema: Schema::builder() - .with_fields(vec![ - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 2, - "v_int", - Type::Primitive(PrimitiveType::Int), - )), - Arc::new(NestedField::optional( - 3, - "v_long", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 4, - "v_float", - Type::Primitive(PrimitiveType::Float), - )), - Arc::new(NestedField::optional( - 5, - "v_double", - Type::Primitive(PrimitiveType::Double), - )), - Arc::new(NestedField::optional( - 6, - "v_varchar", - Type::Primitive(PrimitiveType::String), - )), - Arc::new(NestedField::optional( - 7, - "v_bool", - Type::Primitive(PrimitiveType::Boolean), - )), - Arc::new(NestedField::optional( - 8, - "v_date", - Type::Primitive(PrimitiveType::Date), - )), - Arc::new(NestedField::optional( - 9, - "v_timestamp", - Type::Primitive(PrimitiveType::Timestamptz), - )), - Arc::new(NestedField::optional( - 10, - "v_decimal", - Type::Primitive(PrimitiveType::Decimal { - precision: 36, - scale: 10, - }), - )), - Arc::new(NestedField::optional( - 11, - "v_ts_ntz", - Type::Primitive(PrimitiveType::Timestamp), - )), - Arc::new(NestedField::optional( - 12, - "v_ts_ns_ntz", - Type::Primitive(PrimitiveType::TimestampNs - ))) - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![ - PartitionField { - name: "v_int".to_string(), - transform: Transform::Identity, - source_id: 2, - field_id: 1000, - }, - PartitionField { - name: "v_long".to_string(), - transform: Transform::Identity, - source_id: 3, - field_id: 1001, - }, - ], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema) + .with_spec_id(0).add_partition_field("v_int", "v_int", Transform::Identity).unwrap() + .add_partition_field("v_long", "v_long", Transform::Identity).unwrap().build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V2, }, @@ -1797,34 +1786,34 @@ mod tests { #[tokio::test] async fn test_parse_manifest_v1_unpartition() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + 
Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Int), + )), + Arc::new(NestedField::optional( + 2, + "data", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 3, + "comment", + Type::Primitive(PrimitiveType::String), + )), + ]) + .build() + .unwrap(), + ); let manifest = Manifest { metadata: ManifestMetadata { schema_id: 1, - schema: Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Int), - )), - Arc::new(NestedField::optional( - 2, - "data", - Type::Primitive(PrimitiveType::String), - )), - Arc::new(NestedField::optional( - 3, - "comment", - Type::Primitive(PrimitiveType::String), - )), - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V1, }, @@ -1862,38 +1851,33 @@ mod tests { #[tokio::test] async fn test_parse_manifest_v1_partition() { + let schema = Arc::new( + Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "data", + Type::Primitive(PrimitiveType::String), + )), + Arc::new(NestedField::optional( + 3, + "category", + Type::Primitive(PrimitiveType::String), + )), + ]) + .build() + .unwrap(), + ); let manifest = Manifest { metadata: ManifestMetadata { schema_id: 0, - schema: Schema::builder() - .with_fields(vec![ - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 2, - "data", - Type::Primitive(PrimitiveType::String), - )), - Arc::new(NestedField::optional( - 3, - "category", - Type::Primitive(PrimitiveType::String), - )), - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![PartitionField { - name: "category".to_string(), - transform: Transform::Identity, - source_id: 3, - field_id: 1000, - }], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema).add_partition_field("category", "category", Transform::Identity).unwrap().build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V1, }, @@ -1951,28 +1935,28 @@ mod tests { #[tokio::test] async fn test_parse_manifest_with_schema_evolution() { + let schema = Arc::new( + Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "v_int", + Type::Primitive(PrimitiveType::Int), + )), + ]) + .build() + .unwrap(), + ); let manifest = Manifest { metadata: ManifestMetadata { schema_id: 0, - schema: Schema::builder() - .with_fields(vec![ - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 2, - "v_int", - Type::Primitive(PrimitiveType::Int), - )), - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V2, }, @@ -2023,28 +2007,28 @@ mod tests { // Compared with original manifest, the lower_bounds and upper_bounds no longer has data for field 3, and // other parts should be same. 
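// Editor's note: the test refactors in this hunk all follow one pattern; here is a
// minimal sketch, assuming the builder API introduced by this patch (with
// `PartitionSpec::builder` accepting anything convertible into a `SchemaRef`). The
// schema is built once, wrapped in an `Arc`, and shared between the manifest metadata
// and the partition spec, so the spec is always bound to the schema it was derived from:
let example_schema = Arc::new(
    Schema::builder()
        .with_fields(vec![Arc::new(NestedField::optional(
            1,
            "id",
            Type::Primitive(PrimitiveType::Long),
        ))])
        .build()
        .unwrap(),
);
let example_spec = PartitionSpec::builder(example_schema.clone())
    .with_spec_id(0)
    .build()
    .unwrap();
// An empty field list yields an unpartitioned spec with an empty partition type.
assert!(example_spec.is_unpartitioned());
assert_eq!(0, example_spec.partition_type().fields().len());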
+ let schema = Arc::new( + Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::optional( + 1, + "id", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new(NestedField::optional( + 2, + "v_int", + Type::Primitive(PrimitiveType::Int), + )), + ]) + .build() + .unwrap(), + ); let expected_manifest = Manifest { metadata: ManifestMetadata { schema_id: 0, - schema: Schema::builder() - .with_fields(vec![ - Arc::new(NestedField::optional( - 1, - "id", - Type::Primitive(PrimitiveType::Long), - )), - Arc::new(NestedField::optional( - 2, - "v_int", - Type::Primitive(PrimitiveType::Int), - )), - ]) - .build() - .unwrap(), - partition_spec: PartitionSpec { - spec_id: 0, - fields: vec![], - }, + schema: schema.clone(), + partition_spec: PartitionSpec::builder(schema).with_spec_id(0).build().unwrap(), content: ManifestContentType::Data, format_version: FormatVersion::V2, }, diff --git a/crates/iceberg/src/spec/mod.rs b/crates/iceberg/src/spec/mod.rs index 793f00d34..9b91d5443 100644 --- a/crates/iceberg/src/spec/mod.rs +++ b/crates/iceberg/src/spec/mod.rs @@ -25,6 +25,7 @@ mod schema; mod snapshot; mod sort; mod table_metadata; +mod table_metadata_builder; mod transform; mod values; mod view_metadata; diff --git a/crates/iceberg/src/spec/partition.rs b/crates/iceberg/src/spec/partition.rs index 0bb810776..770ba551a 100644 --- a/crates/iceberg/src/spec/partition.rs +++ b/crates/iceberg/src/spec/partition.rs @@ -24,7 +24,7 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use super::transform::Transform; -use super::{NestedField, Schema, StructType}; +use super::{NestedField, Schema, SchemaRef, StructType}; use crate::{Error, ErrorKind, Result}; pub(crate) const UNPARTITIONED_LAST_ASSIGNED_ID: i32 = 999; @@ -54,22 +54,51 @@ impl PartitionField { } } -/// Partition spec that defines how to produce a tuple of partition values from a record. -#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Default)] -#[serde(rename_all = "kebab-case")] +/// Partition spec that defines how to produce a tuple of partition values from a record. +/// `PartitionSpec` is bound to a specific schema. +#[derive(Debug, PartialEq, Eq, Clone)] pub struct PartitionSpec { /// Identifier for PartitionSpec - pub spec_id: i32, + spec_id: i32, + /// Details of the partition spec + fields: Vec, + /// The schema this partition spec is bound to + schema: SchemaRef, + /// Type of the partition spec + partition_type: StructType, +} + +/// Reference to [`SchemalessPartitionSpec`]. +pub type SchemalessPartitionSpecRef = Arc; +/// Partition spec that defines how to produce a tuple of partition values from a record. +/// Schemaless partition specs are never constructed manually. They occur when a table is mutated +/// and partition spec and schemas are updated. While old partition specs are retained, the bound +/// schema might not be available anymore as part of the table metadata. 
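// Editor's note: a short sketch of the round trip implied by the doc comment above,
// using the methods added in this file (`PartitionSpec::into_schemaless` to drop the
// bound schema, `SchemalessPartitionSpec::bind` to re-attach one); `schema` stands in
// for whatever schema the spec was originally built against:
let bound = PartitionSpec::builder(schema.clone())
    .with_spec_id(1)
    .add_partition_field("id", "id_bucket[16]", Transform::Bucket(16))
    .unwrap()
    .build()
    .unwrap();
let schemaless = bound.clone().into_schemaless();
// Re-binding against the same schema restores the original spec, including the
// assigned partition field ids and the spec id.
assert_eq!(bound, schemaless.bind(schema).unwrap());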
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] +#[serde(rename_all = "kebab-case")] +pub struct SchemalessPartitionSpec { + /// Identifier for PartitionSpec + spec_id: i32, /// Details of the partition spec - pub fields: Vec, + fields: Vec, } impl PartitionSpec { /// Create partition spec builder - pub fn builder(schema: &Schema) -> PartitionSpecBuilder { + pub fn builder(schema: impl Into) -> PartitionSpecBuilder { PartitionSpecBuilder::new(schema) } + /// Get a new unpartitioned partition spec + pub fn unpartition_spec(schema: impl Into) -> Self { + Self { + spec_id: DEFAULT_PARTITION_SPEC_ID, + fields: vec![], + schema: schema.into(), + partition_type: StructType::new(vec![]), + } + } + /// Spec id of the partition spec pub fn spec_id(&self) -> i32 { self.spec_id @@ -80,45 +109,32 @@ impl PartitionSpec { &self.fields } + /// The schema this partition spec is bound to + pub fn schema(&self) -> &Schema { + &self.schema + } + + /// The schema ref this partition spec is bound to + pub fn schema_ref(&self) -> &SchemaRef { + &self.schema + } + /// Returns if the partition spec is unpartitioned. /// /// A [`PartitionSpec`] is unpartitioned if it has no fields or all fields are [`Transform::Void`] transform. pub fn is_unpartitioned(&self) -> bool { - self.fields.is_empty() - || self - .fields - .iter() - .all(|f| matches!(f.transform, Transform::Void)) - } - - /// Returns the partition type of this partition spec. - pub fn partition_type(&self, schema: &Schema) -> Result { - let mut fields = Vec::with_capacity(self.fields.len()); - for partition_field in &self.fields { - let field = schema - .field_by_id(partition_field.source_id) - .ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - format!( - "No column with source column id {} in schema {:?}", - partition_field.source_id, schema - ), - ) - })?; - let res_type = partition_field.transform.result_type(&field.field_type)?; - let field = - NestedField::optional(partition_field.field_id, &partition_field.name, res_type) - .into(); - fields.push(field); - } - Ok(StructType::new(fields)) + >::is_unpartitioned(self) } /// Turn this partition spec into an unbound partition spec. /// /// The `field_id` is retained as `partition_id` in the unbound partition spec. pub fn into_unbound(self) -> UnboundPartitionSpec { + >::into_unbound(self) + } + + /// Turn this partition spec into a schemaless partition spec. + pub fn into_schemaless(self) -> SchemalessPartitionSpec { self.into() } @@ -131,49 +147,64 @@ impl PartitionSpec { /// * Field names /// * Source column ids /// * Transforms - pub fn is_compatible_with(&self, other: &UnboundPartitionSpec) -> bool { - if self.fields.len() != other.fields.len() { - return false; - } - - for (this_field, other_field) in self.fields.iter().zip(&other.fields) { - if this_field.source_id != other_field.source_id - || this_field.transform != other_field.transform - || this_field.name != other_field.name - { - return false; - } - } - - true + pub fn is_compatible_with>( + &self, + other: &T, + ) -> bool { + >::is_compatible_with(self, other) } /// Check if this partition spec has sequential partition ids. /// Sequential ids start from 1000 and increment by 1 for each field.
/// This is required for spec version 1 pub fn has_sequential_ids(&self) -> bool { - for (index, field) in self.fields.iter().enumerate() { - let expected_id = (UNPARTITIONED_LAST_ASSIGNED_ID as i64) - .checked_add(1) - .and_then(|id| id.checked_add(index as i64)) - .unwrap_or(i64::MAX); - - if field.field_id as i64 != expected_id { - return false; - } - } - - true + ::has_sequential_ids(self) } /// Get the highest field id in the partition spec. /// If the partition spec is unpartitioned, it returns the last unpartitioned last assigned id (999). pub fn highest_field_id(&self) -> i32 { - self.fields - .iter() - .map(|f| f.field_id) - .max() - .unwrap_or(UNPARTITIONED_LAST_ASSIGNED_ID) + ::highest_field_id(self) + } + + /// Returns the partition type of this partition spec. + pub fn partition_type(&self) -> &StructType { + &self.partition_type + } + + /// Set the spec id for the partition spec. + pub(crate) fn with_spec_id(self, spec_id: i32) -> Self { + Self { spec_id, ..self } + } +} + +impl SchemalessPartitionSpec { + /// Fields of the partition spec + pub fn fields(&self) -> &[PartitionField] { + &self.fields + } + + /// Spec id of the partition spec + pub fn spec_id(&self) -> i32 { + self.spec_id + } + + /// Bind this schemaless partition spec to a schema. + pub fn bind(self, schema: impl Into) -> Result { + PartitionSpecBuilder::new_from_unbound(self.into_unbound(), schema)?.build() + } + + /// Get a new unpartitioned partition spec + pub fn unpartition_spec() -> Self { + Self { + spec_id: DEFAULT_PARTITION_SPEC_ID, + fields: vec![], + } + } + + /// Returns the partition type of this partition spec. + pub fn partition_type(&self, schema: &Schema) -> Result { + PartitionSpecBuilder::partition_type(&self.fields, schema) + } } @@ -212,7 +243,7 @@ impl UnboundPartitionSpec { } /// Bind this unbound partition spec to a schema. - pub fn bind(self, schema: &Schema) -> Result { + pub fn bind(self, schema: impl Into) -> Result { PartitionSpecBuilder::new_from_unbound(self, schema)?.build() } @@ -235,6 +266,179 @@ impl UnboundPartitionSpec { } } +/// Trait for common functions between [`PartitionSpec`], [`UnboundPartitionSpec`] and [`SchemalessPartitionSpec`] +pub trait UnboundPartitionSpecInterface { + /// Fields of the partition spec + fn fields(&self) -> &[T]; + + /// Turn this partition spec into an unbound partition spec. + fn into_unbound(self) -> UnboundPartitionSpec; + + /// Returns if the partition spec is unpartitioned. + /// + /// A spec is unpartitioned if it has no fields or all fields are [`Transform::Void`] transform. + fn is_unpartitioned(&self) -> bool { + self.fields().is_empty() + || self + .fields() + .iter() + .all(|f| matches!(f.transform(), Transform::Void)) + } + + /// Check if this partition spec is compatible with another partition spec. + /// + /// Returns true if the partition spec is equal to the other spec with partition field ids ignored and + /// spec_id ignored.
The following must be identical: + /// * The number of fields + /// * Field order + /// * Field names + /// * Source column ids + /// * Transforms + fn is_compatible_with>( + &self, + other: &O, + ) -> bool { + if self.fields().len() != other.fields().len() { + return false; + } + + for (this_field, other_field) in self.fields().iter().zip(other.fields()) { + if this_field.source_id() != other_field.source_id() + || this_field.transform() != other_field.transform() + || this_field.name() != other_field.name() + { + return false; + } + } + + true + } +} + +/// Trait for common functions between [`PartitionSpec`] and [`SchemalessPartitionSpec`] +pub trait PartitionSpecInterface: UnboundPartitionSpecInterface { + /// Spec id of the partition spec + fn spec_id(&self) -> i32; + + /// Check if this partition spec has sequential partition ids. + /// Sequential ids start from 1000 and increment by 1 for each field. + /// This is required for spec version 1 + fn has_sequential_ids(&self) -> bool { + for (index, field) in self.fields().iter().enumerate() { + let expected_id = (UNPARTITIONED_LAST_ASSIGNED_ID as i64) + .checked_add(1) + .and_then(|id| id.checked_add(index as i64)) + .unwrap_or(i64::MAX); + + if field.field_id as i64 != expected_id { + return false; + } + } + + true + } + + /// Get the highest field id in the partition spec. + /// If the partition spec is unpartitioned, it returns the unpartitioned last assigned id (999). + fn highest_field_id(&self) -> i32 { + self.fields() + .iter() + .map(|f| f.field_id) + .max() + .unwrap_or(UNPARTITIONED_LAST_ASSIGNED_ID) + } +} + +impl PartitionSpecInterface for PartitionSpec { + fn spec_id(&self) -> i32 { + self.spec_id + } +} + +impl PartitionSpecInterface for SchemalessPartitionSpec { + fn spec_id(&self) -> i32 { + self.spec_id + } +} + +impl UnboundPartitionSpecInterface for PartitionSpec { + fn fields(&self) -> &[PartitionField] { + &self.fields + } + + fn into_unbound(self) -> UnboundPartitionSpec { + self.into() + } +} + +impl UnboundPartitionSpecInterface for UnboundPartitionSpec { + fn fields(&self) -> &[UnboundPartitionField] { + &self.fields + } + + fn into_unbound(self) -> UnboundPartitionSpec { + self + } +} + +impl UnboundPartitionSpecInterface for SchemalessPartitionSpec { + fn fields(&self) -> &[PartitionField] { + &self.fields + } + + fn into_unbound(self) -> UnboundPartitionSpec { + self.into() + } +} + +/// Trait for common functions between [`PartitionField`] and [`UnboundPartitionField`] +pub trait PartitionFieldInterface { + /// A source column id from the table’s schema + fn source_id(&self) -> i32; + /// A partition name. + fn name(&self) -> &str; + /// A transform that is applied to the source column to produce a partition value.
+ fn transform(&self) -> &Transform; + /// Convert to unbound partition field + fn into_unbound(self) -> UnboundPartitionField; +} + +impl PartitionFieldInterface for PartitionField { + fn source_id(&self) -> i32 { + self.source_id + } + + fn name(&self) -> &str { + &self.name + } + + fn transform(&self) -> &Transform { + &self.transform + } + + fn into_unbound(self) -> UnboundPartitionField { + self.into() + } +} + +impl PartitionFieldInterface for UnboundPartitionField { + fn source_id(&self) -> i32 { + self.source_id + } + + fn name(&self) -> &str { + &self.name + } + + fn transform(&self) -> &Transform { + &self.transform + } + + fn into_unbound(self) -> UnboundPartitionField { + self + } +} + impl From for UnboundPartitionField { fn from(field: PartitionField) -> Self { UnboundPartitionField { @@ -255,6 +459,24 @@ impl From for UnboundPartitionSpec { } } +impl From for UnboundPartitionSpec { + fn from(spec: SchemalessPartitionSpec) -> Self { + UnboundPartitionSpec { + spec_id: Some(spec.spec_id), + fields: spec.fields.into_iter().map(Into::into).collect(), + } + } +} + +impl From for SchemalessPartitionSpec { + fn from(spec: PartitionSpec) -> Self { + SchemalessPartitionSpec { + spec_id: spec.spec_id, + fields: spec.fields, + } + } +} + /// Create a new UnboundPartitionSpec #[derive(Debug, Default)] pub struct UnboundPartitionSpecBuilder { @@ -326,26 +548,29 @@ impl UnboundPartitionSpecBuilder { /// Create valid partition specs for a given schema. #[derive(Debug)] -pub struct PartitionSpecBuilder<'a> { +pub struct PartitionSpecBuilder { spec_id: Option, last_assigned_field_id: i32, fields: Vec, - schema: &'a Schema, + schema: SchemaRef, } -impl<'a> PartitionSpecBuilder<'a> { +impl PartitionSpecBuilder { /// Create a new partition spec builder with the given schema. - pub fn new(schema: &'a Schema) -> Self { + pub fn new(schema: impl Into) -> Self { Self { spec_id: None, fields: vec![], last_assigned_field_id: UNPARTITIONED_LAST_ASSIGNED_ID, - schema, + schema: schema.into(), } } /// Create a new partition spec builder from an existing unbound partition spec. - pub fn new_from_unbound(unbound: UnboundPartitionSpec, schema: &'a Schema) -> Result { + pub fn new_from_unbound( + unbound: UnboundPartitionSpec, + schema: impl Into, + ) -> Result { let mut builder = Self::new(schema).with_spec_id(unbound.spec_id.unwrap_or(DEFAULT_PARTITION_SPEC_ID)); @@ -408,8 +633,8 @@ impl<'a> PartitionSpecBuilder<'a> { pub fn add_unbound_field(mut self, field: UnboundPartitionField) -> Result { self.check_name_set_and_unique(&field.name)?; self.check_for_redundant_partitions(field.source_id, &field.transform)?; - Self::check_name_does_not_collide_with_schema(&field, self.schema)?; - Self::check_transform_compatibility(&field, self.schema)?; + Self::check_name_does_not_collide_with_schema(&field, &self.schema)?; + Self::check_transform_compatibility(&field, &self.schema)?; if let Some(partition_field_id) = field.field_id { self.check_partition_id_unique(partition_field_id)?; } @@ -434,9 +659,12 @@ impl<'a> PartitionSpecBuilder<'a> { /// Build a bound partition spec with the given schema. 
pub fn build(self) -> Result { let fields = Self::set_field_ids(self.fields, self.last_assigned_field_id)?; + let partition_type = Self::partition_type(&fields, &self.schema)?; Ok(PartitionSpec { spec_id: self.spec_id.unwrap_or(DEFAULT_PARTITION_SPEC_ID), fields, + partition_type, + schema: self.schema, }) } @@ -485,6 +713,32 @@ impl<'a> PartitionSpecBuilder<'a> { Ok(bound_fields) } + /// Returns the partition type of this partition spec. + fn partition_type(fields: &Vec, schema: &Schema) -> Result { + let mut struct_fields = Vec::with_capacity(fields.len()); + for partition_field in fields { + let field = schema + .field_by_id(partition_field.source_id) + .ok_or_else(|| { + Error::new( + // This should never occur as check_transform_compatibility + // already ensures that the source field exists in the schema + ErrorKind::Unexpected, + format!( + "No column with source column id {} in schema {:?}", + partition_field.source_id, schema + ), + ) + })?; + let res_type = partition_field.transform.result_type(&field.field_type)?; + let field = + NestedField::optional(partition_field.field_id, &partition_field.name, res_type) + .into(); + struct_fields.push(field); + } + Ok(StructType::new(struct_fields)) + } + /// Ensure that the partition name is unique among columns in the schema. /// Duplicate names are allowed if: /// 1. The column is sourced from the column with the same name. @@ -622,7 +876,7 @@ trait CorePartitionSpecValidator { fn fields(&self) -> &Vec; } -impl CorePartitionSpecValidator for PartitionSpecBuilder<'_> { +impl CorePartitionSpecValidator for PartitionSpecBuilder { fn fields(&self) -> &Vec { &self.fields } @@ -637,7 +891,7 @@ impl CorePartitionSpecValidator for UnboundPartitionSpecBuilder { #[cfg(test)] mod tests { use super::*; - use crate::spec::Type; + use crate::spec::{PrimitiveType, Type}; #[test] fn test_partition_spec() { @@ -663,7 +917,7 @@ mod tests { } "#; - let partition_spec: PartitionSpec = serde_json::from_str(spec).unwrap(); + let partition_spec: SchemalessPartitionSpec = serde_json::from_str(spec).unwrap(); assert_eq!(4, partition_spec.fields[0].source_id); assert_eq!(1000, partition_spec.fields[0].field_id); assert_eq!("ts_day", partition_spec.fields[0].name); @@ -695,7 +949,7 @@ mod tests { ]) .build() .unwrap(); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .build() .unwrap(); @@ -704,7 +958,7 @@ mod tests { "Empty partition spec should be unpartitioned" ); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(schema.clone()) .add_unbound_fields(vec![ UnboundPartitionField::builder() .source_id(1) @@ -726,7 +980,7 @@ mod tests { "Partition spec with one non void transform should not be unpartitioned" ); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_fields(vec![ UnboundPartitionField::builder() @@ -809,6 +1063,32 @@ mod tests { assert_eq!(Transform::Day, partition_spec.fields[0].transform); } + #[test] + fn test_new_unpartition() { + let schema = Schema::builder() + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(crate::spec::PrimitiveType::Int)) + .into(), + NestedField::required( + 2, + "name", + Type::Primitive(crate::spec::PrimitiveType::String), + ) + .into(), + ]) + .build() + .unwrap(); + let partition_spec = PartitionSpec::builder(schema.clone()) + .with_spec_id(0) + .build() + .unwrap(); + let 
partition_type = partition_spec.partition_type(); + assert_eq!(0, partition_type.fields().len()); + + let unpartition_spec = PartitionSpec::unpartition_spec(schema); + assert_eq!(partition_spec, unpartition_spec); + } + #[test] fn test_partition_type() { let spec = r#" @@ -833,7 +1113,7 @@ mod tests { } "#; - let partition_spec: PartitionSpec = serde_json::from_str(spec).unwrap(); + let partition_spec: SchemalessPartitionSpec = serde_json::from_str(spec).unwrap(); let schema = Schema::builder() .with_fields(vec![ NestedField::required(1, "id", Type::Primitive(crate::spec::PrimitiveType::Int)) @@ -909,7 +1189,7 @@ mod tests { } "#; - let partition_spec: PartitionSpec = serde_json::from_str(spec).unwrap(); + let partition_spec: SchemalessPartitionSpec = serde_json::from_str(spec).unwrap(); let schema = Schema::builder() .with_fields(vec![ NestedField::required(1, "id", Type::Primitive(crate::spec::PrimitiveType::Int)) @@ -976,7 +1256,7 @@ mod tests { } "#; - let partition_spec: PartitionSpec = serde_json::from_str(spec).unwrap(); + let partition_spec: SchemalessPartitionSpec = serde_json::from_str(spec).unwrap(); let schema = Schema::builder() .with_fields(vec![ NestedField::required(1, "id", Type::Primitive(crate::spec::PrimitiveType::Int)) @@ -994,6 +1274,50 @@ mod tests { assert!(partition_spec.partition_type(&schema).is_err()); } + #[test] + fn test_schemaless_bind_schema_keeps_field_ids_and_spec_id() { + let schema: Schema = Schema::builder() + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(crate::spec::PrimitiveType::Int)) + .into(), + NestedField::required( + 2, + "name", + Type::Primitive(crate::spec::PrimitiveType::String), + ) + .into(), + ]) + .build() + .unwrap(); + + let partition_spec = PartitionSpec::builder(schema.clone()) + .with_spec_id(99) + .add_unbound_field(UnboundPartitionField { + source_id: 1, + field_id: Some(1010), + name: "id".to_string(), + transform: Transform::Identity, + }) + .unwrap() + .add_unbound_field(UnboundPartitionField { + source_id: 2, + field_id: Some(1001), + name: "name_void".to_string(), + transform: Transform::Void, + }) + .unwrap() + .build() + .unwrap(); + + let schemaless_partition_spec = SchemalessPartitionSpec::from(partition_spec.clone()); + let bound_partition_spec = schemaless_partition_spec.bind(schema).unwrap(); + + assert_eq!(partition_spec, bound_partition_spec); + assert_eq!(partition_spec.fields[0].field_id, 1010); + assert_eq!(partition_spec.fields[1].field_id, 1001); + assert_eq!(bound_partition_spec.spec_id(), 99); + } + #[test] fn test_builder_disallow_duplicate_names() { UnboundPartitionSpec::builder() @@ -1018,7 +1342,7 @@ mod tests { ]) .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .add_unbound_field(UnboundPartitionField { source_id: 1, field_id: Some(1000), @@ -1056,7 +1380,7 @@ mod tests { ]) .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1104,12 +1428,12 @@ mod tests { .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .with_spec_id(1) .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_partition_field("id", "id_bucket[16]", Transform::Bucket(16)) .unwrap() @@ -1118,12 +1442,19 @@ mod tests { assert_eq!(spec, PartitionSpec { spec_id: 1, + schema: schema.into(), fields: 
vec![PartitionField { source_id: 1, field_id: 1000, name: "id_bucket[16]".to_string(), transform: Transform::Bucket(16), - }] + }], + partition_type: StructType::new(vec![NestedField::optional( + 1000, + "id_bucket[16]", + Type::Primitive(PrimitiveType::Int) + ) + .into()]) }); } @@ -1139,12 +1470,12 @@ mod tests { .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .with_spec_id(1) .build() .unwrap(); - let err = PartitionSpec::builder(&schema) + let err = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1172,12 +1503,12 @@ mod tests { .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .with_spec_id(1) .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1190,7 +1521,7 @@ mod tests { .unwrap(); // Not OK for different source id - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 2, @@ -1224,7 +1555,7 @@ mod tests { .unwrap(); // Valid - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_fields(vec![ UnboundPartitionField { @@ -1245,7 +1576,7 @@ mod tests { .unwrap(); // Invalid - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_fields(vec![ UnboundPartitionField { @@ -1291,7 +1622,7 @@ mod tests { .build() .unwrap(); - PartitionSpec::builder(&schema) + PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1342,7 +1673,7 @@ mod tests { .build() .unwrap(); - let partition_spec_1 = PartitionSpec::builder(&schema) + let partition_spec_1 = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1354,7 +1685,7 @@ mod tests { .build() .unwrap(); - let partition_spec_2 = PartitionSpec::builder(&schema) + let partition_spec_2 = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1381,7 +1712,7 @@ mod tests { .build() .unwrap(); - let partition_spec_1 = PartitionSpec::builder(&schema) + let partition_spec_1 = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1393,7 +1724,7 @@ mod tests { .build() .unwrap(); - let partition_spec_2 = PartitionSpec::builder(&schema) + let partition_spec_2 = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1424,7 +1755,7 @@ mod tests { .build() .unwrap(); - let partition_spec_1 = PartitionSpec::builder(&schema) + let partition_spec_1 = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1436,7 +1767,7 @@ mod tests { .build() .unwrap(); - let partition_spec_2 = PartitionSpec::builder(&schema) + let partition_spec_2 = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 2, @@ -1467,7 +1798,7 @@ mod tests { .build() .unwrap(); - let partition_spec_1 = PartitionSpec::builder(&schema) + let partition_spec_1 = PartitionSpec::builder(schema.clone()) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1486,7 +1817,7 @@ mod tests { .build() .unwrap(); - let partition_spec_2 = 
PartitionSpec::builder(&schema) + let partition_spec_2 = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 2, @@ -1510,7 +1841,7 @@ mod tests { #[test] fn test_highest_field_id_unpartitioned() { - let spec = PartitionSpec::builder(&Schema::builder().with_fields(vec![]).build().unwrap()) + let spec = PartitionSpec::builder(Schema::builder().with_fields(vec![]).build().unwrap()) .with_spec_id(1) .build() .unwrap(); @@ -1534,7 +1865,7 @@ mod tests { .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1572,7 +1903,7 @@ mod tests { .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1612,7 +1943,7 @@ mod tests { .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, @@ -1652,7 +1983,7 @@ mod tests { .build() .unwrap(); - let spec = PartitionSpec::builder(&schema) + let spec = PartitionSpec::builder(schema) .with_spec_id(1) .add_unbound_field(UnboundPartitionField { source_id: 1, diff --git a/crates/iceberg/src/spec/schema.rs b/crates/iceberg/src/spec/schema.rs index 63a9e3cb4..d78a2545a 100644 --- a/crates/iceberg/src/spec/schema.rs +++ b/crates/iceberg/src/spec/schema.rs @@ -39,7 +39,7 @@ use crate::{ensure_data_valid, Error, ErrorKind}; pub type SchemaId = i32; /// Reference to [`Schema`]. pub type SchemaRef = Arc; -const DEFAULT_SCHEMA_ID: SchemaId = 0; +pub(crate) const DEFAULT_SCHEMA_ID: SchemaId = 0; /// Defines schema in iceberg. #[derive(Debug, Serialize, Deserialize, Clone)] @@ -77,6 +77,7 @@ pub struct SchemaBuilder { fields: Vec, alias_to_id: BiHashMap, identifier_field_ids: HashSet, + reassign_field_ids_from: Option, } impl SchemaBuilder { @@ -86,6 +87,16 @@ impl SchemaBuilder { self } + /// Reassign all field-ids (including nested) on build. + /// Reassignment starts from the field-id specified in `start_from` (inclusive). + /// + /// All specified aliases and identifier fields will be updated to the new field-ids. + #[allow(dead_code)] // Will be needed in TableMetadataBuilder + pub(crate) fn with_reassigned_field_ids(mut self, start_from: u32) -> Self { + self.reassign_field_ids_from = Some(start_from.try_into().unwrap_or(i32::MAX)); + self + } + /// Set schema id. 
pub fn with_schema_id(mut self, schema_id: i32) -> Self { self.schema_id = schema_id; @@ -130,7 +141,7 @@ impl SchemaBuilder { let highest_field_id = id_to_field.keys().max().cloned().unwrap_or(0); - Ok(Schema { + let mut schema = Schema { r#struct, schema_id: self.schema_id, highest_field_id, @@ -143,7 +154,24 @@ id_to_name, field_id_to_accessor, - }) + }; + + if let Some(start_from) = self.reassign_field_ids_from { + let mut id_reassigner = ReassignFieldIds::new(start_from); + let new_fields = id_reassigner.reassign_field_ids(schema.r#struct.fields().to_vec())?; + let new_identifier_field_ids = + id_reassigner.apply_to_identifier_fields(schema.identifier_field_ids)?; + let new_alias_to_id = id_reassigner.apply_to_aliases(schema.alias_to_id.clone())?; + + schema = Schema::builder() + .with_schema_id(schema.schema_id) + .with_fields(new_fields) + .with_identifier_field_ids(new_identifier_field_ids) + .with_alias(new_alias_to_id) + .build()?; + } + + Ok(schema) } fn build_accessors(&self) -> HashMap> { @@ -265,6 +293,7 @@ impl Schema { fields: vec![], identifier_field_ids: HashSet::default(), alias_to_id: BiHashMap::default(), + reassign_field_ids_from: None, } } @@ -275,6 +304,7 @@ fields: self.r#struct.fields().to_vec(), alias_to_id: self.alias_to_id, identifier_field_ids: self.identifier_field_ids, + reassign_field_ids_from: None, } } @@ -346,6 +376,24 @@ pub fn accessor_by_field_id(&self, field_id: i32) -> Option> { self.field_id_to_accessor.get(&field_id).cloned() } + + /// Check if this schema is identical to another schema semantically, excluding schema id. + pub(crate) fn is_same_schema(&self, other: &SchemaRef) -> bool { + self.as_struct().eq(other.as_struct()) + && self.identifier_field_ids().eq(other.identifier_field_ids()) + } + + /// Change the schema id of this schema. + // This is redundant with the `with_schema_id` method on the builder, but useful + // as it is infallible in contrast to the builder `build()` method. + pub(crate) fn with_schema_id(self, schema_id: SchemaId) -> Self { + Self { schema_id, ..self } + } + + /// Returns a HashMap mapping field ids to field names.
+ pub(crate) fn field_id_to_name_map(&self) -> &HashMap { + &self.id_to_name + } } impl Display for Schema { @@ -475,8 +523,7 @@ pub fn index_by_id(r#struct: &StructType) -> Result } fn field(&mut self, field: &NestedFieldRef, _value: ()) -> Result<()> { - self.0.insert(field.id, field.clone()); - Ok(()) + try_insert_field(&mut self.0, field.id, field.clone()) } fn r#struct(&mut self, _struct: &StructType, _results: Vec) -> Result { @@ -484,15 +531,16 @@ pub fn index_by_id(r#struct: &StructType) -> Result } fn list(&mut self, list: &ListType, _value: Self::T) -> Result { - self.0 - .insert(list.element_field.id, list.element_field.clone()); - Ok(()) + try_insert_field( + &mut self.0, + list.element_field.id, + list.element_field.clone(), + ) } fn map(&mut self, map: &MapType, _key_value: Self::T, _value: Self::T) -> Result { - self.0.insert(map.key_field.id, map.key_field.clone()); - self.0.insert(map.value_field.id, map.value_field.clone()); - Ok(()) + try_insert_field(&mut self.0, map.key_field.id, map.key_field.clone())?; + try_insert_field(&mut self.0, map.value_field.id, map.value_field.clone()) } fn primitive(&mut self, _: &PrimitiveType) -> Result { @@ -943,6 +991,148 @@ impl SchemaVisitor for PruneColumn { } } +struct ReassignFieldIds { + next_field_id: i32, + old_to_new_id: HashMap, +} + +fn try_insert_field(map: &mut HashMap, field_id: i32, value: V) -> Result<()> { + map.insert(field_id, value).map_or_else( + || Ok(()), + |_| { + Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Found duplicate 'field.id' {}. Field ids must be unique.", + field_id + ), + )) + }, + ) +} + +// We are not using the visitor here, as post order traversal is not desired. +// Instead we want to re-assign all fields on one level first before diving deeper. 
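// Editor's note: a small illustration of the level-order numbering described above,
// assuming `with_reassigned_field_ids(0)` as wired up in this patch (the method is
// crate-private, so this only compiles from within the crate):
let schema = Schema::builder()
    .with_fields(vec![
        NestedField::required(10, "outer_a", Type::Primitive(PrimitiveType::Int)).into(),
        NestedField::required(
            20,
            "outer_b",
            Type::Struct(StructType::new(vec![NestedField::required(
                30,
                "inner",
                Type::Primitive(PrimitiveType::Int),
            )
            .into()])),
        )
        .into(),
    ])
    .build()
    .unwrap();
let reassigned = schema
    .into_builder()
    .with_reassigned_field_ids(0)
    .build()
    .unwrap();
// Both top-level fields are renumbered first (outer_a -> 0, outer_b -> 1), and only
// then the nested field (inner -> 2).
assert_eq!("inner", reassigned.field_by_id(2).unwrap().name);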
+impl ReassignFieldIds { + fn new(start_from: i32) -> Self { + Self { + next_field_id: start_from, + old_to_new_id: HashMap::new(), + } + } + + fn reassign_field_ids(&mut self, fields: Vec) -> Result> { + // Visit fields on the same level first + let outer_fields = fields + .into_iter() + .map(|field| { + try_insert_field(&mut self.old_to_new_id, field.id, self.next_field_id)?; + let new_field = Arc::unwrap_or_clone(field).with_id(self.next_field_id); + self.increase_next_field_id()?; + Ok(Arc::new(new_field)) + }) + .collect::>>()?; + + // Now visit nested fields + outer_fields + .into_iter() + .map(|field| { + if field.field_type.is_primitive() { + Ok(field) + } else { + let mut new_field = Arc::unwrap_or_clone(field); + *new_field.field_type = self.reassign_ids_visit_type(*new_field.field_type)?; + Ok(Arc::new(new_field)) + } + }) + .collect() + } + + fn reassign_ids_visit_type(&mut self, field_type: Type) -> Result { + match field_type { + Type::Primitive(s) => Ok(Type::Primitive(s)), + Type::Struct(s) => { + let new_fields = self.reassign_field_ids(s.fields().to_vec())?; + Ok(Type::Struct(StructType::new(new_fields))) + } + Type::List(l) => { + self.old_to_new_id + .insert(l.element_field.id, self.next_field_id); + let mut element_field = Arc::unwrap_or_clone(l.element_field); + element_field.id = self.next_field_id; + self.increase_next_field_id()?; + *element_field.field_type = + self.reassign_ids_visit_type(*element_field.field_type)?; + Ok(Type::List(ListType { + element_field: Arc::new(element_field), + })) + } + Type::Map(m) => { + self.old_to_new_id + .insert(m.key_field.id, self.next_field_id); + let mut key_field = Arc::unwrap_or_clone(m.key_field); + key_field.id = self.next_field_id; + self.increase_next_field_id()?; + *key_field.field_type = self.reassign_ids_visit_type(*key_field.field_type)?; + + self.old_to_new_id + .insert(m.value_field.id, self.next_field_id); + let mut value_field = Arc::unwrap_or_clone(m.value_field); + value_field.id = self.next_field_id; + self.increase_next_field_id()?; + *value_field.field_type = self.reassign_ids_visit_type(*value_field.field_type)?; + + Ok(Type::Map(MapType { + key_field: Arc::new(key_field), + value_field: Arc::new(value_field), + })) + } + } + } + + fn increase_next_field_id(&mut self) -> Result<()> { + self.next_field_id = self.next_field_id.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Field ID overflowed, cannot add more fields", + ) + })?; + Ok(()) + } + + fn apply_to_identifier_fields(&self, field_ids: HashSet) -> Result> { + field_ids + .into_iter() + .map(|id| { + self.old_to_new_id.get(&id).copied().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Identifier Field ID {} not found", id), + ) + }) + }) + .collect() + } + + fn apply_to_aliases(&self, alias: BiHashMap) -> Result> { + alias + .into_iter() + .map(|(name, id)| { + self.old_to_new_id + .get(&id) + .copied() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Field with id {} for alias {} not found", id, name), + ) + }) + .map(|new_id| (name, new_id)) + }) + .collect() + } +} + pub(super) mod _serde { /// This is a helper module that defines types to help with serialization/deserialization. 
/// For deserialization the input first gets read into either the [SchemaV1] or [SchemaV2] struct @@ -1062,6 +1252,8 @@ pub(super) mod _serde { mod tests { use std::collections::{HashMap, HashSet}; + use bimap::BiHashMap; + use super::DEFAULT_SCHEMA_ID; use crate::spec::datatypes::Type::{List, Map, Primitive, Struct}; use crate::spec::datatypes::{ @@ -2237,4 +2429,203 @@ table { let schema = table_schema_simple().0; assert_eq!(3, schema.highest_field_id()); } + + #[test] + fn test_highest_field_id_no_fields() { + let schema = Schema::builder().with_schema_id(1).build().unwrap(); + assert_eq!(0, schema.highest_field_id()); + } + + #[test] + fn test_reassign_ids() { + let schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![3]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::optional(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(4, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap(); + + let reassigned_schema = schema + .into_builder() + .with_reassigned_field_ids(0) + .build() + .unwrap(); + + let expected = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 1)])) + .with_fields(vec![ + NestedField::optional(0, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(1, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap(); + + pretty_assertions::assert_eq!(expected, reassigned_schema); + assert_eq!(reassigned_schema.highest_field_id(), 2); + } + + #[test] + fn test_reassigned_ids_nested() { + let schema = table_schema_nested(); + let reassigned_schema = schema + .into_builder() + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 2)])) + .with_reassigned_field_ids(0) + .build() + .unwrap(); + + let expected = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 1)])) + .with_fields(vec![ + NestedField::optional(0, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(1, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::required( + 3, + "qux", + Type::List(ListType { + element_field: NestedField::list_element( + 7, + Type::Primitive(PrimitiveType::String), + true, + ) + .into(), + }), + ) + .into(), + NestedField::required( + 4, + "quux", + Type::Map(MapType { + key_field: NestedField::map_key_element( + 8, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 9, + Type::Map(MapType { + key_field: NestedField::map_key_element( + 10, + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::map_value_element( + 11, + Type::Primitive(PrimitiveType::Int), + true, + ) + .into(), + }), + true, + ) + .into(), + }), + ) + .into(), + NestedField::required( + 5, + "location", + Type::List(ListType { + element_field: NestedField::list_element( + 12, + Type::Struct(StructType::new(vec![ + NestedField::optional( + 13, + "latitude", + Type::Primitive(PrimitiveType::Float), + ) + .into(), + NestedField::optional( + 14, + "longitude", + 
Type::Primitive(PrimitiveType::Float), + ) + .into(), + ])), + true, + ) + .into(), + }), + ) + .into(), + NestedField::optional( + 6, + "person", + Type::Struct(StructType::new(vec![ + NestedField::optional(15, "name", Type::Primitive(PrimitiveType::String)) + .into(), + NestedField::required(16, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(); + + pretty_assertions::assert_eq!(expected, reassigned_schema); + assert_eq!(reassigned_schema.highest_field_id(), 16); + assert_eq!(reassigned_schema.field_by_id(6).unwrap().name, "person"); + assert_eq!(reassigned_schema.field_by_id(16).unwrap().name, "age"); + } + + #[test] + fn test_reassign_ids_fails_with_duplicate_ids() { + let reassigned_schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![5]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::required(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .with_reassigned_field_ids(0) + .build() + .unwrap_err(); + + assert!(reassigned_schema.message().contains("'field.id' 3")); + } + + #[test] + fn test_field_ids_must_be_unique() { + let reassigned_schema = Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![5]) + .with_alias(BiHashMap::from_iter(vec![("bar_alias".to_string(), 3)])) + .with_fields(vec![ + NestedField::required(5, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(3, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + ]) + .build() + .unwrap_err(); + + assert!(reassigned_schema.message().contains("'field.id' 3")); + } + + #[test] + fn test_reassign_ids_empty_schema() { + let schema = Schema::builder().with_schema_id(1).build().unwrap(); + let reassigned_schema = schema + .clone() + .into_builder() + .with_reassigned_field_ids(0) + .build() + .unwrap(); + + assert_eq!(schema, reassigned_schema); + assert_eq!(schema.highest_field_id(), 0); + } } diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index a4de25415..cdad4c4ef 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -30,19 +30,13 @@ use serde_repr::{Deserialize_repr, Serialize_repr}; use uuid::Uuid; use super::snapshot::SnapshotReference; +pub use super::table_metadata_builder::TableMetadataBuilder; use super::{ - PartitionSpec, PartitionSpecRef, SchemaId, SchemaRef, Snapshot, SnapshotRef, SnapshotRetention, - SortOrder, SortOrderRef, DEFAULT_PARTITION_SPEC_ID, + PartitionSpecRef, SchemaId, SchemaRef, SchemalessPartitionSpecRef, Snapshot, SnapshotRef, + SnapshotRetention, SortOrder, SortOrderRef, DEFAULT_PARTITION_SPEC_ID, MAIN_BRANCH, }; use crate::error::{timestamp_ms_to_utc, Result}; -use crate::{Error, ErrorKind, TableCreation}; - -/// Main branch name -pub static MAIN_BRANCH: &str = "main"; -/// Default spec id (unpartitioned) -pub static DEFAULT_SPEC_ID: i32 = 0; -/// Default sort order id (unsorted) -pub static DEFAULT_SORT_ORDER_ID: i64 = 0; +use crate::{Error, ErrorKind}; pub(crate) static ONE_MINUTE_MS: i64 = 60_000; @@ -125,9 +119,9 @@ pub struct TableMetadata { /// ID of the table’s current schema. 
pub current_schema_id: i32, /// A list of partition specs, stored as full partition spec objects. - pub partition_specs: HashMap, + pub(crate) partition_specs: HashMap, /// ID of the “current” spec that writers should use by default. - pub default_spec_id: i32, + pub(crate) default_spec: PartitionSpecRef, /// An integer; the highest assigned partition field ID across all partition specs for the table. pub last_partition_id: i32, ///A string to string map of table properties. This is used to control settings that @@ -172,6 +166,15 @@ pub struct TableMetadata { } impl TableMetadata { + /// Convert this Table Metadata into a builder for modification. + /// + /// `current_file_location` is the location where the current version + /// of the metadata file is stored. This is used to update the metadata log. + #[must_use] + pub fn into_builder(self, current_file_location: impl Into) -> TableMetadataBuilder { + TableMetadataBuilder::new_from_metadata(self, current_file_location) + } + /// Returns format version of this metadata. #[inline] pub fn format_version(&self) -> FormatVersion { @@ -229,21 +232,26 @@ impl TableMetadata { /// Returns all partition specs. #[inline] - pub fn partition_specs_iter(&self) -> impl Iterator { + pub fn partition_specs_iter(&self) -> impl Iterator { self.partition_specs.values() } /// Lookup partition spec by id. #[inline] - pub fn partition_spec_by_id(&self, spec_id: i32) -> Option<&PartitionSpecRef> { + pub fn partition_spec_by_id(&self, spec_id: i32) -> Option<&SchemalessPartitionSpecRef> { self.partition_specs.get(&spec_id) } /// Get default partition spec #[inline] pub fn default_partition_spec(&self) -> &PartitionSpecRef { - self.partition_spec_by_id(self.default_spec_id) - .expect("Default partition spec id set, but not found in table metadata") + &self.default_spec + } + + #[inline] + /// Returns spec id of the "current" partition spec. + pub fn default_partition_spec_id(&self) -> i32 { + self.default_spec.spec_id() } /// Returns all snapshots @@ -359,29 +367,18 @@ impl TableMetadata { Ok(self) } - /// If the default partition spec is specified but the spec is not present in specs, add it + /// If the default partition spec is not present in specs, add it fn try_normalize_partition_spec(&mut self) -> Result<()> { - if self.partition_spec_by_id(self.default_spec_id).is_some() { - return Ok(()); - } - - if self.default_spec_id != DEFAULT_PARTITION_SPEC_ID { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "No partition spec exists with the default spec id {}.", - self.default_spec_id - ), - )); + if self + .partition_spec_by_id(self.default_spec.spec_id()) + .is_none() + { + self.partition_specs.insert( + self.default_spec.spec_id(), + Arc::new(Arc::unwrap_or_clone(self.default_spec.clone()).into_schemaless()), + ); } - let partition_spec = PartitionSpec { - spec_id: DEFAULT_PARTITION_SPEC_ID, - fields: vec![], - }; - self.partition_specs - .insert(DEFAULT_PARTITION_SPEC_ID, Arc::new(partition_spec)); - Ok(()) } @@ -552,99 +549,6 @@ impl TableMetadata { } } -/// Manipulating table metadata. -pub struct TableMetadataBuilder(TableMetadata); - -impl TableMetadataBuilder { - /// Creates a new table metadata builder from the given table metadata. - pub fn new(origin: TableMetadata) -> Self { - Self(origin) - } - - /// Creates a new table metadata builder from the given table creation. 
- pub fn from_table_creation(table_creation: TableCreation) -> Result { - let TableCreation { - name: _, - location, - schema, - partition_spec, - sort_order, - properties, - } = table_creation; - - let partition_specs = match partition_spec { - Some(_) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - "Can't create table with partition spec now", - )) - } - None => HashMap::from([( - DEFAULT_PARTITION_SPEC_ID, - Arc::new(PartitionSpec { - spec_id: DEFAULT_PARTITION_SPEC_ID, - fields: vec![], - }), - )]), - }; - - let sort_orders = match sort_order { - Some(_) => { - return Err(Error::new( - ErrorKind::FeatureUnsupported, - "Can't create table with sort order now", - )) - } - None => HashMap::from([( - SortOrder::UNSORTED_ORDER_ID, - Arc::new(SortOrder::unsorted_order()), - )]), - }; - - let mut table_metadata = TableMetadata { - format_version: FormatVersion::V2, - table_uuid: Uuid::now_v7(), - location: location.ok_or_else(|| { - Error::new( - ErrorKind::DataInvalid, - "Can't create table without location", - ) - })?, - last_sequence_number: 0, - last_updated_ms: Utc::now().timestamp_millis(), - last_column_id: schema.highest_field_id(), - current_schema_id: schema.schema_id(), - schemas: HashMap::from([(schema.schema_id(), Arc::new(schema))]), - partition_specs, - default_spec_id: DEFAULT_PARTITION_SPEC_ID, - last_partition_id: 0, - properties, - current_snapshot_id: None, - snapshots: Default::default(), - snapshot_log: vec![], - sort_orders, - metadata_log: vec![], - default_sort_order_id: SortOrder::UNSORTED_ORDER_ID, - refs: Default::default(), - }; - - table_metadata.try_normalize()?; - - Ok(Self(table_metadata)) - } - - /// Changes uuid of table metadata. - pub fn assign_uuid(mut self, uuid: Uuid) -> Result { - self.0.table_uuid = uuid; - Ok(self) - } - - /// Returns the new table metadata after changes. - pub fn build(self) -> Result { - Ok(self.0) - } -} - pub(super) mod _serde { use std::borrow::BorrowMut; /// This is a helper module that defines types to help with serialization/deserialization. 
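Editor's note: the builder removed above is superseded by the new implementation in crates/iceberg/src/spec/table_metadata_builder.rs, added at the end of this patch. A rough sketch of how the reworked TableMetadata accessors fit together, assuming an existing `metadata: TableMetadata` value (the function name here is illustrative only):

fn inspect_specs(metadata: &TableMetadata) {
    // The default spec is now stored bound to its schema, so callers no longer
    // resolve `default_spec_id` against the spec map themselves.
    let bound = metadata.default_partition_spec();
    let spec_id = metadata.default_partition_spec_id();

    // Historical specs are retained schemaless; they can still be looked up by id,
    // and their partition type is computed against an explicitly supplied schema.
    if let Some(old) = metadata.partition_spec_by_id(spec_id) {
        let _ = old.partition_type(bound.schema());
    }

    // On the bound spec, `partition_type` no longer needs a schema argument.
    let _ty = bound.partition_type();
}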
@@ -668,8 +572,8 @@ pub(super) mod _serde { use crate::spec::schema::_serde::{SchemaV1, SchemaV2}; use crate::spec::snapshot::_serde::{SnapshotV1, SnapshotV2}; use crate::spec::{ - PartitionField, PartitionSpec, Schema, Snapshot, SnapshotReference, SnapshotRetention, - SortOrder, + PartitionField, PartitionSpec, Schema, SchemaRef, SchemalessPartitionSpec, Snapshot, + SnapshotReference, SnapshotRetention, SortOrder, }; use crate::{Error, ErrorKind}; @@ -692,7 +596,7 @@ pub(super) mod _serde { pub last_column_id: i32, pub schemas: Vec, pub current_schema_id: i32, - pub partition_specs: Vec, + pub partition_specs: Vec, pub default_spec_id: i32, pub last_partition_id: i32, #[serde(skip_serializing_if = "Option::is_none")] @@ -728,7 +632,7 @@ pub(super) mod _serde { pub current_schema_id: Option, pub partition_spec: Vec, #[serde(skip_serializing_if = "Option::is_none")] - pub partition_specs: Option>, + pub partition_specs: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub default_spec_id: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -816,6 +720,44 @@ pub(super) mod _serde { .map(|schema| Ok((schema.schema_id, Arc::new(schema.try_into()?)))) .collect::, Error>>()?, ); + + let current_schema: &SchemaRef = + schemas.get(&value.current_schema_id).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "No schema exists with the current schema id {}.", + value.current_schema_id + ), + ) + })?; + let partition_specs = HashMap::from_iter( + value + .partition_specs + .into_iter() + .map(|x| (x.spec_id(), Arc::new(x))), + ); + let default_spec_id = value.default_spec_id; + let default_spec = partition_specs + .get(&value.default_spec_id) + .map(|schemaless_spec| { + (*schemaless_spec.clone()) + .clone() + .bind(current_schema.clone()) + }) + .transpose()? + .or_else(|| { + (DEFAULT_PARTITION_SPEC_ID == default_spec_id) + .then(|| PartitionSpec::unpartition_spec(current_schema.clone())) + }) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Default partition spec {default_spec_id} not found"), + ) + })? + .into(); + let mut metadata = TableMetadata { format_version: FormatVersion::V2, table_uuid: value.table_uuid, @@ -825,13 +767,8 @@ pub(super) mod _serde { last_column_id: value.last_column_id, current_schema_id: value.current_schema_id, schemas, - partition_specs: HashMap::from_iter( - value - .partition_specs - .into_iter() - .map(|x| (x.spec_id(), Arc::new(x))), - ), - default_spec_id: value.default_spec_id, + partition_specs, + default_spec, last_partition_id: value.last_partition_id, properties: value.properties.unwrap_or_default(), current_snapshot_id, @@ -907,18 +844,49 @@ pub(super) mod _serde { }) .transpose()? .unwrap_or_default(); - let partition_specs = HashMap::from_iter( - value - .partition_specs - .unwrap_or_else(|| { - vec![PartitionSpec { - spec_id: DEFAULT_PARTITION_SPEC_ID, - fields: value.partition_spec, - }] - }) - .into_iter() - .map(|x| (x.spec_id(), Arc::new(x))), - ); + let current_schema_id = value + .current_schema_id + .unwrap_or_else(|| schemas.keys().copied().max().unwrap_or_default()); + let current_schema = schemas + .get(¤t_schema_id) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "No schema exists with the current schema id {}.", + current_schema_id + ), + ) + })? 
+ .clone(); + + let partition_specs = match value.partition_specs { + Some(partition_specs) => partition_specs, + None => vec![PartitionSpec::builder(current_schema.clone()) + .with_spec_id(DEFAULT_PARTITION_SPEC_ID) + .add_unbound_fields(value.partition_spec.into_iter().map(|f| f.into_unbound()))? + .build()? + .into_schemaless()], + } + .into_iter() + .map(|x| (x.spec_id(), Arc::new(x))) + .collect::>(); + + let default_spec_id = value + .default_spec_id + .unwrap_or_else(|| partition_specs.keys().copied().max().unwrap_or_default()); + let default_spec = partition_specs + .get(&default_spec_id) + .map(|x| Arc::unwrap_or_clone(x.clone()).bind(current_schema.clone())) + .transpose()? + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Default partition spec {default_spec_id} not found"), + ) + })? + .into(); + let mut metadata = TableMetadata { format_version: FormatVersion::V1, table_uuid: value.table_uuid.unwrap_or_default(), @@ -926,12 +894,8 @@ pub(super) mod _serde { last_sequence_number: 0, last_updated_ms: value.last_updated_ms, last_column_id: value.last_column_id, - current_schema_id: value - .current_schema_id - .unwrap_or_else(|| schemas.keys().copied().max().unwrap_or_default()), - default_spec_id: value - .default_spec_id - .unwrap_or_else(|| partition_specs.keys().copied().max().unwrap_or_default()), + current_schema_id, + default_spec, last_partition_id: value .last_partition_id .unwrap_or_else(|| partition_specs.keys().copied().max().unwrap_or_default()), @@ -1005,7 +969,7 @@ pub(super) mod _serde { .into_values() .map(|x| Arc::try_unwrap(x).unwrap_or_else(|s| s.as_ref().clone())) .collect(), - default_spec_id: v.default_spec_id, + default_spec_id: v.default_spec.spec_id(), last_partition_id: v.last_partition_id, properties: Some(v.properties), current_snapshot_id: v.current_snapshot_id.or(Some(-1)), @@ -1074,18 +1038,14 @@ pub(super) mod _serde { .collect(), ), current_schema_id: Some(v.current_schema_id), - partition_spec: v - .partition_specs - .get(&v.default_spec_id) - .map(|x| x.fields().to_vec()) - .unwrap_or_default(), + partition_spec: v.default_spec.fields().to_vec(), partition_specs: Some( v.partition_specs .into_values() .map(|x| Arc::try_unwrap(x).unwrap_or_else(|s| s.as_ref().clone())) .collect(), ), - default_spec_id: Some(v.default_spec_id), + default_spec_id: Some(v.default_spec.spec_id()), last_partition_id: Some(v.last_partition_id), properties: if v.properties.is_empty() { None @@ -1202,9 +1162,9 @@ mod tests { use super::{FormatVersion, MetadataLog, SnapshotLog, TableMetadataBuilder}; use crate::spec::table_metadata::TableMetadata; use crate::spec::{ - NestedField, NullOrder, Operation, PartitionField, PartitionSpec, PrimitiveType, Schema, - Snapshot, SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, - Summary, Transform, Type, UnboundPartitionField, + NestedField, NullOrder, Operation, PartitionSpec, PrimitiveType, Schema, Snapshot, + SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, Summary, + Transform, Type, UnboundPartitionField, }; use crate::TableCreation; @@ -1245,6 +1205,12 @@ mod tests { "name": "struct_name", "required": true, "type": "fixed[1]" + }, + { + "id": 4, + "name": "ts", + "required": true, + "type": "timestamp" } ] } @@ -1286,23 +1252,32 @@ mod tests { let schema = Schema::builder() .with_schema_id(1) - .with_fields(vec![Arc::new(NestedField::required( - 1, - "struct_name", - Type::Primitive(PrimitiveType::Fixed(1)), - ))]) + .with_fields(vec![ + 
Arc::new(NestedField::required( + 1, + "struct_name", + Type::Primitive(PrimitiveType::Fixed(1)), + )), + Arc::new(NestedField::required( + 4, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + )), + ]) .build() .unwrap(); - let partition_spec = PartitionSpec { - spec_id: 0, - fields: vec![PartitionField { + let partition_spec = PartitionSpec::builder(schema.clone()) + .with_spec_id(0) + .add_unbound_field(UnboundPartitionField { name: "ts_day".to_string(), transform: Transform::Day, source_id: 4, - field_id: 1000, - }], - }; + field_id: Some(1000), + }) + .unwrap() + .build() + .unwrap(); let expected = TableMetadata { format_version: FormatVersion::V2, @@ -1312,8 +1287,11 @@ mod tests { last_column_id: 1, schemas: HashMap::from_iter(vec![(1, Arc::new(schema))]), current_schema_id: 1, - partition_specs: HashMap::from_iter(vec![(0, partition_spec.into())]), - default_spec_id: 0, + partition_specs: HashMap::from_iter(vec![( + 0, + partition_spec.clone().into_schemaless().into(), + )]), + default_spec: partition_spec.into(), last_partition_id: 1000, default_sort_order_id: 0, sort_orders: HashMap::from_iter(vec![(0, SortOrder::unsorted_order().into())]), @@ -1452,7 +1430,8 @@ mod tests { .build() .unwrap(); - let partition_spec = PartitionSpec::builder(&schema) + let schema = Arc::new(schema); + let partition_spec = PartitionSpec::builder(schema.clone()) .with_spec_id(0) .add_partition_field("vendor_id", "vendor_id", Transform::Identity) .unwrap() @@ -1479,10 +1458,10 @@ mod tests { location: "/home/iceberg/warehouse/nyc/taxis".to_string(), last_updated_ms: 1662532818843, last_column_id: 5, - schemas: HashMap::from_iter(vec![(0, Arc::new(schema))]), + schemas: HashMap::from_iter(vec![(0, schema)]), current_schema_id: 0, - partition_specs: HashMap::from_iter(vec![(0, partition_spec.into())]), - default_spec_id: 0, + partition_specs: HashMap::from_iter(vec![(0, partition_spec.clone().into_schemaless().into())]), + default_spec: Arc::new(partition_spec), last_partition_id: 1000, default_sort_order_id: 0, sort_orders: HashMap::from_iter(vec![(0, sort_order.into())]), @@ -1521,6 +1500,12 @@ mod tests { "name": "struct_name", "required": true, "type": "fixed[1]" + }, + { + "id": 4, + "name": "ts", + "required": true, + "type": "timestamp" } ] } @@ -1621,6 +1606,12 @@ mod tests { "name": "struct_name", "required": true, "type": "fixed[1]" + }, + { + "id": 4, + "name": "ts", + "required": true, + "type": "timestamp" } ] } @@ -1706,6 +1697,12 @@ mod tests { "name": "struct_name", "required": true, "type": "fixed[1]" + }, + { + "id": 4, + "name": "ts", + "required": true, + "type": "timestamp" } ] } @@ -1835,7 +1832,7 @@ mod tests { .build() .unwrap(); - let partition_spec = PartitionSpec::builder(&schema1) + let partition_spec = PartitionSpec::builder(schema2.clone()) .with_spec_id(0) .add_unbound_field(UnboundPartitionField { name: "x".to_string(), @@ -1896,8 +1893,11 @@ mod tests { last_column_id: 3, schemas: HashMap::from_iter(vec![(0, Arc::new(schema1)), (1, Arc::new(schema2))]), current_schema_id: 1, - partition_specs: HashMap::from_iter(vec![(0, partition_spec.into())]), - default_spec_id: 0, + partition_specs: HashMap::from_iter(vec![( + 0, + partition_spec.clone().into_schemaless().into(), + )]), + default_spec: Arc::new(partition_spec), last_partition_id: 1000, default_sort_order_id: 3, sort_orders: HashMap::from_iter(vec![(3, sort_order.into())]), @@ -1958,7 +1958,7 @@ mod tests { .build() .unwrap(); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = 
PartitionSpec::builder(schema.clone()) .with_spec_id(0) .add_unbound_field(UnboundPartitionField { name: "x".to_string(), @@ -1995,8 +1995,11 @@ mod tests { last_column_id: 3, schemas: HashMap::from_iter(vec![(0, Arc::new(schema))]), current_schema_id: 0, - partition_specs: HashMap::from_iter(vec![(0, partition_spec.into())]), - default_spec_id: 0, + partition_specs: HashMap::from_iter(vec![( + 0, + partition_spec.clone().into_schemaless().into(), + )]), + default_spec: Arc::new(partition_spec), last_partition_id: 1000, default_sort_order_id: 3, sort_orders: HashMap::from_iter(vec![(3, sort_order.into())]), @@ -2038,7 +2041,7 @@ mod tests { .build() .unwrap(); - let partition_spec = PartitionSpec::builder(&schema) + let partition_spec = PartitionSpec::builder(schema.clone()) .with_spec_id(0) .add_unbound_field(UnboundPartitionField { name: "x".to_string(), @@ -2058,8 +2061,11 @@ mod tests { last_column_id: 3, schemas: HashMap::from_iter(vec![(0, Arc::new(schema))]), current_schema_id: 0, - partition_specs: HashMap::from_iter(vec![(0, partition_spec.into())]), - default_spec_id: 0, + partition_specs: HashMap::from_iter(vec![( + 0, + partition_spec.clone().into_schemaless().into(), + )]), + default_spec: Arc::new(partition_spec), last_partition_id: 0, default_sort_order_id: 0, // Sort order is added during deserialization for V2 compatibility @@ -2172,16 +2178,22 @@ mod tests { fn test_default_partition_spec() { let default_spec_id = 1234; let mut table_meta_data = get_test_table_metadata("TableMetadataV2Valid.json"); - table_meta_data.default_spec_id = default_spec_id; + let partition_spec = + PartitionSpec::unpartition_spec(table_meta_data.current_schema().clone()); + table_meta_data.default_spec = partition_spec.clone().into(); table_meta_data .partition_specs - .insert(default_spec_id, Arc::new(PartitionSpec::default())); + .insert(default_spec_id, Arc::new(partition_spec.into_schemaless())); assert_eq!( - table_meta_data.default_partition_spec(), - table_meta_data + (*table_meta_data.default_partition_spec().clone()) + .clone() + .into_schemaless(), + (*table_meta_data .partition_spec_by_id(default_spec_id) .unwrap() + .clone()) + .clone() ); } #[test] @@ -2213,7 +2225,8 @@ mod tests { let table_metadata = TableMetadataBuilder::from_table_creation(table_creation) .unwrap() .build() - .unwrap(); + .unwrap() + .metadata; assert_eq!(table_metadata.location, "s3://db/table"); assert_eq!(table_metadata.schemas.len(), 1); assert_eq!( @@ -2232,10 +2245,11 @@ mod tests { HashMap::from([( 0, Arc::new( - PartitionSpec::builder(table_metadata.schemas.get(&0).unwrap()) + PartitionSpec::builder(table_metadata.schemas.get(&0).unwrap().clone()) .with_spec_id(0) .build() .unwrap() + .into_schemaless() ) )]) ); diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs b/crates/iceberg/src/spec/table_metadata_builder.rs new file mode 100644 index 000000000..035a07774 --- /dev/null +++ b/crates/iceberg/src/spec/table_metadata_builder.rs @@ -0,0 +1,1909 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use uuid::Uuid; + +use super::{ + FormatVersion, MetadataLog, PartitionSpec, PartitionSpecBuilder, Schema, SchemaRef, Snapshot, + SnapshotLog, SnapshotReference, SortOrder, SortOrderRef, TableMetadata, UnboundPartitionSpec, + UnboundPartitionSpecInterface as _, DEFAULT_PARTITION_SPEC_ID, DEFAULT_SCHEMA_ID, MAIN_BRANCH, + ONE_MINUTE_MS, PROPERTY_FORMAT_VERSION, PROPERTY_METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT, + RESERVED_PROPERTIES, UNPARTITIONED_LAST_ASSIGNED_ID, +}; +use crate::error::{Error, ErrorKind, Result}; +use crate::{TableCreation, TableUpdate}; + +/// Manipulating table metadata. +/// +/// For this builder the order of called functions matters. Functions are applied in-order. +/// All operations applied to the `TableMetadata` are tracked in `changes` as a chronologically +/// ordered vec of `TableUpdate`. +/// If an operation does not lead to a change of the `TableMetadata`, the corresponding update +/// is omitted from `changes`. +/// +/// Unlike a typical builder pattern, the order of function calls matters. +/// Some basic rules: +/// - `add_schema` must be called before `set_current_schema`. +/// - If a new partition spec and schema are added, the schema should be added first. +#[derive(Debug, Clone)] +pub struct TableMetadataBuilder { + metadata: TableMetadata, + changes: Vec, + last_added_schema_id: Option, + last_added_spec_id: Option, + last_added_order_id: Option, + // None if this is a new table (from_metadata) method not used + previous_history_entry: Option, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct TableMetadataBuildResult { + pub metadata: TableMetadata, + pub changes: Vec, + pub expired_metadata_logs: Vec, +} + +impl TableMetadataBuilder { + const LAST_ADDED: i32 = -1; + + /// Create a `TableMetadata` object from scratch. + /// + /// This method re-assign ids of fields in the schema, schema.id, sort_order.id and + /// spec.id. It should only be used to create new table metadata from scratch. + pub fn new( + schema: Schema, + spec: impl Into, + sort_order: SortOrder, + location: String, + format_version: FormatVersion, + properties: HashMap, + ) -> Result { + // Re-assign field_ids, schema.id, sort_order.id and spec.id for a new table. 
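+        // Caller-provided ids are treated as labels only: for example, a schema with
+        // field ids {11, 12} is renumbered to {0, 1}, and the partition spec and sort
+        // order are re-bound against the fresh schema by source-column name
+        // (see `reassign_ids` below).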
+ let (fresh_schema, fresh_spec, fresh_sort_order) = + Self::reassign_ids(schema, spec.into(), sort_order)?; + let schema_id = fresh_schema.schema_id(); + + let builder = Self { + metadata: TableMetadata { + format_version, + table_uuid: Uuid::now_v7(), + location: "".to_string(), // Overwritten immediately by set_location + last_sequence_number: 0, + last_updated_ms: 0, // Overwritten by build() if not set before + last_column_id: -1, // Overwritten immediately by add_current_schema + current_schema_id: -1, // Overwritten immediately by add_current_schema + schemas: HashMap::new(), + partition_specs: HashMap::new(), + default_spec: Arc::new( + PartitionSpec::unpartition_spec(fresh_schema.clone()).with_spec_id(-1), + ), // Overwritten immediately by add_default_partition_spec + last_partition_id: UNPARTITIONED_LAST_ASSIGNED_ID, + properties: HashMap::new(), + current_snapshot_id: None, + snapshots: HashMap::new(), + snapshot_log: vec![], + sort_orders: HashMap::new(), + metadata_log: vec![], + default_sort_order_id: -1, // Overwritten immediately by add_default_sort_order + refs: HashMap::default(), + }, + changes: vec![], + last_added_schema_id: Some(schema_id), + last_added_spec_id: None, + last_added_order_id: None, + previous_history_entry: None, + }; + + builder + .set_location(location) + .add_current_schema(fresh_schema)? + .add_default_partition_spec(fresh_spec.into_unbound())? + .add_default_sort_order(fresh_sort_order)? + .set_properties(properties) + } + + /// Creates a new table metadata builder from the given metadata to modify it. + /// Previous file location is used to populate the Metadata Log. + #[must_use] + pub fn new_from_metadata( + previous: TableMetadata, + previous_file_location: impl Into, + ) -> Self { + Self { + previous_history_entry: Some(MetadataLog { + metadata_file: previous_file_location.into(), + timestamp_ms: previous.last_updated_ms, + }), + metadata: previous, + changes: Vec::default(), + last_added_schema_id: None, + last_added_spec_id: None, + last_added_order_id: None, + } + } + + /// Creates a new table metadata builder from the given table creation. + pub fn from_table_creation(table_creation: TableCreation) -> Result { + let TableCreation { + name: _, + location, + schema, + partition_spec, + sort_order, + properties, + } = table_creation; + + let location = location.ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Can't create table without location", + ) + })?; + let partition_spec = partition_spec.unwrap_or(UnboundPartitionSpec { + spec_id: None, + fields: vec![], + }); + + Self::new( + schema, + partition_spec, + sort_order.unwrap_or(SortOrder::unsorted_order()), + location, + FormatVersion::V1, + properties, + ) + } + + /// Get the current schema with all changes applied up to this point. + #[inline] + pub fn current_schema(&self) -> &SchemaRef { + self.metadata.current_schema() + } + + /// Get the current last column id + #[inline] + pub fn last_column_id(&self) -> i32 { + self.metadata.last_column_id + } + + /// Get the current last updated timestamp + #[inline] + pub fn last_updated_ms(&self) -> i64 { + self.metadata.last_updated_ms + } + + /// Changes uuid of table metadata. + pub fn assign_uuid(mut self, uuid: Uuid) -> Self { + if self.metadata.table_uuid != uuid { + self.metadata.table_uuid = uuid; + self.changes.push(TableUpdate::AssignUuid { uuid }); + } + + self + } + + /// Upgrade `FormatVersion`. Downgrades are not allowed. + /// + /// # Errors + /// - Cannot downgrade to older format versions. 
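+    ///
+    /// A minimal usage sketch (values illustrative, assuming an existing V1 builder):
+    /// ```ignore
+    /// // V1 -> V2 succeeds and records a `TableUpdate::UpgradeFormatVersion`.
+    /// let builder = builder.upgrade_format_version(FormatVersion::V2)?;
+    /// // Attempting to go back to V1 afterwards fails with `ErrorKind::DataInvalid`.
+    /// ```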
+    pub fn upgrade_format_version(mut self, format_version: FormatVersion) -> Result<Self> {
+        if format_version < self.metadata.format_version {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Cannot downgrade FormatVersion from {} to {}",
+                    self.metadata.format_version, format_version
+                ),
+            ));
+        }
+
+        if format_version != self.metadata.format_version {
+            self.metadata.format_version = format_version;
+            self.changes
+                .push(TableUpdate::UpgradeFormatVersion { format_version });
+        }
+
+        Ok(self)
+    }
+
+    /// Set properties. If a property already exists, it will be overwritten.
+    ///
+    /// If a reserved property is set, the corresponding action is performed and the property is not persisted.
+    /// Currently, the following reserved properties are supported:
+    /// * format-version: Set the format version of the table.
+    ///
+    /// # Errors
+    /// - If the format-version property is set to a lower version than the current format version.
+    pub fn set_properties(mut self, properties: HashMap<String, String>) -> Result<Self> {
+        // List of specified properties that are RESERVED and should not be persisted.
+        let reserved_properties = properties
+            .keys()
+            .filter(|key| RESERVED_PROPERTIES.contains(&key.as_str()))
+            .map(ToString::to_string)
+            .collect::<Vec<_>>();
+
+        if !reserved_properties.is_empty() {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Table properties should not contain reserved properties, but got: [{}]",
+                    reserved_properties.join(", ")
+                ),
+            ));
+        }
+
+        if properties.is_empty() {
+            return Ok(self);
+        }
+
+        self.metadata.properties.extend(properties.clone());
+        self.changes.push(TableUpdate::SetProperties {
+            updates: properties,
+        });
+
+        Ok(self)
+    }
+
+    /// Remove properties from the table metadata.
+    /// Does nothing if the key is not present.
+    pub fn remove_properties(mut self, properties: &[String]) -> Self {
+        for property in properties {
+            self.metadata.properties.remove(property);
+        }
+
+        if !properties.is_empty() {
+            self.changes.push(TableUpdate::RemoveProperties {
+                removals: properties.to_vec(),
+            });
+        }
+
+        self
+    }
+
+    /// Set the location of the table metadata, stripping any trailing slashes.
+    pub fn set_location(mut self, location: String) -> Self {
+        let location = location.trim_end_matches('/').to_string();
+        if self.metadata.location != location {
+            self.changes.push(TableUpdate::SetLocation {
+                location: location.clone(),
+            });
+            self.metadata.location = location;
+        }
+
+        self
+    }
+
+    /// Add a snapshot to the table metadata.
+    ///
+    /// # Errors
+    /// - No schema has been added to the table metadata.
+    /// - No partition spec has been added to the table metadata.
+    /// - No sort order has been added to the table metadata.
+    /// - Snapshot id already exists.
+    /// - For format version > 1: the sequence number of the snapshot is lower than the highest sequence number specified so far.
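+    ///
+    /// As a sketch of the timestamp tolerance (numbers illustrative, assuming
+    /// `ONE_MINUTE_MS` is 60 000): with the last snapshot-log entry at `t = 1_000_000` ms,
+    /// a snapshot with `timestamp_ms = 940_000` is still accepted, while `939_999` is
+    /// rejected for being more than one minute older.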
+    pub fn add_snapshot(mut self, snapshot: Snapshot) -> Result<Self> {
+        if self.metadata.partition_specs.is_empty() {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                "Attempting to add a snapshot before a partition spec is added",
+            ));
+        }
+
+        if self.metadata.sort_orders.is_empty() {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                "Attempting to add a snapshot before a sort order is added",
+            ));
+        }
+
+        if self
+            .metadata
+            .snapshots
+            .contains_key(&snapshot.snapshot_id())
+        {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!("Snapshot already exists for: '{}'", snapshot.snapshot_id()),
+            ));
+        }
+
+        if self.metadata.format_version != FormatVersion::V1
+            && snapshot.sequence_number() <= self.metadata.last_sequence_number
+            && snapshot.parent_snapshot_id().is_some()
+        {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Cannot add snapshot with sequence number {} older than last sequence number {}",
+                    snapshot.sequence_number(),
+                    self.metadata.last_sequence_number
+                )
+            ));
+        }
+
+        if let Some(last) = self.metadata.snapshot_log.last() {
+            // Commits can happen concurrently from different machines.
+            // A tolerance helps us avoid failures caused by small clock skews.
+            if snapshot.timestamp_ms() - last.timestamp_ms < -ONE_MINUTE_MS {
+                return Err(Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Invalid snapshot timestamp {}: before last snapshot timestamp {}",
+                        snapshot.timestamp_ms(),
+                        last.timestamp_ms
+                    ),
+                ));
+            }
+        }
+
+        if snapshot.timestamp_ms() - self.metadata.last_updated_ms < -ONE_MINUTE_MS {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Invalid snapshot timestamp {}: before last updated timestamp {}",
+                    snapshot.timestamp_ms(),
+                    self.metadata.last_updated_ms
+                ),
+            ));
+        }
+
+        // Mutation happens in the next line - must be infallible from here
+        self.changes.push(TableUpdate::AddSnapshot {
+            snapshot: snapshot.clone(),
+        });
+
+        self.metadata.last_updated_ms = snapshot.timestamp_ms();
+        self.metadata.last_sequence_number = snapshot.sequence_number();
+        self.metadata
+            .snapshots
+            .insert(snapshot.snapshot_id(), snapshot.into());
+
+        Ok(self)
+    }
+
+    /// Append a snapshot to the specified branch.
+    /// If the branch is not specified, the snapshot is appended to the main branch.
+    /// The `ref` must already exist. Retention settings from the `ref` are re-used.
+    ///
+    /// # Errors
+    /// - The ref is unknown.
+    /// - Any of the preconditions of `self.add_snapshot` are not met.
+    pub fn append_snapshot(self, snapshot: Snapshot, ref_name: Option<&str>) -> Result<Self> {
+        let ref_name = ref_name.unwrap_or(MAIN_BRANCH);
+        let mut reference = self
+            .metadata
+            .refs
+            .get(ref_name)
+            .ok_or_else(|| {
+                Error::new(
+                    ErrorKind::DataInvalid,
+                    format!("Cannot append snapshot to unknown ref: '{}'", ref_name),
+                )
+            })?
+            .clone();
+
+        reference.snapshot_id = snapshot.snapshot_id();
+
+        self.add_snapshot(snapshot)?.set_ref(ref_name, reference)
+    }
+
+    /// Remove snapshots by their ids from the table metadata.
+    /// Does nothing if a snapshot id is not present.
+    /// Only the snapshots that were actually removed are recorded in `changes`.
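+    ///
+    /// For example (ids illustrative): removing `[1, 2]` when only snapshot `1` exists
+    /// records `TableUpdate::RemoveSnapshots { snapshot_ids: vec![1] }`; any refs that
+    /// pointed at snapshot `1` are dropped as well.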
+    pub fn remove_snapshots(mut self, snapshot_ids: &[i64]) -> Self {
+        let mut removed_snapshots = Vec::with_capacity(snapshot_ids.len());
+
+        self.metadata.snapshots.retain(|k, _| {
+            if snapshot_ids.contains(k) {
+                removed_snapshots.push(*k);
+                false
+            } else {
+                true
+            }
+        });
+
+        if !removed_snapshots.is_empty() {
+            self.changes.push(TableUpdate::RemoveSnapshots {
+                snapshot_ids: removed_snapshots,
+            });
+        }
+
+        // Remove refs that are no longer valid
+        self.metadata
+            .refs
+            .retain(|_, v| self.metadata.snapshots.contains_key(&v.snapshot_id));
+
+        self
+    }
+
+    /// Set a reference to a snapshot.
+    ///
+    /// # Errors
+    /// - The snapshot id is unknown.
+    pub fn set_ref(mut self, ref_name: &str, reference: SnapshotReference) -> Result<Self> {
+        if self
+            .metadata
+            .refs
+            .get(ref_name)
+            .is_some_and(|snap_ref| snap_ref.eq(&reference))
+        {
+            return Ok(self);
+        }
+
+        let Some(snapshot) = self.metadata.snapshots.get(&reference.snapshot_id) else {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Cannot set '{ref_name}' to unknown snapshot: '{}'",
+                    reference.snapshot_id
+                ),
+            ));
+        };
+
+        // Update last_updated_ms to the exact timestamp of the snapshot if it was added in this commit
+        let is_added_snapshot = self.changes.iter().any(|update| {
+            matches!(update, TableUpdate::AddSnapshot { snapshot: snap } if snap.snapshot_id() == snapshot.snapshot_id())
+        });
+        if is_added_snapshot {
+            self.metadata.last_updated_ms = snapshot.timestamp_ms();
+        }
+
+        // Current snapshot id is set only for the main branch
+        if ref_name == MAIN_BRANCH {
+            self.metadata.current_snapshot_id = Some(snapshot.snapshot_id());
+            if self.metadata.last_updated_ms == i64::default() {
+                self.metadata.last_updated_ms = chrono::Utc::now().timestamp_millis();
+            };
+
+            self.metadata.snapshot_log.push(SnapshotLog {
+                snapshot_id: snapshot.snapshot_id(),
+                timestamp_ms: self.metadata.last_updated_ms,
+            });
+        }
+
+        self.changes.push(TableUpdate::SetSnapshotRef {
+            ref_name: ref_name.to_string(),
+            reference: reference.clone(),
+        });
+        self.metadata.refs.insert(ref_name.to_string(), reference);
+
+        Ok(self)
+    }
+
+    /// Remove a reference.
+    ///
+    /// If `ref_name='main'`, the current snapshot id is set to -1.
+    pub fn remove_ref(mut self, ref_name: &str) -> Self {
+        if ref_name == MAIN_BRANCH {
+            self.metadata.current_snapshot_id = Some(i64::from(Self::LAST_ADDED));
+            self.metadata.snapshot_log.clear();
+        }
+
+        if self.metadata.refs.remove(ref_name).is_some() || ref_name == MAIN_BRANCH {
+            self.changes.push(TableUpdate::RemoveSnapshotRef {
+                ref_name: ref_name.to_string(),
+            });
+        }
+
+        self
+    }
+
+    /// Add a schema to the table metadata.
+    ///
+    /// The provided `schema.schema_id` may not be used.
+
+    // ToDo Discuss: Should we add a `new_last_column_id` argument?
+    // TLDR; I believe not, as it acts as an assertion whose purpose (and source) is not clear. We shouldn't add it.
+    //
+    // Schemas can contain only old columns or a mix of old and new columns.
+    // In Java, if `new_last_column_id` is set but too low, the function fails, basically hinting
+    // at the schema having been built for an older metadata version. `new_last_column_id` is typically obtained
+    // in the schema building process.
+    //
+    // This assertion is not required if the user controls the flow - they know for which
+    // metadata they created the schema. If asserting the `new_last_column_id` were semantically important, it should be part of the schema and
+    // not be passed around alongside it.
+    //
+    // Specifying `new_last_column_id` in Java also allows setting `metadata.last_column_id` to any arbitrary value
+    // even if it's not present as a column. I believe this to be undesired behavior. This is not possible with the current Rust interface.
+    //
+    // If the schema is built out of sync with the TableMetadata, for example in a REST Catalog setting, the assertion of
+    // the provided `last_column_id` as part of the `TableUpdate::AddSchema` is still done in its `.apply` method.
+    pub fn add_schema(mut self, schema: Schema) -> Self {
+        // ToDo: consider returning a `Result` here, so that field-id <-> type compatibility can be
+        // checked if the field-id is still present in the metadata. This is not done in the Java code.
+        let new_schema_id = self.reuse_or_create_new_schema_id(&schema);
+        let schema_found = self.metadata.schemas.contains_key(&new_schema_id);
+
+        if schema_found {
+            // ToDo Discuss: The Java code is a bit convoluted and I think it might be wrong for an edge case.
+            // Why is it wrong: The baseline is that if something changes the state of the builder, it has an effect on it and
+            // must be recorded in the changes.
+            // The Java code might or might not change `lastAddedSchemaId`, and does not record this change in `changes`.
+            // Thus, replaying the changes would lead to a different result if a schema is added twice under unfavorable
+            // conditions.
+            // Here we do it differently, but a check from a Java maintainer would be nice.
+            if self.last_added_schema_id != Some(new_schema_id) {
+                self.changes.push(TableUpdate::AddSchema {
+                    last_column_id: Some(self.metadata.last_column_id),
+                    schema: schema.clone(),
+                });
+                self.last_added_schema_id = Some(new_schema_id);
+            }
+
+            return self;
+        }
+
+        // New schemas might contain only old columns. In this case last_column_id should not be
+        // reduced.
+        self.metadata.last_column_id =
+            std::cmp::max(self.metadata.last_column_id, schema.highest_field_id());
+
+        // Set schema-id
+        let schema = match new_schema_id == schema.schema_id() {
+            true => schema,
+            false => schema.with_schema_id(new_schema_id),
+        };
+
+        self.metadata
+            .schemas
+            .insert(new_schema_id, schema.clone().into());
+
+        self.changes.push(TableUpdate::AddSchema {
+            schema,
+            last_column_id: Some(self.metadata.last_column_id),
+        });
+
+        self.last_added_schema_id = Some(new_schema_id);
+
+        self
+    }
+
+    /// Set the current schema id.
+    ///
+    /// # Errors
+    /// - The provided `schema_id` is -1 but no schema has been added via `add_schema`.
+    /// - No schema with the provided `schema_id` exists.
+    pub fn set_current_schema(mut self, mut schema_id: i32) -> Result<Self> {
+        if schema_id == Self::LAST_ADDED {
+            schema_id = self.last_added_schema_id.ok_or_else(|| {
+                Error::new(
+                    ErrorKind::DataInvalid,
+                    "Cannot set current schema to last added schema: no schema has been added.",
+                )
+            })?;
+        };
+        let schema_id = schema_id; // Make immutable
+
+        if schema_id == self.metadata.current_schema_id {
+            return Ok(self);
+        }
+
+        let _schema = self.metadata.schemas.get(&schema_id).ok_or_else(|| {
+            Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Cannot set current schema to unknown schema with id: '{}'",
+                    schema_id
+                ),
+            )
+        })?;
+
+        // Old partition specs and sort-orders should be preserved even if they are not compatible with the new schema,
+        // so that older metadata can still be interpreted.
+        // Default partition spec and sort order are checked in the build() method,
+        // which allows other default partition specs and sort orders to be set before the build.
+ + self.metadata.current_schema_id = schema_id; + + if self.last_added_schema_id == Some(schema_id) { + self.changes.push(TableUpdate::SetCurrentSchema { + schema_id: Self::LAST_ADDED, + }); + } else { + self.changes + .push(TableUpdate::SetCurrentSchema { schema_id }); + } + + Ok(self) + } + + /// Add a schema and set it as the current schema. + pub fn add_current_schema(self, schema: Schema) -> Result { + self.add_schema(schema).set_current_schema(Self::LAST_ADDED) + } + + /// Add a partition spec to the table metadata. + /// + /// The spec is bound eagerly to the current schema. + /// If a schema is added in the same set of changes, the schema should be added first. + /// + /// Even if `unbound_spec.spec_id` is provided as `Some`, it may not be used. + /// + /// # Errors + /// - The partition spec cannot be bound to the current schema. + /// - The partition spec has non-sequential field ids and the table format version is 1. + pub fn add_partition_spec(mut self, unbound_spec: UnboundPartitionSpec) -> Result { + let new_spec_id = self.reuse_or_create_new_spec_id(&unbound_spec); + let spec_found = self.metadata.partition_specs.contains_key(&new_spec_id); + let unbound_spec = unbound_spec.with_spec_id(new_spec_id); + + if spec_found { + if self.last_added_spec_id != Some(new_spec_id) { + self.changes + .push(TableUpdate::AddSpec { spec: unbound_spec }); + self.last_added_spec_id = Some(new_spec_id); + } + + return Ok(self); + } + + let schema = self.get_current_schema()?.clone(); + let spec = PartitionSpecBuilder::new_from_unbound(unbound_spec.clone(), schema)? + .with_last_assigned_field_id(self.metadata.last_partition_id) + .with_spec_id(new_spec_id) + .build()?; + + if self.metadata.format_version <= FormatVersion::V1 && !spec.has_sequential_ids() { + return Err(Error::new( + ErrorKind::DataInvalid, + "Cannot add partition spec with non-sequential field ids to format version 1 table", + )); + } + + let highest_field_id = spec.highest_field_id(); + self.metadata + .partition_specs + .insert(new_spec_id, Arc::new(spec.into())); + self.changes + .push(TableUpdate::AddSpec { spec: unbound_spec }); + + self.last_added_spec_id = Some(new_spec_id); + self.metadata.last_partition_id = + std::cmp::max(self.metadata.last_partition_id, highest_field_id); + + Ok(self) + } + + /// Set the default partition spec. + /// + /// # Errors + /// - spec_id is -1 but no spec has been added via `add_partition_spec`. + /// - No partition spec with the provided `spec_id` exists. + pub fn set_default_partition_spec(mut self, mut spec_id: i32) -> Result { + if spec_id == Self::LAST_ADDED { + spec_id = self.last_added_spec_id.ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Cannot set default partition spec to last added spec: no spec has been added.", + ) + })?; + } + + if self.metadata.default_spec.spec_id() == spec_id { + return Ok(self); + } + + if !self.metadata.partition_specs.contains_key(&spec_id) { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Cannot set default partition spec to unknown spec with id: '{spec_id}'",), + )); + } + + let schemaless_spec = + self.metadata + .partition_specs + .get(&spec_id) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Cannot set default partition spec to unknown spec with id: '{spec_id}'",), + ) + })? 
+ .clone(); + let spec = + Arc::unwrap_or_clone(schemaless_spec).bind(self.get_current_schema()?.clone())?; + self.metadata.default_spec = Arc::new(spec); + + if self.last_added_spec_id == Some(spec_id) { + self.changes.push(TableUpdate::SetDefaultSpec { + spec_id: Self::LAST_ADDED, + }); + } else { + self.changes.push(TableUpdate::SetDefaultSpec { spec_id }); + } + + Ok(self) + } + + /// Add a partition spec and set it as the default + pub fn add_default_partition_spec(self, unbound_spec: UnboundPartitionSpec) -> Result { + self.add_partition_spec(unbound_spec)? + .set_default_partition_spec(Self::LAST_ADDED) + } + + /// Add a sort order to the table metadata. + /// + /// The spec is bound eagerly to the current schema and must be valid for it. + /// If a schema is added in the same set of changes, the schema should be added first. + /// + /// Even if `sort_order.order_id` is provided, it may not be used. + /// + /// # Errors + /// - Sort order id to add already exists. + /// - Sort order is incompatible with the current schema. + pub fn add_sort_order(mut self, sort_order: SortOrder) -> Result { + let new_order_id = self.reuse_or_create_new_sort_id(&sort_order); + let sort_order_found = self.metadata.sort_orders.contains_key(&new_order_id); + + if sort_order_found { + if self.last_added_order_id != Some(new_order_id) { + self.changes.push(TableUpdate::AddSortOrder { + sort_order: sort_order.clone(), + }); + self.last_added_order_id = Some(new_order_id); + } + + return Ok(self); + } + + // ToDo Discuss: Java builds a fresh spec here: + // https://github.com/apache/iceberg/blob/64b36999d7ff716ae2534fb0972fcc10d22a64c2/core/src/main/java/org/apache/iceberg/TableMetadata.java#L1613 + // For rust we make the assumption + // that the order that is added refers to the current schema and check compatibility with it. + + let schema = self.get_current_schema()?.clone().as_ref().clone(); + let sort_order = SortOrder::builder() + .with_order_id(new_order_id) + .with_fields(sort_order.fields) + .build(&schema) + .map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!("Sort order to add is incompatible with current schema: {e}"), + ) + .with_source(e) + })?; + + self.last_added_order_id = Some(new_order_id); + self.metadata + .sort_orders + .insert(new_order_id, sort_order.clone().into()); + self.changes.push(TableUpdate::AddSortOrder { sort_order }); + + Ok(self) + } + + /// Set the default sort order. If `sort_order_id` is -1, the last added sort order is set as default. + /// + /// # Errors + /// - sort_order_id is -1 but no sort order has been added via `add_sort_order`. + /// - No sort order with the provided `sort_order_id` exists. 
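+    ///
+    /// A usage sketch (`my_order` is a placeholder for a previously built sort order):
+    /// ```ignore
+    /// // -1 resolves to the id assigned to the most recently added sort order.
+    /// let builder = builder.add_sort_order(my_order)?.set_default_sort_order(-1)?;
+    /// ```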
+ pub fn set_default_sort_order(mut self, mut sort_order_id: i64) -> Result { + if sort_order_id == Self::LAST_ADDED as i64 { + sort_order_id = self.last_added_order_id.ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "Cannot set default sort order to last added order: no order has been added.", + ) + })?; + } + + if self.metadata.default_sort_order_id == sort_order_id { + return Ok(self); + } + + if !self.metadata.sort_orders.contains_key(&sort_order_id) { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Cannot set default sort order to unknown order with id: '{sort_order_id}'" + ), + )); + } + + self.metadata.default_sort_order_id = sort_order_id; + + if self.last_added_order_id == Some(sort_order_id) { + self.changes.push(TableUpdate::SetDefaultSortOrder { + sort_order_id: Self::LAST_ADDED as i64, + }); + } else { + self.changes + .push(TableUpdate::SetDefaultSortOrder { sort_order_id }); + } + + Ok(self) + } + + /// Add a sort order and set it as the default + fn add_default_sort_order(self, sort_order: SortOrder) -> Result { + self.add_sort_order(sort_order)? + .set_default_sort_order(Self::LAST_ADDED as i64) + } + + /// Build the table metadata. + pub fn build(mut self) -> Result { + if self.metadata.last_updated_ms == i64::default() { + self.metadata.last_updated_ms = chrono::Utc::now().timestamp_millis(); + } + + // Check compatibility of the current schema to the default partition spec and sort order. + // We use the `get_xxx` methods from the builder to avoid using the panicking + // `TableMetadata.default_partition_spec` etc. methods. + let schema = self.get_current_schema()?.clone(); + let sort_order = Arc::unwrap_or_clone(self.get_default_sort_order()?); + + self.metadata.default_spec = Arc::new( + Arc::unwrap_or_clone(self.metadata.default_spec) + .into_unbound() + .bind(schema.clone())?, + ); + SortOrder::builder() + .with_fields(sort_order.fields) + .build(&schema)?; + + let expired_metadata_logs = self.expire_metadata_log(); + self.update_snapshot_log()?; + self.metadata.try_normalize()?; + + if let Some(hist_entry) = self.previous_history_entry.take() { + self.metadata.metadata_log.push(hist_entry); + } + + Ok(TableMetadataBuildResult { + metadata: self.metadata, + changes: self.changes, + expired_metadata_logs, + }) + } + + fn expire_metadata_log(&mut self) -> Vec { + let max_size = self + .metadata + .properties + .get(PROPERTY_FORMAT_VERSION) + .and_then(|v| v.parse::().ok()) + .unwrap_or(PROPERTY_METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT) + .max(1); + + if self.metadata.metadata_log.len() > max_size { + self.metadata + .metadata_log + .drain(0..self.metadata.metadata_log.len() - max_size) + .collect() + } else { + Vec::new() + } + } + + fn update_snapshot_log(&mut self) -> Result<()> { + let intermediate_snapshots = self.get_intermediate_snapshots(); + let has_removed_snapshots = self + .changes + .iter() + .any(|update| matches!(update, TableUpdate::RemoveSnapshots { .. })); + + if intermediate_snapshots.is_empty() && !has_removed_snapshots { + return Ok(()); + } + + let mut new_snapshot_log = Vec::new(); + for log_entry in &self.metadata.snapshot_log { + let snapshot_id = log_entry.snapshot_id; + if self.metadata.snapshots.contains_key(&snapshot_id) { + if !intermediate_snapshots.contains(&snapshot_id) { + new_snapshot_log.push(log_entry.clone()); + } + } else if has_removed_snapshots { + // any invalid entry causes the history before it to be removed. 
otherwise, there could be + // history gaps that cause time-travel queries to produce incorrect results. for example, + // if history is [(t1, s1), (t2, s2), (t3, s3)] and s2 is removed, the history cannot be + // [(t1, s1), (t3, s3)] because it appears that s3 was current during the time between t2 + // and t3 when in fact s2 was the current snapshot. + new_snapshot_log.clear(); + } + } + + if let Some(current_snapshot_id) = self.metadata.current_snapshot_id { + let last_id = new_snapshot_log.last().map(|entry| entry.snapshot_id); + if last_id != Some(current_snapshot_id) { + return Err(Error::new( + ErrorKind::DataInvalid, + "Cannot set invalid snapshot log: latest entry is not the current snapshot", + )); + } + }; + + self.metadata.snapshot_log = new_snapshot_log; + Ok(()) + } + + /// Finds intermediate snapshots that have not been committed as the current snapshot. + /// + /// Transactions can create snapshots that are never the current snapshot because several + /// changes are combined by the transaction into one table metadata update. when each + /// intermediate snapshot is added to table metadata, it is added to the snapshot log, assuming + /// that it will be the current snapshot. when there are multiple snapshot updates, the log must + /// be corrected by suppressing the intermediate snapshot entries. + /// + /// A snapshot is an intermediate snapshot if it was added but is not the current snapshot. + fn get_intermediate_snapshots(&self) -> HashSet { + let added_snapshot_ids = self + .changes + .iter() + .filter_map(|update| match update { + TableUpdate::AddSnapshot { snapshot } => Some(snapshot.snapshot_id()), + _ => None, + }) + .collect::>(); + + self.changes + .iter() + .filter_map(|update| match update { + TableUpdate::SetSnapshotRef { + ref_name, + reference, + } => { + if added_snapshot_ids.contains(&reference.snapshot_id) + && ref_name == MAIN_BRANCH + && reference.snapshot_id != self.metadata.current_snapshot_id.unwrap_or(-1) + { + Some(reference.snapshot_id) + } else { + None + } + } + _ => None, + }) + .collect() + } + + fn reassign_ids( + schema: Schema, + spec: UnboundPartitionSpec, + sort_order: SortOrder, + ) -> Result<(Schema, PartitionSpec, SortOrder)> { + // Re-assign field ids and schema ids for a new table. 
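+        // Strategy: remember each old field id by name, renumber the schema starting
+        // at 0, then re-resolve partition and sort sources by name against the
+        // renumbered schema.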
+ let previous_id_to_name = schema.field_id_to_name_map().clone(); + let fresh_schema = schema + .into_builder() + .with_schema_id(DEFAULT_SCHEMA_ID) + .with_reassigned_field_ids(u32::try_from(DEFAULT_PARTITION_SPEC_ID).unwrap_or_default()) + .build()?; + + // Re-build partition spec with new ids + let mut fresh_spec = PartitionSpecBuilder::new(fresh_schema.clone()); + for field in spec.fields() { + let source_field_name = previous_id_to_name.get(&field.source_id).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Cannot find source column with id {} for partition column {} in schema.", + field.source_id, field.name + ), + ) + })?; + fresh_spec = + fresh_spec.add_partition_field(source_field_name, &field.name, field.transform)?; + } + let fresh_spec = fresh_spec.build()?; + + // Re-build sort order with new ids + let mut fresh_order = SortOrder::builder(); + for mut field in sort_order.fields { + let source_field_name = previous_id_to_name.get(&field.source_id).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Cannot find source column with id {} for sort column in schema.", + field.source_id + ), + ) + })?; + let new_field_id = fresh_schema + .field_by_name(source_field_name) + .ok_or_else(|| { + Error::new( + ErrorKind::Unexpected, + format!( + "Cannot find source column with name {} for sort column in re-assigned schema.", + source_field_name + ), + ) + })?.id; + field.source_id = new_field_id; + fresh_order.with_sort_field(field); + } + let fresh_sort_order = fresh_order.build(&fresh_schema)?; + + Ok((fresh_schema, fresh_spec, fresh_sort_order)) + } + + fn reuse_or_create_new_schema_id(&self, new_schema: &Schema) -> i32 { + self.metadata + .schemas + .iter() + .find_map(|(id, schema)| new_schema.is_same_schema(schema).then_some(*id)) + .unwrap_or_else(|| self.get_highest_schema_id() + 1) + } + + fn get_highest_schema_id(&self) -> i32 { + *self + .metadata + .schemas + .keys() + .max() + .unwrap_or(&self.metadata.current_schema_id) + } + + fn get_current_schema(&self) -> Result<&SchemaRef> { + self.metadata + .schemas + .get(&self.metadata.current_schema_id) + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Current schema with id '{}' not found in table metadata.", + self.metadata.current_schema_id + ), + ) + }) + } + + fn get_default_sort_order(&self) -> Result { + self.metadata + .sort_orders + .get(&self.metadata.default_sort_order_id) + .cloned() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!( + "Default sort order with id '{}' not found in table metadata.", + self.metadata.default_sort_order_id + ), + ) + }) + } + + /// If a compatible spec already exists, use the same ID. Otherwise, use 1 more than the highest ID. + fn reuse_or_create_new_spec_id(&self, new_spec: &UnboundPartitionSpec) -> i32 { + self.metadata + .partition_specs + .iter() + .find_map(|(id, old_spec)| old_spec.is_compatible_with(new_spec).then_some(*id)) + .unwrap_or_else(|| { + self.get_highest_spec_id() + .map(|id| id + 1) + .unwrap_or(DEFAULT_PARTITION_SPEC_ID) + }) + } + + fn get_highest_spec_id(&self) -> Option { + self.metadata.partition_specs.keys().max().copied() + } + + /// If a compatible sort-order already exists, use the same ID. Otherwise, use 1 more than the highest ID. 
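+    /// An unsorted order is special-cased and always maps to the reserved
+    /// unsorted-order id instead of allocating a new one.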
+ fn reuse_or_create_new_sort_id(&self, new_sort_order: &SortOrder) -> i64 { + if new_sort_order.is_unsorted() { + return SortOrder::unsorted_order().order_id; + } + + self.metadata + .sort_orders + .iter() + .find_map(|(id, sort_order)| { + sort_order.fields.eq(&new_sort_order.fields).then_some(*id) + }) + .unwrap_or_else(|| { + self.highest_sort_order_id() + .unwrap_or(SortOrder::unsorted_order().order_id) + + 1 + }) + } + + fn highest_sort_order_id(&self) -> Option { + self.metadata.sort_orders.keys().max().copied() + } +} + +impl From for TableMetadata { + fn from(result: TableMetadataBuildResult) -> Self { + result.metadata + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::spec::{ + NestedField, NullOrder, Operation, PrimitiveType, Schema, SnapshotRetention, SortDirection, + SortField, StructType, Summary, Transform, Type, UnboundPartitionField, + }; + + const TEST_LOCATION: &str = "s3://bucket/test/location"; + const LAST_ASSIGNED_COLUMN_ID: i32 = 2; + + fn schema() -> Schema { + Schema::builder() + .with_fields(vec![ + NestedField::required(0, "x", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(1, "y", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(2, "z", Type::Primitive(PrimitiveType::Long)).into(), + ]) + .build() + .unwrap() + } + + fn sort_order() -> SortOrder { + let schema = schema(); + SortOrder::builder() + .with_order_id(1) + .with_sort_field(SortField { + source_id: 2, + transform: Transform::Bucket(4), + direction: SortDirection::Descending, + null_order: NullOrder::First, + }) + .build(&schema) + .unwrap() + } + + fn partition_spec() -> UnboundPartitionSpec { + UnboundPartitionSpec::builder() + .with_spec_id(0) + .add_partition_field(1, "y", Transform::Identity) + .unwrap() + .build() + } + + fn builder_without_changes(format_version: FormatVersion) -> TableMetadataBuilder { + TableMetadataBuilder::new( + schema(), + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + format_version, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .metadata + .into_builder("s3://bucket/test/location/metadata/metadata1.json") + } + + #[test] + fn test_minimal_build() { + let metadata = TableMetadataBuilder::new( + schema(), + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + FormatVersion::V1, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .metadata; + + assert_eq!(metadata.format_version, FormatVersion::V1); + assert_eq!(metadata.location, TEST_LOCATION); + assert_eq!(metadata.current_schema_id, 0); + assert_eq!(metadata.default_spec.spec_id(), 0); + assert_eq!(metadata.default_sort_order_id, 1); + assert_eq!(metadata.last_partition_id, 1000); + assert_eq!(metadata.last_column_id, 2); + assert_eq!(metadata.snapshots.len(), 0); + assert_eq!(metadata.refs.len(), 0); + assert_eq!(metadata.properties.len(), 0); + assert_eq!(metadata.metadata_log.len(), 0); + assert_eq!(metadata.last_sequence_number, 0); + assert_eq!(metadata.last_column_id, LAST_ASSIGNED_COLUMN_ID); + + // Test can serialize v1 + let _ = serde_json::to_string(&metadata).unwrap(); + + // Test can serialize v2 + let metadata = metadata + .into_builder("s3://bucket/test/location/metadata/metadata1.json") + .upgrade_format_version(FormatVersion::V2) + .unwrap() + .build() + .unwrap() + .metadata; + + assert_eq!(metadata.format_version, FormatVersion::V2); + let _ = serde_json::to_string(&metadata).unwrap(); + } + + #[test] + fn test_reassigns_ids() { + let schema = Schema::builder() + .with_schema_id(10) + 
.with_fields(vec![ + NestedField::required(11, "a", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(12, "b", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required( + 13, + "struct", + Type::Struct(StructType::new(vec![NestedField::required( + 14, + "nested", + Type::Primitive(PrimitiveType::Long), + ) + .into()])), + ) + .into(), + NestedField::required(15, "c", Type::Primitive(PrimitiveType::Long)).into(), + ]) + .build() + .unwrap(); + let spec = PartitionSpec::builder(schema.clone()) + .with_spec_id(20) + .add_partition_field("a", "a", Transform::Identity) + .unwrap() + .add_partition_field("struct.nested", "nested_partition", Transform::Identity) + .unwrap() + .build() + .unwrap(); + let sort_order = SortOrder::builder() + .with_fields(vec![SortField { + source_id: 11, + transform: Transform::Identity, + direction: SortDirection::Ascending, + null_order: NullOrder::First, + }]) + .with_order_id(10) + .build(&schema) + .unwrap(); + + let (fresh_schema, fresh_spec, fresh_sort_order) = + TableMetadataBuilder::reassign_ids(schema, spec.into_unbound(), sort_order).unwrap(); + + let expected_schema = Schema::builder() + .with_fields(vec![ + NestedField::required(0, "a", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(1, "b", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required( + 2, + "struct", + Type::Struct(StructType::new(vec![NestedField::required( + 4, + "nested", + Type::Primitive(PrimitiveType::Long), + ) + .into()])), + ) + .into(), + NestedField::required(3, "c", Type::Primitive(PrimitiveType::Long)).into(), + ]) + .build() + .unwrap(); + + let expected_spec = PartitionSpec::builder(expected_schema.clone()) + .with_spec_id(0) + .add_partition_field("a", "a", Transform::Identity) + .unwrap() + .add_partition_field("struct.nested", "nested_partition", Transform::Identity) + .unwrap() + .build() + .unwrap(); + + let expected_sort_order = SortOrder::builder() + .with_fields(vec![SortField { + source_id: 0, + transform: Transform::Identity, + direction: SortDirection::Ascending, + null_order: NullOrder::First, + }]) + .with_order_id(1) + .build(&expected_schema) + .unwrap(); + + assert_eq!(fresh_schema, expected_schema); + assert_eq!(fresh_spec, expected_spec); + assert_eq!(fresh_sort_order, expected_sort_order); + } + + #[test] + fn test_ids_are_reassigned_for_new_metadata() { + let schema = schema().into_builder().with_schema_id(10).build().unwrap(); + + let metadata = TableMetadataBuilder::new( + schema, + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + FormatVersion::V1, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .metadata; + + assert_eq!(metadata.current_schema_id, 0); + assert_eq!(metadata.current_schema().schema_id(), 0); + } + + #[test] + fn test_new_metadata_changes() { + let changes = TableMetadataBuilder::new( + schema(), + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + FormatVersion::V1, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .changes; + + pretty_assertions::assert_eq!(changes, vec![ + TableUpdate::SetLocation { + location: TEST_LOCATION.to_string() + }, + TableUpdate::AddSchema { + last_column_id: Some(LAST_ASSIGNED_COLUMN_ID), + schema: schema(), + }, + TableUpdate::SetCurrentSchema { schema_id: -1 }, + TableUpdate::AddSpec { + // Because this is a new tables, field-ids are assigned + // partition_spec() has None set for field-id + spec: PartitionSpec::builder(schema()) + .with_spec_id(0) + 
.add_unbound_field(UnboundPartitionField { + name: "y".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: Some(1000) + }) + .unwrap() + .build() + .unwrap() + .into_unbound(), + }, + TableUpdate::SetDefaultSpec { spec_id: -1 }, + TableUpdate::AddSortOrder { + sort_order: sort_order(), + }, + TableUpdate::SetDefaultSortOrder { sort_order_id: -1 }, + ]); + } + + #[test] + fn test_add_partition_spec() { + let builder = builder_without_changes(FormatVersion::V2); + + let added_spec = UnboundPartitionSpec::builder() + .with_spec_id(10) + .add_partition_fields(vec![ + UnboundPartitionField { + // The previous field - has field_id set + name: "y".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: Some(1000), + }, + UnboundPartitionField { + // A new field without field id - should still be without field id in changes + name: "z".to_string(), + transform: Transform::Identity, + source_id: 2, + field_id: None, + }, + ]) + .unwrap() + .build(); + + let build_result = builder + .add_partition_spec(added_spec.clone()) + .unwrap() + .build() + .unwrap(); + + // Spec id should be re-assigned + let expected_change = added_spec.with_spec_id(1); + let expected_spec = PartitionSpec::builder(schema()) + .with_spec_id(1) + .add_unbound_field(UnboundPartitionField { + name: "y".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: Some(1000), + }) + .unwrap() + .add_unbound_field(UnboundPartitionField { + name: "z".to_string(), + transform: Transform::Identity, + source_id: 2, + field_id: Some(1001), + }) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 1); + assert_eq!( + build_result.metadata.partition_spec_by_id(1), + Some(&Arc::new(expected_spec.into_schemaless())) + ); + assert_eq!(build_result.metadata.default_spec.spec_id(), 0); + assert_eq!(build_result.metadata.last_partition_id, 1001); + pretty_assertions::assert_eq!(build_result.changes[0], TableUpdate::AddSpec { + spec: expected_change + }); + } + + #[test] + fn test_set_default_partition_spec() { + let builder = builder_without_changes(FormatVersion::V2); + let schema = builder.get_current_schema().unwrap().clone(); + let added_spec = UnboundPartitionSpec::builder() + .with_spec_id(10) + .add_partition_field(1, "y_bucket[2]", Transform::Bucket(2)) + .unwrap() + .build(); + + let build_result = builder + .add_partition_spec(added_spec.clone()) + .unwrap() + .set_default_partition_spec(-1) + .unwrap() + .build() + .unwrap(); + + let expected_spec = PartitionSpec::builder(schema) + .with_spec_id(1) + .add_unbound_field(UnboundPartitionField { + name: "y_bucket[2]".to_string(), + transform: Transform::Bucket(2), + source_id: 1, + field_id: Some(1001), + }) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 2); + assert_eq!(build_result.metadata.default_spec, Arc::new(expected_spec)); + assert_eq!(build_result.changes, vec![ + TableUpdate::AddSpec { + // Should contain the actual ID that was used + spec: added_spec.with_spec_id(1) + }, + TableUpdate::SetDefaultSpec { spec_id: -1 } + ]); + } + + #[test] + fn test_set_existing_default_partition_spec() { + let builder = builder_without_changes(FormatVersion::V2); + // Add and set an unbound spec as current + let unbound_spec = UnboundPartitionSpec::builder().with_spec_id(1).build(); + let build_result = builder + .add_partition_spec(unbound_spec.clone()) + .unwrap() + .set_default_partition_spec(-1) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 2); 
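+        // The two recorded changes are expected to be AddSpec (carrying the re-assigned
+        // spec id) followed by SetDefaultSpec { spec_id: -1 }, as asserted below.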
+ assert_eq!(build_result.changes[0], TableUpdate::AddSpec { + spec: unbound_spec.clone() + }); + assert_eq!(build_result.changes[1], TableUpdate::SetDefaultSpec { + spec_id: -1 + }); + assert_eq!( + build_result.metadata.default_spec, + Arc::new( + unbound_spec + .bind(build_result.metadata.current_schema().clone()) + .unwrap() + ) + ); + + // Set old spec again + let build_result = build_result + .metadata + .into_builder("s3://bucket/test/location/metadata/metadata1.json") + .set_default_partition_spec(0) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 1); + assert_eq!(build_result.changes[0], TableUpdate::SetDefaultSpec { + spec_id: 0 + }); + assert_eq!( + build_result.metadata.default_spec, + Arc::new( + partition_spec() + .bind(build_result.metadata.current_schema().clone()) + .unwrap() + ) + ); + } + + #[test] + fn test_add_sort_order() { + let builder = builder_without_changes(FormatVersion::V2); + + let added_sort_order = SortOrder::builder() + .with_order_id(10) + .with_fields(vec![SortField { + source_id: 1, + transform: Transform::Identity, + direction: SortDirection::Ascending, + null_order: NullOrder::First, + }]) + .build(&schema()) + .unwrap(); + + let build_result = builder + .add_sort_order(added_sort_order.clone()) + .unwrap() + .build() + .unwrap(); + + let expected_sort_order = added_sort_order.with_order_id(2); + + assert_eq!(build_result.changes.len(), 1); + assert_eq!(build_result.metadata.sort_orders.keys().max(), Some(&2)); + pretty_assertions::assert_eq!( + build_result.metadata.sort_order_by_id(2), + Some(&Arc::new(expected_sort_order.clone())) + ); + pretty_assertions::assert_eq!(build_result.changes[0], TableUpdate::AddSortOrder { + sort_order: expected_sort_order + }); + } + + #[test] + fn test_add_compatible_schema() { + let builder = builder_without_changes(FormatVersion::V2); + + let added_schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(0, "x", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(1, "y", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(2, "z", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(3, "a", Type::Primitive(PrimitiveType::Long)).into(), + ]) + .build() + .unwrap(); + + let build_result = builder + .add_current_schema(added_schema.clone()) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 2); + assert_eq!(build_result.metadata.schemas.keys().max(), Some(&1)); + pretty_assertions::assert_eq!( + build_result.metadata.schema_by_id(1), + Some(&Arc::new(added_schema.clone())) + ); + pretty_assertions::assert_eq!(build_result.changes[0], TableUpdate::AddSchema { + last_column_id: Some(3), + schema: added_schema + }); + assert_eq!(build_result.changes[1], TableUpdate::SetCurrentSchema { + schema_id: -1 + }); + } + + #[test] + fn test_set_current_schema_change_is_minus_one_if_schema_was_added_in_this_change() { + let builder = builder_without_changes(FormatVersion::V2); + + let added_schema = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(0, "x", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(1, "y", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(2, "z", Type::Primitive(PrimitiveType::Long)).into(), + NestedField::required(3, "a", Type::Primitive(PrimitiveType::Long)).into(), + ]) + .build() + .unwrap(); + + let build_result = builder + .add_schema(added_schema.clone()) + .set_current_schema(1) + 
.unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.changes.len(), 2); + assert_eq!(build_result.changes[1], TableUpdate::SetCurrentSchema { + schema_id: -1 + }); + } + + #[test] + fn test_no_metadata_log_for_create_table() { + let build_result = TableMetadataBuilder::new( + schema(), + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + FormatVersion::V2, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap(); + + assert_eq!(build_result.metadata.metadata_log.len(), 0); + } + + #[test] + fn test_from_metadata_generates_metadata_log() { + let metadata_path = "s3://bucket/test/location/metadata/metadata1.json"; + let builder = TableMetadataBuilder::new( + schema(), + partition_spec(), + sort_order(), + TEST_LOCATION.to_string(), + FormatVersion::V2, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .metadata + .into_builder(metadata_path); + + let builder = builder + .add_default_sort_order(SortOrder::unsorted_order()) + .unwrap(); + + let build_result = builder.build().unwrap(); + + assert_eq!(build_result.metadata.metadata_log.len(), 1); + assert_eq!( + build_result.metadata.metadata_log[0].metadata_file, + metadata_path + ); + } + + #[test] + fn test_set_ref() { + let builder = builder_without_changes(FormatVersion::V2); + + let snapshot = Snapshot::builder() + .with_snapshot_id(1) + .with_timestamp_ms(builder.last_updated_ms() + 1) + .with_sequence_number(0) + .with_schema_id(0) + .with_manifest_list("/snap-1.avro") + .with_summary(Summary { + operation: Operation::Append, + other: HashMap::from_iter(vec![ + ( + "spark.app.id".to_string(), + "local-1662532784305".to_string(), + ), + ("added-data-files".to_string(), "4".to_string()), + ("added-records".to_string(), "4".to_string()), + ("added-files-size".to_string(), "6001".to_string()), + ]), + }) + .build(); + + let builder = builder.add_snapshot(snapshot.clone()).unwrap(); + + assert!(builder + .clone() + .set_ref(MAIN_BRANCH, SnapshotReference { + snapshot_id: 10, + retention: SnapshotRetention::Branch { + min_snapshots_to_keep: Some(10), + max_snapshot_age_ms: None, + max_ref_age_ms: None, + }, + }) + .unwrap_err() + .to_string() + .contains("Cannot set 'main' to unknown snapshot: '10'")); + + let build_result = builder + .set_ref(MAIN_BRANCH, SnapshotReference { + snapshot_id: 1, + retention: SnapshotRetention::Branch { + min_snapshots_to_keep: Some(10), + max_snapshot_age_ms: None, + max_ref_age_ms: None, + }, + }) + .unwrap() + .build() + .unwrap(); + assert_eq!(build_result.metadata.snapshots.len(), 1); + assert_eq!( + build_result.metadata.snapshot_by_id(1), + Some(&Arc::new(snapshot.clone())) + ); + assert_eq!(build_result.metadata.snapshot_log, vec![SnapshotLog { + snapshot_id: 1, + timestamp_ms: snapshot.timestamp_ms() + }]) + } + + #[test] + fn test_snapshot_log_skips_intermediates() { + let builder = builder_without_changes(FormatVersion::V2); + + let snapshot_1 = Snapshot::builder() + .with_snapshot_id(1) + .with_timestamp_ms(builder.last_updated_ms() + 1) + .with_sequence_number(0) + .with_schema_id(0) + .with_manifest_list("/snap-1.avro") + .with_summary(Summary { + operation: Operation::Append, + other: HashMap::from_iter(vec![ + ( + "spark.app.id".to_string(), + "local-1662532784305".to_string(), + ), + ("added-data-files".to_string(), "4".to_string()), + ("added-records".to_string(), "4".to_string()), + ("added-files-size".to_string(), "6001".to_string()), + ]), + }) + .build(); + + let snapshot_2 = Snapshot::builder() + .with_snapshot_id(2) + 
+    #[test]
+    fn test_snapshot_log_skips_intermediates() {
+        let builder = builder_without_changes(FormatVersion::V2);
+
+        let snapshot_1 = Snapshot::builder()
+            .with_snapshot_id(1)
+            .with_timestamp_ms(builder.last_updated_ms() + 1)
+            .with_sequence_number(0)
+            .with_schema_id(0)
+            .with_manifest_list("/snap-1.avro")
+            .with_summary(Summary {
+                operation: Operation::Append,
+                other: HashMap::from_iter(vec![
+                    (
+                        "spark.app.id".to_string(),
+                        "local-1662532784305".to_string(),
+                    ),
+                    ("added-data-files".to_string(), "4".to_string()),
+                    ("added-records".to_string(), "4".to_string()),
+                    ("added-files-size".to_string(), "6001".to_string()),
+                ]),
+            })
+            .build();
+
+        let snapshot_2 = Snapshot::builder()
+            .with_snapshot_id(2)
+            .with_timestamp_ms(builder.last_updated_ms() + 1)
+            .with_sequence_number(0)
+            .with_schema_id(0)
+            .with_manifest_list("/snap-1.avro")
+            .with_summary(Summary {
+                operation: Operation::Append,
+                other: HashMap::from_iter(vec![
+                    (
+                        "spark.app.id".to_string(),
+                        "local-1662532784305".to_string(),
+                    ),
+                    ("added-data-files".to_string(), "4".to_string()),
+                    ("added-records".to_string(), "4".to_string()),
+                    ("added-files-size".to_string(), "6001".to_string()),
+                ]),
+            })
+            .build();
+
+        let result = builder
+            .add_snapshot(snapshot_1)
+            .unwrap()
+            .set_ref(MAIN_BRANCH, SnapshotReference {
+                snapshot_id: 1,
+                retention: SnapshotRetention::Branch {
+                    min_snapshots_to_keep: Some(10),
+                    max_snapshot_age_ms: None,
+                    max_ref_age_ms: None,
+                },
+            })
+            .unwrap()
+            .append_snapshot(snapshot_2.clone(), Some(MAIN_BRANCH))
+            .unwrap()
+            .build()
+            .unwrap();
+
+        assert_eq!(result.metadata.snapshot_log, vec![SnapshotLog {
+            snapshot_id: 2,
+            timestamp_ms: snapshot_2.timestamp_ms()
+        }]);
+        assert_eq!(result.metadata.current_snapshot().unwrap().snapshot_id(), 2);
+    }
+
+    #[test]
+    fn test_cannot_add_duplicate_snapshot_id() {
+        let builder = builder_without_changes(FormatVersion::V2);
+
+        let snapshot = Snapshot::builder()
+            .with_snapshot_id(2)
+            .with_timestamp_ms(builder.last_updated_ms() + 1)
+            .with_sequence_number(0)
+            .with_schema_id(0)
+            .with_manifest_list("/snap-1.avro")
+            .with_summary(Summary {
+                operation: Operation::Append,
+                other: HashMap::from_iter(vec![
+                    (
+                        "spark.app.id".to_string(),
+                        "local-1662532784305".to_string(),
+                    ),
+                    ("added-data-files".to_string(), "4".to_string()),
+                    ("added-records".to_string(), "4".to_string()),
+                    ("added-files-size".to_string(), "6001".to_string()),
+                ]),
+            })
+            .build();
+
+        let builder = builder.add_snapshot(snapshot.clone()).unwrap();
+        builder.add_snapshot(snapshot).unwrap_err();
+    }
+
+    #[test]
+    fn test_add_incompatible_current_schema_fails() {
+        let builder = builder_without_changes(FormatVersion::V2);
+
+        let added_schema = Schema::builder()
+            .with_schema_id(1)
+            .with_fields(vec![])
+            .build()
+            .unwrap();
+
+        let err = builder
+            .add_current_schema(added_schema)
+            .unwrap()
+            .build()
+            .unwrap_err();
+
+        assert!(err
+            .to_string()
+            .contains("Cannot find partition source field"));
+    }
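+
+    // Format version 1 assigns partition field ids sequentially starting at
+    // 1000, so unbound specs whose explicit field ids leave gaps (1000 and
+    // 1002 below) are expected to be rejected for V1 tables.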
+    #[test]
+    fn test_add_partition_spec_for_v1_requires_sequential_ids() {
+        let builder = builder_without_changes(FormatVersion::V1);
+
+        let added_spec = UnboundPartitionSpec::builder()
+            .with_spec_id(10)
+            .add_partition_fields(vec![
+                UnboundPartitionField {
+                    name: "y".to_string(),
+                    transform: Transform::Identity,
+                    source_id: 1,
+                    field_id: Some(1000),
+                },
+                UnboundPartitionField {
+                    name: "z".to_string(),
+                    transform: Transform::Identity,
+                    source_id: 2,
+                    field_id: Some(1002),
+                },
+            ])
+            .unwrap()
+            .build();
+
+        let err = builder.add_partition_spec(added_spec).unwrap_err();
+        assert!(err.to_string().contains(
+            "Cannot add partition spec with non-sequential field ids to format version 1 table"
+        ));
+    }
+}
diff --git a/crates/iceberg/src/spec/values.rs b/crates/iceberg/src/spec/values.rs
index 3568d3dcd..3c6e2aa68 100644
--- a/crates/iceberg/src/spec/values.rs
+++ b/crates/iceberg/src/spec/values.rs
@@ -3192,10 +3192,10 @@ mod tests {
             (Literal::Primitive(PrimitiveLiteral::Int(3)), None),
         ])),
         &Type::Map(MapType {
-            key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::Int))
+            key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::Int))
                 .into(),
             value_field: NestedField::map_value_element(
-                1,
+                3,
                 Type::Primitive(PrimitiveType::Long),
                 false,
             )
@@ -3219,10 +3219,10 @@
             ),
         ])),
         &Type::Map(MapType {
-            key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::Int))
+            key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::Int))
                 .into(),
             value_field: NestedField::map_value_element(
-                1,
+                3,
                 Type::Primitive(PrimitiveType::Long),
                 true,
             )
@@ -3249,10 +3249,10 @@
             ),
         ])),
         &Type::Map(MapType {
-            key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::String))
+            key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::String))
                 .into(),
             value_field: NestedField::map_value_element(
-                1,
+                3,
                 Type::Primitive(PrimitiveType::Int),
                 false,
             )
@@ -3276,10 +3276,10 @@
             ),
         ])),
         &Type::Map(MapType {
-            key_field: NestedField::map_key_element(0, Type::Primitive(PrimitiveType::String))
+            key_field: NestedField::map_key_element(2, Type::Primitive(PrimitiveType::String))
                 .into(),
             value_field: NestedField::map_value_element(
-                1,
+                3,
                 Type::Primitive(PrimitiveType::Int),
                 true,
             )
@@ -3299,9 +3299,9 @@
             None,
         ])),
         &Type::Struct(StructType::new(vec![
-            NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
-            NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(),
-            NestedField::optional(3, "address", Type::Primitive(PrimitiveType::String)).into(),
+            NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(),
+            NestedField::optional(3, "name", Type::Primitive(PrimitiveType::String)).into(),
+            NestedField::optional(4, "address", Type::Primitive(PrimitiveType::String)).into(),
         ])),
     );
 }
diff --git a/crates/iceberg/src/writer/file_writer/location_generator.rs b/crates/iceberg/src/writer/file_writer/location_generator.rs
index 44326190d..1d5dedda1 100644
--- a/crates/iceberg/src/writer/file_writer/location_generator.rs
+++ b/crates/iceberg/src/writer/file_writer/location_generator.rs
@@ -132,7 +132,7 @@ pub(crate) mod test {
     use uuid::Uuid;
 
     use super::LocationGenerator;
-    use crate::spec::{FormatVersion, TableMetadata};
+    use crate::spec::{FormatVersion, PartitionSpec, TableMetadata};
     use crate::writer::file_writer::location_generator::{
         FileNameGenerator, WRITE_DATA_LOCATION, WRITE_FOLDER_STORAGE_LOCATION,
     };
@@ -156,6 +156,7 @@ pub(crate) mod test {
 
     #[test]
     fn test_default_location_generate() {
+        let schema = crate::spec::Schema::builder().build().unwrap();
         let mut table_metadata = TableMetadata {
             format_version: FormatVersion::V2,
             table_uuid: Uuid::parse_str("fb072c92-a02b-11e9-ae9c-1bb7bc9eca94").unwrap(),
@@ -165,7 +166,7 @@
             schemas: HashMap::new(),
             current_schema_id: 1,
             partition_specs: HashMap::new(),
-            default_spec_id: 1,
+            default_spec: PartitionSpec::unpartition_spec(schema).into(),
             last_partition_id: 1000,
             default_sort_order_id: 0,
             sort_orders: HashMap::from_iter(vec![]),