Skip to content

Commit

Permalink
collator
Browse files Browse the repository at this point in the history
  • Loading branch information
robertbastian committed Jun 22, 2023
1 parent bfc4842 commit 68974fb
Show file tree
Hide file tree
Showing 11 changed files with 173 additions and 264 deletions.
3 changes: 2 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion components/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,20 @@ zerovec = { version = "0.9.4", path = "../../utils/zerovec" }
databake = { version = "0.1.3", path = "../../utils/databake", optional = true, features = ["derive"] }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }

icu_collator_data = { path = "data", optional = true }
icu_locid_transform = { path = "../../components/locid_transform", features = ["data"], optional = true }

[dev-dependencies]
arraystring = "0.3.0"
atoi = "1.0.0"
icu = { path = "../icu" }
icu_testdata = { path = "../../provider/testdata", default-features = false, features = ["icu_collator", "icu_normalizer", "icu_locid_transform"] }
criterion = "0.4"

[features]
std = ["icu_collections/std", "icu_locid/std", "icu_normalizer/std", "icu_properties/std", "icu_provider/std"]
serde = ["dep:serde", "zerovec/serde", "icu_properties/serde", "icu_normalizer/serde", "icu_collections/serde", "icu_provider/serde"]
datagen = ["serde", "dep:databake", "zerovec/databake", "icu_properties/datagen", "icu_normalizer/datagen", "icu_collections/databake"]
data = ["dep:icu_collator_data", "icu_normalizer/data", "dep:icu_locid_transform"]

[[test]]
name = "tests"
Expand Down
39 changes: 14 additions & 25 deletions components/collator/README.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 1 addition & 6 deletions components/collator/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,7 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
for (index, strength) in benched_strength.iter().enumerate() {
let mut options = CollatorOptions::new();
options.strength = Some(*strength);
let collator = Collator::try_new_unstable(
&icu_testdata::unstable(),
&locale_under_bench,
options,
)
.unwrap();
let collator = Collator::try_new(&locale_under_bench, options).unwrap();
// ICU4X collator performance, sort is locale-aware
group.bench_function(
BenchmarkId::new(
Expand Down
3 changes: 1 addition & 2 deletions components/collator/fuzz/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ rust_icu_sys = { version = "3", features = ["use-bindgen", "icu_config"] }
rust_icu_ustring = { version = "3", features = ["use-bindgen", "icu_config"] }
encoding_rs = "0.8.31"
icu_locid = { path = "../../../components/locid" }
icu_testdata = { path = "../../../provider/testdata" }
icu_collator = { path = ".." }
icu_collator = { path = "..", features = ["data"] }

# Prevent this from interfering with workspaces
[workspace]
Expand Down
2 changes: 1 addition & 1 deletion components/collator/fuzz/fuzz_targets/compare_utf16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ fn compare_icu4x(
) -> Ordering {
let locale: Locale = locale_str.parse().unwrap();

let collator: Collator = Collator::try_new_unstable(locale, &icu_testdata::unstable(), options).unwrap();
let collator: Collator = Collator::try_new(locale, options).unwrap();
collator.compare_utf16(left, right)
}

Expand Down
84 changes: 61 additions & 23 deletions components/collator/src/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,35 @@ pub struct Collator {

impl Collator {
/// Creates a collator for `locale` with the given `options`, using the data baked
/// into this crate and into `icu_normalizer`.
///
/// Only compiled when the `data` Cargo feature is enabled.
///
/// The decomposition, decomposition-table and Jamo payloads are wrapped from
/// static singletons up front. The special-primaries payload is handed over as a
/// closure, so the shared constructor decides whether it is needed at all.
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
#[cfg(feature = "data")]
pub fn try_new(locale: &DataLocale, options: CollatorOptions) -> Result<Self, CollatorError> {
    // Normalizer data comes from `icu_normalizer`'s baked provider.
    let decomposition_data: DataPayload<CanonicalDecompositionDataV1Marker> =
        DataPayload::from_static_ref(icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFD_V1);
    let decomposition_tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
        DataPayload::from_static_ref(
            icu_normalizer::provider::Baked::SINGLETON_NORMALIZER_NFDEX_V1,
        );

    // Collator-specific singletons come from this crate's baked provider.
    let jamo_table: DataPayload<CollationJamoV1Marker> =
        DataPayload::from_static_ref(crate::provider::Baked::SINGLETON_COLLATOR_JAMO_V1);

    // Deferred: wrapped only if the callee actually invokes the closure.
    let load_special_primaries =
        || -> Result<DataPayload<CollationSpecialPrimariesV1Marker>, DataError> {
            Ok(DataPayload::from_static_ref(
                crate::provider::Baked::SINGLETON_COLLATOR_PRIM_V1,
            ))
        };

    // Locale-dependent data is still loaded through the baked provider.
    Self::try_new_unstable_internal(
        &crate::provider::Baked,
        decomposition_data,
        decomposition_tables,
        jamo_table,
        load_special_primaries,
        locale,
        options,
    )
}

// Macro-generated constructors: each takes the locale plus `CollatorOptions`
// and fails with `CollatorError`.
// NOTE(review): `#[cfg(skip)]` presumably tells the macro not to emit its own
// compiled-data `try_new`, because this impl defines `try_new` by hand — confirm
// against the macro definition in `icu_provider`.
icu_provider::gen_any_buffer_data_constructors!(
locale: include,
options: CollatorOptions,
error: CollatorError,
#[cfg(skip)]
);

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new)]
pub fn try_new_unstable<D>(
data_provider: &D,
locale: &DataLocale,
Expand All @@ -89,6 +113,36 @@ impl Collator {
+ DataProvider<CanonicalDecompositionDataV1Marker>
+ DataProvider<CanonicalDecompositionTablesV1Marker>
+ ?Sized,
{
Self::try_new_unstable_internal(
data_provider,
data_provider.load(Default::default())?.take_payload()?,
data_provider.load(Default::default())?.take_payload()?,
data_provider.load(Default::default())?.take_payload()?,
|| data_provider.load(Default::default())?.take_payload(),
locale,
options,
)
}

fn try_new_unstable_internal<D>(
data_provider: &D,
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
jamo: DataPayload<CollationJamoV1Marker>,
special_primaries: impl FnOnce() -> Result<
DataPayload<CollationSpecialPrimariesV1Marker>,
DataError,
>,
locale: &DataLocale,
options: CollatorOptions,
) -> Result<Self, CollatorError>
where
D: DataProvider<CollationDataV1Marker>
+ DataProvider<CollationDiacriticsV1Marker>
+ DataProvider<CollationMetadataV1Marker>
+ DataProvider<CollationReorderingV1Marker>
+ ?Sized,
{
let req = DataRequest {
locale,
Expand Down Expand Up @@ -145,20 +199,11 @@ impl Collator {
return Err(CollatorError::MalformedData);
}

let jamo: DataPayload<CollationJamoV1Marker> = data_provider
.load(Default::default())? // TODO: redesign Korean search collation handling
.take_payload()?;

// TODO: redesign Korean search collation handling
if jamo.get().ce32s.len() != JAMO_COUNT {
return Err(CollatorError::MalformedData);
}

let decompositions: DataPayload<CanonicalDecompositionDataV1Marker> =
data_provider.load(Default::default())?.take_payload()?;

let tables: DataPayload<CanonicalDecompositionTablesV1Marker> =
data_provider.load(Default::default())?.take_payload()?;

let mut altered_defaults = CollatorOptionsBitField::new();

if metadata.alternate_shifted() {
Expand All @@ -177,8 +222,7 @@ impl Collator {
let special_primaries = if merged_options.alternate_handling() == AlternateHandling::Shifted
|| merged_options.numeric()
{
let special_primaries: DataPayload<CollationSpecialPrimariesV1Marker> =
data_provider.load(Default::default())?.take_payload()?;
let special_primaries = special_primaries()?;
// `variant_count` isn't stable yet:
// https://github.com/rust-lang/rust/issues/73662
if special_primaries.get().last_primaries.len() <= (MaxVariable::Currency as usize) {
Expand All @@ -203,12 +247,6 @@ impl Collator {
})
}

icu_provider::gen_any_buffer_constructors!(
locale: include,
options: CollatorOptions,
error: CollatorError
);

/// Compare potentially ill-formed UTF-16 slices. Unpaired surrogates
/// are compared as if each one was a REPLACEMENT CHARACTER.
pub fn compare_utf16(&self, left: &[u16], right: &[u16]) -> Ordering {
Expand Down
Loading

0 comments on commit 68974fb

Please sign in to comment.