From 92eeb45811055e0f055c21ac28e536a41156e57f Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Thu, 26 Oct 2023 16:24:36 -0700 Subject: [PATCH] ICU-22547 fix addLikelySubtags for 4 chars script code Also fix ICU-22546 to correct the comments in the API doc and add additional unit tests --- icu4c/source/common/loclikely.cpp | 9 ++++- icu4c/source/common/unicode/locid.h | 10 ++--- icu4c/source/common/unicode/uloc.h | 9 +++-- icu4c/source/test/cintltst/cloctst.c | 38 ++++++++++++++++-- icu4c/source/test/intltest/loctest.cpp | 39 +++++++++++++++++++ .../ibm/icu/dev/test/util/ULocaleTest.java | 32 +++++++++++++++ .../main/java/com/ibm/icu/util/ULocale.java | 8 ++-- 7 files changed, 127 insertions(+), 18 deletions(-) diff --git a/icu4c/source/common/loclikely.cpp b/icu4c/source/common/loclikely.cpp index eedfb8149e26..c7cf21486e0c 100644 --- a/icu4c/source/common/loclikely.cpp +++ b/icu4c/source/common/loclikely.cpp @@ -467,7 +467,14 @@ _uloc_addLikelySubtags(const char* localeID, goto error; } if (langLength > 3) { - goto error; + if (langLength == 4 && scriptLength == 0) { + langLength = 0; + scriptLength = 4; + uprv_memcpy(script, lang, 4); + lang[0] = '\0'; + } else { + goto error; + } } /* Find the length of the trailing portion. */ diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h index f0bdc7ca5141..98bc28c0df0b 100644 --- a/icu4c/source/common/unicode/locid.h +++ b/icu4c/source/common/unicode/locid.h @@ -518,20 +518,20 @@ class U_COMMON_API Locale : public UObject { * If this Locale is already in the maximal form, or not valid, or there is * no data available for maximization, the Locale will be unchanged. * - * For example, "und-Zzzz" cannot be maximized, since there is no + * For example, "sh" cannot be maximized, since there is no * reasonable maximization. * * Examples: * + * "und_Zzzz" maximizes to "en_Latn_US" + * * "en" maximizes to "en_Latn_US" * - * "de" maximizes to "de_Latn_US" + * "de" maximizes to "de_Latn_DE" * * "sr" maximizes to "sr_Cyrl_RS" * - * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.) - * - * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.) + * "zh_Hani" maximizes to "zh_Hani_CN" * * @param status error information if maximizing this Locale failed. * If this Locale is not well-formed, the error code is diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h index 21179c1b628e..7f98a098574f 100644 --- a/icu4c/source/common/unicode/uloc.h +++ b/icu4c/source/common/unicode/uloc.h @@ -1158,19 +1158,20 @@ uloc_getLocaleForLCID(uint32_t hostID, char *locale, int32_t localeCapacity, * * If localeID is already in the maximal form, or there is no data available * for maximization, it will be copied to the output buffer. For example, - * "und-Zzzz" cannot be maximized, since there is no reasonable maximization. + * "sh" cannot be maximized, since there is no reasonable maximization. * * Examples: * + * "und_Zzzz" maximizes to "en_Latn_US" + * * "en" maximizes to "en_Latn_US" * - * "de" maximizes to "de_Latn_US" + * "de" maximizes to "de_Latn_DE" * * "sr" maximizes to "sr_Cyrl_RS" * - * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.) + * "zh_Hani" maximizes to "zh_Hani_CN" * - * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.) * * @param localeID The locale to maximize * @param maximizedLocaleID The maximized locale diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 9490431cd8a5..f789c65b6232 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -3782,6 +3782,38 @@ const char* const basic_maximize_data[][2] = { }, { "_DE@em=emoji", "de_Latn_DE@em=emoji" + }, { + // ICU-22547 + // unicode_language_id = "root" | + // (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag) + // (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ; + // so "aaaa" is a well-formed unicode_language_id + "aaaa", + "aaaa", + }, { + // ICU-22546 + "und-Zzzz", + "en_Latn_US" // If change, please also update common/unicode/uloc.h + }, { + // ICU-22546 + "en", + "en_Latn_US" // If change, please also update common/unicode/uloc.h + }, { + // ICU-22546 + "de", + "de_Latn_DE" // If change, please also update common/unicode/uloc.h + }, { + // ICU-22546 + "sr", + "sr_Cyrl_RS" // If change, please also update common/unicode/uloc.h + }, { + // ICU-22546 + "sh", + "sh" // If change, please also update common/unicode/uloc.h + }, { + // ICU-22546 + "zh_Hani", + "zh_Hani_CN" // If change, please also update common/unicode/uloc.h } }; @@ -6013,7 +6045,7 @@ static void TestLikelySubtags() } } else if (uprv_stricmp(maximal, buffer) != 0) { - log_err(" maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer); + log_err("1 maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer); } } @@ -6066,7 +6098,7 @@ static void TestLikelySubtags() } } else if (uprv_stricmp(maximal, buffer) != 0) { - log_err(" maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer); + log_err("2 maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer); } } @@ -6128,7 +6160,7 @@ static void TestLikelySubtags() } else if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) { if (uprv_strnicmp(maximal, buffer, bufferSize) != 0) { - log_err(" maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n", + log_err("3 maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n", maximal, minimal, (int)sizeof(buffer), buffer); } } diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index 546b49006cc2..4ac8927a708e 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -3842,6 +3842,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() { "und_US", "en_Latn_US", "en" + }, { + // ICU-22547 + // unicode_language_id = "root" | + // (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag) + // (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ; + // so "aaaa" is a well-formed unicode_language_id + "aaaa", + "aaaa", + "aaaa", + }, { + // ICU-22546 + "und-Zzzz", + "en_Latn_US", // If change, please also update common/unicode/locid.h + "en" + }, { + // ICU-22546 + "en", + "en_Latn_US", // If change, please also update common/unicode/locid.h + "en" + }, { + // ICU-22546 + "de", + "de_Latn_DE", // If change, please also update common/unicode/locid.h + "de" + }, { + // ICU-22546 + "sr", + "sr_Cyrl_RS", // If change, please also update common/unicode/locid.h + "sr" + }, { + // ICU-22546 + "sh", + "sh",// If change, please also update common/unicode/locid.h + "sh" + }, { + // ICU-22546 + "zh_Hani", + "zh_Hani_CN", // If change, please also update common/unicode/locid.h + "zh_Hani" } }; diff --git a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java index 5f1903a28a26..4b08f3608bc3 100644 --- a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java +++ b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java @@ -1917,6 +1917,38 @@ public void TestAddLikelySubtags() { }, { "zzz", "" + }, { + // ICU-22547 + // unicode_language_id = "root" | + // (unicode_language_subtag (sep unicode_script_subtag)? | unicode_script_subtag) + // (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ; + // so "aaaa" is a well-formed unicode_language_id + "aaaa", + "aaaa", + }, { + // ICU-22546 + "und-Zzzz", + "en_Latn_US" // If change, please also update ULocale.java + }, { + // ICU-22546 + "en", + "en_Latn_US" // If change, please also update ULocale.java + }, { + // ICU-22546 + "de", + "de_Latn_DE" // If change, please also update ULocale.java + }, { + // ICU-22546 + "sr", + "sr_Cyrl_RS" // If change, please also update ULocale.java + }, { + // ICU-22546 + "sh", + "sh" // If change, please also update ULocale.java + }, { + // ICU-22546 + "zh_Hani", + "zh_Hani_CN" // If change, please also update ULocale.java } }; diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java b/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java index e68396cf119b..d002bc2ee984 100644 --- a/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java +++ b/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java @@ -2694,20 +2694,18 @@ public static ULocale acceptLanguage(ULocale[] acceptLanguageList, boolean[] fal * * If the provided ULocale instance is already in the maximal form, or there is no * data available available for maximization, it will be returned. For example, - * "und-Zzzz" cannot be maximized, since there is no reasonable maximization. + * "sh" cannot be maximized, since there is no reasonable maximization. * Otherwise, a new ULocale instance with the maximal form is returned. * * Examples: * * "en" maximizes to "en_Latn_US" * - * "de" maximizes to "de_Latn_US" + * "de" maximizes to "de_Latn_DE" * * "sr" maximizes to "sr_Cyrl_RS" * - * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.) - * - * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.) + * "zh_Hani" maximizes to "zh_Hani_CN" * * @param loc The ULocale to maximize * @return The maximized ULocale instance.