From 92eeb45811055e0f055c21ac28e536a41156e57f Mon Sep 17 00:00:00 2001
From: Frank Tang <ftang@chromium.org>
Date: Thu, 26 Oct 2023 16:24:36 -0700
Subject: [PATCH] ICU-22547 Fix addLikelySubtags for 4-character script codes

Also fix ICU-22546 by correcting the maximization examples in the API doc
comments, and add unit tests that cover those documented examples.
---
 icu4c/source/common/loclikely.cpp             |  9 ++++-
 icu4c/source/common/unicode/locid.h           | 10 ++---
 icu4c/source/common/unicode/uloc.h            |  9 +++--
 icu4c/source/test/cintltst/cloctst.c          | 38 ++++++++++++++++--
 icu4c/source/test/intltest/loctest.cpp        | 39 +++++++++++++++++++
 .../ibm/icu/dev/test/util/ULocaleTest.java    | 32 +++++++++++++++
 .../main/java/com/ibm/icu/util/ULocale.java   |  8 ++--
 7 files changed, 127 insertions(+), 18 deletions(-)

diff --git a/icu4c/source/common/loclikely.cpp b/icu4c/source/common/loclikely.cpp
index eedfb8149e26..c7cf21486e0c 100644
--- a/icu4c/source/common/loclikely.cpp
+++ b/icu4c/source/common/loclikely.cpp
@@ -467,7 +467,14 @@ _uloc_addLikelySubtags(const char* localeID,
         goto error;
     }
     if (langLength > 3) {
-        goto error;
+        if (langLength == 4 && scriptLength == 0) {
+            langLength = 0;
+            scriptLength = 4;
+            uprv_memcpy(script, lang, 4);
+            lang[0] = '\0';
+        } else {
+            goto error;
+        }
     }
 
     /* Find the length of the trailing portion. */
diff --git a/icu4c/source/common/unicode/locid.h b/icu4c/source/common/unicode/locid.h
index f0bdc7ca5141..98bc28c0df0b 100644
--- a/icu4c/source/common/unicode/locid.h
+++ b/icu4c/source/common/unicode/locid.h
@@ -518,20 +518,20 @@ class U_COMMON_API Locale : public UObject {
      * If this Locale is already in the maximal form, or not valid, or there is
      * no data available for maximization, the Locale will be unchanged.
      *
-     * For example, "und-Zzzz" cannot be maximized, since there is no
+     * For example, "sh" cannot be maximized, since there is no
      * reasonable maximization.
      *
      * Examples:
      *
+     * "und_Zzzz" maximizes to "en_Latn_US"
+     *
      * "en" maximizes to "en_Latn_US"
      *
-     * "de" maximizes to "de_Latn_US"
+     * "de" maximizes to "de_Latn_DE"
      *
      * "sr" maximizes to "sr_Cyrl_RS"
      *
-     * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
-     *
-     * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
+     * "zh_Hani" maximizes to "zh_Hani_CN"
      *
      * @param status  error information if maximizing this Locale failed.
      *                If this Locale is not well-formed, the error code is
diff --git a/icu4c/source/common/unicode/uloc.h b/icu4c/source/common/unicode/uloc.h
index 21179c1b628e..7f98a098574f 100644
--- a/icu4c/source/common/unicode/uloc.h
+++ b/icu4c/source/common/unicode/uloc.h
@@ -1158,19 +1158,20 @@ uloc_getLocaleForLCID(uint32_t hostID, char *locale, int32_t localeCapacity,
  *
  * If localeID is already in the maximal form, or there is no data available
  * for maximization, it will be copied to the output buffer.  For example,
- * "und-Zzzz" cannot be maximized, since there is no reasonable maximization.
+ * "sh" cannot be maximized, since there is no reasonable maximization.
  *
  * Examples:
  *
+ * "und_Zzzz" maximizes to "en_Latn_US"
+ *
  * "en" maximizes to "en_Latn_US"
  *
- * "de" maximizes to "de_Latn_US"
+ * "de" maximizes to "de_Latn_DE"
  *
  * "sr" maximizes to "sr_Cyrl_RS"
  *
- * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
+ * "zh_Hani" maximizes to "zh_Hani_CN"
  *
- * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
  *
  * @param localeID The locale to maximize
  * @param maximizedLocaleID The maximized locale
diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c
index 9490431cd8a5..f789c65b6232 100644
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@@ -3782,6 +3782,38 @@ const char* const basic_maximize_data[][2] = {
   }, {
     "_DE@em=emoji",
     "de_Latn_DE@em=emoji"
+  }, {
+    // ICU-22547
+    // unicode_language_id = "root" |
+    //   (unicode_language_subtag (sep unicode_script_subtag)?  | unicode_script_subtag)
+    //     (sep unicode_region_subtag)?  (sep unicode_variant_subtag)* ;
+    // so "aaaa" is a well-formed unicode_language_id
+    "aaaa",
+    "aaaa",
+  }, {
+    // ICU-22546
+    "und-Zzzz",
+    "en_Latn_US" // If change, please also update common/unicode/uloc.h
+  }, {
+    // ICU-22546
+    "en",
+    "en_Latn_US" // If change, please also update common/unicode/uloc.h
+  }, {
+    // ICU-22546
+    "de",
+    "de_Latn_DE" // If change, please also update common/unicode/uloc.h
+  }, {
+    // ICU-22546
+    "sr",
+    "sr_Cyrl_RS" // If change, please also update common/unicode/uloc.h
+  }, {
+    // ICU-22546
+    "sh",
+    "sh" // If change, please also update common/unicode/uloc.h
+  }, {
+    // ICU-22546
+    "zh_Hani",
+    "zh_Hani_CN" // If change, please also update common/unicode/uloc.h
   }
 };
 
@@ -6013,7 +6045,7 @@ static void TestLikelySubtags()
             }
         }
         else if (uprv_stricmp(maximal, buffer) != 0) {
-            log_err("  maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer);
+            log_err("1  maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %s\n", maximal, minimal, buffer);
         }
     }
 
@@ -6066,7 +6098,7 @@ static void TestLikelySubtags()
             }
         }
         else if (uprv_stricmp(maximal, buffer) != 0) {
-            log_err("  maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer);
+            log_err("2  maximal doesn't match expected \"%s\" in uloc_addLikelySubtags(), minimal \"%s\" = \"%s\"\n", maximal, minimal, buffer);
         }
     }
 
@@ -6128,7 +6160,7 @@ static void TestLikelySubtags()
         }
         else if (status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
             if (uprv_strnicmp(maximal, buffer, bufferSize) != 0) {
-                log_err("  maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n",
+                log_err("3  maximal doesn't match expected %s in uloc_addLikelySubtags(), minimal \"%s\" = %*s\n",
                     maximal, minimal, (int)sizeof(buffer), buffer);
             }
         }
diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp
index 546b49006cc2..4ac8927a708e 100644
--- a/icu4c/source/test/intltest/loctest.cpp
+++ b/icu4c/source/test/intltest/loctest.cpp
@@ -3842,6 +3842,45 @@ LocaleTest::TestAddLikelyAndMinimizeSubtags() {
             "und_US",
             "en_Latn_US",
             "en"
+        }, {
+            // ICU-22547
+            // unicode_language_id = "root" |
+            //   (unicode_language_subtag (sep unicode_script_subtag)?  | unicode_script_subtag)
+            //     (sep unicode_region_subtag)?  (sep unicode_variant_subtag)* ;
+            // so "aaaa" is a well-formed unicode_language_id
+            "aaaa",
+            "aaaa",
+            "aaaa",
+        }, {
+            // ICU-22546
+            "und-Zzzz",
+            "en_Latn_US", // If change, please also update common/unicode/locid.h
+            "en"
+        }, {
+            // ICU-22546
+            "en",
+            "en_Latn_US", // If change, please also update common/unicode/locid.h
+            "en"
+        }, {
+            // ICU-22546
+            "de",
+            "de_Latn_DE", // If change, please also update common/unicode/locid.h
+            "de"
+        }, {
+            // ICU-22546
+            "sr",
+            "sr_Cyrl_RS", // If change, please also update common/unicode/locid.h
+            "sr"
+        }, {
+            // ICU-22546
+            "sh",
+            "sh",// If change, please also update common/unicode/locid.h
+            "sh"
+        }, {
+            // ICU-22546
+            "zh_Hani",
+            "zh_Hani_CN", // If change, please also update common/unicode/locid.h
+            "zh_Hani"
         }
     };
 
diff --git a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
index 5f1903a28a26..4b08f3608bc3 100644
--- a/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
+++ b/icu4j/main/common_tests/src/test/java/com/ibm/icu/dev/test/util/ULocaleTest.java
@@ -1917,6 +1917,38 @@ public void TestAddLikelySubtags() {
                 }, {
                     "zzz",
                     ""
+                }, {
+                    // ICU-22547
+                    // unicode_language_id = "root" |
+                    //   (unicode_language_subtag (sep unicode_script_subtag)?  | unicode_script_subtag)
+                    //     (sep unicode_region_subtag)?  (sep unicode_variant_subtag)* ;
+                    // so "aaaa" is a well-formed unicode_language_id
+                    "aaaa",
+                    "aaaa",
+                }, {
+                    // ICU-22546
+                    "und-Zzzz",
+                    "en_Latn_US" // If change, please also update ULocale.java
+                }, {
+                    // ICU-22546
+                    "en",
+                    "en_Latn_US" // If change, please also update ULocale.java
+                }, {
+                    // ICU-22546
+                    "de",
+                    "de_Latn_DE" // If change, please also update ULocale.java
+                }, {
+                    // ICU-22546
+                    "sr",
+                    "sr_Cyrl_RS" // If change, please also update ULocale.java
+                }, {
+                    // ICU-22546
+                    "sh",
+                    "sh" // If change, please also update ULocale.java
+                }, {
+                    // ICU-22546
+                    "zh_Hani",
+                    "zh_Hani_CN" // If change, please also update ULocale.java
                 }
         };
 
diff --git a/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java b/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java
index e68396cf119b..d002bc2ee984 100644
--- a/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java
+++ b/icu4j/main/core/src/main/java/com/ibm/icu/util/ULocale.java
@@ -2694,20 +2694,18 @@ public static ULocale acceptLanguage(ULocale[] acceptLanguageList, boolean[] fal
      *
      * If the provided ULocale instance is already in the maximal form, or there is no
      * data available available for maximization, it will be returned.  For example,
-     * "und-Zzzz" cannot be maximized, since there is no reasonable maximization.
+     * "sh" cannot be maximized, since there is no reasonable maximization.
      * Otherwise, a new ULocale instance with the maximal form is returned.
      *
      * Examples:
      *
      * "en" maximizes to "en_Latn_US"
      *
-     * "de" maximizes to "de_Latn_US"
+     * "de" maximizes to "de_Latn_DE"
      *
      * "sr" maximizes to "sr_Cyrl_RS"
      *
-     * "sh" maximizes to "sr_Latn_RS" (Note this will not reverse.)
-     *
-     * "zh_Hani" maximizes to "zh_Hans_CN" (Note this will not reverse.)
+     * "zh_Hani" maximizes to "zh_Hani_CN"
      *
      * @param loc The ULocale to maximize
      * @return The maximized ULocale instance.