src: remove icu based ToASCII and ToUnicode

PR-URL: #55156
Reviewed-By: James M Snell <[email protected]>
Reviewed-By: Matthew Aitken <[email protected]>
Reviewed-By: Daniel Lemire <[email protected]>
Reviewed-By: Richard Lau <[email protected]>
nodejs · Nov 27, 2024 · bde374e · bde374e
1 parent cac5856
commit bde374e
Show file tree

Hide file tree

Showing 4 changed files with 2 additions and 389 deletions.
diff --git a/src/node_i18n.cc b/src/node_i18n.cc
@@ -60,19 +60,17 @@
 #include <unicode/uchar.h>
 #include <unicode/uclean.h>
 #include <unicode/ucnv.h>
-#include <unicode/udata.h>
-#include <unicode/uidna.h>
 #include <unicode/ulocdata.h>
 #include <unicode/urename.h>
-#include <unicode/ustring.h>
 #include <unicode/utf16.h>
-#include <unicode/utf8.h>
 #include <unicode/utypes.h>
 #include <unicode/uvernum.h>
 #include <unicode/uversion.h>
 #include "nbytes.h"
 
 #ifdef NODE_HAVE_SMALL_ICU
+#include <unicode/udata.h>
+
 /* if this is defined, we have a 'secondary' entry point.
    compare following to utypes.h defs for U_ICUDATA_ENTRY_POINT */
 #define SMALL_ICUDATA_ENTRY_POINT \
@@ -96,7 +94,6 @@ using v8::Int32;
 using v8::Isolate;
 using v8::Local;
 using v8::MaybeLocal;
-using v8::NewStringType;
 using v8::Object;
 using v8::ObjectTemplate;
 using v8::String;
@@ -583,167 +580,6 @@ void SetDefaultTimeZone(const char* tzid) {
   CHECK(U_SUCCESS(status));
 }
 
-int32_t ToUnicode(MaybeStackBuffer<char>* buf,
-                  const char* input,
-                  size_t length) {
-  UErrorCode status = U_ZERO_ERROR;
-  uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE;
-  UIDNA* uidna = uidna_openUTS46(options, &status);
-  if (U_FAILURE(status))
-    return -1;
-  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-
-  int32_t len = uidna_nameToUnicodeUTF8(uidna,
-                                        input, length,
-                                        **buf, buf->capacity(),
-                                        &info,
-                                        &status);
-
-  // Do not check info.errors like we do with ToASCII since ToUnicode always
-  // returns a string, despite any possible errors that may have occurred.
-
-  if (status == U_BUFFER_OVERFLOW_ERROR) {
-    status = U_ZERO_ERROR;
-    buf->AllocateSufficientStorage(len);
-    len = uidna_nameToUnicodeUTF8(uidna,
-                                  input, length,
-                                  **buf, buf->capacity(),
-                                  &info,
-                                  &status);
-  }
-
-  // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode
-  // string, regardless of whether an error occurred.
-
-  if (U_FAILURE(status)) {
-    len = -1;
-    buf->SetLength(0);
-  } else {
-    buf->SetLength(len);
-  }
-
-  uidna_close(uidna);
-  return len;
-}
-
-int32_t ToASCII(MaybeStackBuffer<char>* buf,
-                const char* input,
-                size_t length,
-                idna_mode mode) {
-  UErrorCode status = U_ZERO_ERROR;
-  uint32_t options =                  // CheckHyphens = false; handled later
-    UIDNA_CHECK_BIDI |                // CheckBidi = true
-    UIDNA_CHECK_CONTEXTJ |            // CheckJoiners = true
-    UIDNA_NONTRANSITIONAL_TO_ASCII;   // Nontransitional_Processing
-  if (mode == idna_mode::kStrict) {
-    options |= UIDNA_USE_STD3_RULES;  // UseSTD3ASCIIRules = beStrict
-                                      // VerifyDnsLength = beStrict;
-                                      //   handled later
-  }
-
-  UIDNA* uidna = uidna_openUTS46(options, &status);
-  if (U_FAILURE(status))
-    return -1;
-  UIDNAInfo info = UIDNA_INFO_INITIALIZER;
-
-  int32_t len = uidna_nameToASCII_UTF8(uidna,
-                                       input, length,
-                                       **buf, buf->capacity(),
-                                       &info,
-                                       &status);
-
-  if (status == U_BUFFER_OVERFLOW_ERROR) {
-    status = U_ZERO_ERROR;
-    buf->AllocateSufficientStorage(len);
-    len = uidna_nameToASCII_UTF8(uidna,
-                                 input, length,
-                                 **buf, buf->capacity(),
-                                 &info,
-                                 &status);
-  }
-
-  // In UTS #46 which specifies ToASCII, certain error conditions are
-  // configurable through options, and the WHATWG URL Standard promptly elects
-  // to disable some of them to accommodate for real-world use cases.
-  // Unfortunately, ICU4C's IDNA module does not support disabling some of
-  // these options through `options` above, and thus continues throwing
-  // unnecessary errors. To counter this situation, we just filter out the
-  // errors that may have happened afterwards, before deciding whether to
-  // return an error from this function.
-
-  // CheckHyphens = false
-  // (Specified in the current UTS #46 draft rev. 18.)
-  // Refs:
-  // - https://github.com/whatwg/url/issues/53
-  // - https://github.com/whatwg/url/pull/309
-  // - http://www.unicode.org/review/pri317/
-  // - http://www.unicode.org/reports/tr46/tr46-18.html
-  // - https://www.icann.org/news/announcement-2000-01-07-en
-  info.errors &= ~UIDNA_ERROR_HYPHEN_3_4;
-  info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN;
-  info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN;
-
-  if (mode != idna_mode::kStrict) {
-    // VerifyDnsLength = beStrict
-    info.errors &= ~UIDNA_ERROR_EMPTY_LABEL;
-    info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG;
-    info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
-  }
-
-  if (U_FAILURE(status) || (mode != idna_mode::kLenient && info.errors != 0)) {
-    len = -1;
-    buf->SetLength(0);
-  } else {
-    buf->SetLength(len);
-  }
-
-  uidna_close(uidna);
-  return len;
-}
-
-static void ToUnicode(const FunctionCallbackInfo<Value>& args) {
-  Environment* env = Environment::GetCurrent(args);
-  CHECK_GE(args.Length(), 1);
-  CHECK(args[0]->IsString());
-  Utf8Value val(env->isolate(), args[0]);
-
-  MaybeStackBuffer<char> buf;
-  int32_t len = ToUnicode(&buf, *val, val.length());
-
-  if (len < 0) {
-    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to Unicode");
-  }
-
-  args.GetReturnValue().Set(
-      String::NewFromUtf8(env->isolate(),
-                          *buf,
-                          NewStringType::kNormal,
-                          len).ToLocalChecked());
-}
-
-static void ToASCII(const FunctionCallbackInfo<Value>& args) {
-  Environment* env = Environment::GetCurrent(args);
-  CHECK_GE(args.Length(), 1);
-  CHECK(args[0]->IsString());
-  Utf8Value val(env->isolate(), args[0]);
-  // optional arg
-  bool lenient = args[1]->BooleanValue(env->isolate());
-  idna_mode mode = lenient ? idna_mode::kLenient : idna_mode::kDefault;
-
-  MaybeStackBuffer<char> buf;
-  int32_t len = ToASCII(&buf, *val, val.length(), mode);
-
-  if (len < 0) {
-    return THROW_ERR_INVALID_ARG_VALUE(env, "Cannot convert name to ASCII");
-  }
-
-  args.GetReturnValue().Set(
-      String::NewFromUtf8(env->isolate(),
-                          *buf,
-                          NewStringType::kNormal,
-                          len).ToLocalChecked());
-}
-
 // This is similar to wcwidth except that it takes the current unicode
 // character properties database into consideration, allowing it to
 // correctly calculate the column widths of things like emoji's and
@@ -850,8 +686,6 @@ static void CreatePerIsolateProperties(IsolateData* isolate_data,
                                        Local<ObjectTemplate> target) {
   Isolate* isolate = isolate_data->isolate();
 
-  SetMethod(isolate, target, "toUnicode", ToUnicode);
-  SetMethod(isolate, target, "toASCII", ToASCII);
   SetMethod(isolate, target, "getStringWidth", GetStringWidth);
 
   // One-shot converters
@@ -880,8 +714,6 @@ void CreatePerContextProperties(Local<Object> target,
                                 void* priv) {}
 
 void RegisterExternalReferences(ExternalReferenceRegistry* registry) {
-  registry->Register(ToUnicode);
-  registry->Register(ToASCII);
   registry->Register(GetStringWidth);
   registry->Register(ICUErrorName);
   registry->Register(Transcode);

diff --git a/src/node_i18n.h b/src/node_i18n.h
@@ -53,19 +53,6 @@ enum class idna_mode {
   kStrict
 };
 
-// Implements the WHATWG URL Standard "domain to ASCII" algorithm.
-// https://url.spec.whatwg.org/#concept-domain-to-ascii
-int32_t ToASCII(MaybeStackBuffer<char>* buf,
-                const char* input,
-                size_t length,
-                idna_mode mode = idna_mode::kDefault);
-
-// Implements the WHATWG URL Standard "domain to Unicode" algorithm.
-// https://url.spec.whatwg.org/#concept-domain-to-unicode
-int32_t ToUnicode(MaybeStackBuffer<char>* buf,
-                  const char* input,
-                  size_t length);
-
 struct ConverterDeleter {
   void operator()(UConverter* pointer) const { ucnv_close(pointer); }
 };

diff --git a/test/fixtures/icu-punycode-toascii.json b/test/fixtures/icu-punycode-toascii.json