From b07e4b1eeb2f530ce4a2a980aa1f01289b8325ff Mon Sep 17 00:00:00 2001 From: Joshua Haberman Date: Wed, 17 Jul 2024 08:05:12 -0700 Subject: [PATCH] [Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253) This is a warning now, but will be an error in the next version. PiperOrigin-RevId: 653236128 --- ruby/Rakefile | 1 + ruby/ext/google/protobuf_c/convert.c | 50 +++++-- .../google/protobuf/ffi/internal/convert.rb | 15 +- .../java/com/google/protobuf/jruby/Utils.java | 19 ++- ruby/tests/BUILD.bazel | 10 ++ ruby/tests/utf8.proto | 9 ++ ruby/tests/utf8.rb | 136 ++++++++++++++++++ 7 files changed, 219 insertions(+), 21 deletions(-) create mode 100644 ruby/tests/utf8.proto create mode 100644 ruby/tests/utf8.rb diff --git a/ruby/Rakefile b/ruby/Rakefile index 92118af99c44..33fb568a3982 100644 --- a/ruby/Rakefile +++ b/ruby/Rakefile @@ -35,6 +35,7 @@ test_protos = %w[ tests/test_import_proto2.proto tests/test_ruby_package.proto tests/test_ruby_package_proto2.proto + tests/utf8.proto ] # These are omitted for now because we don't support proto2. diff --git a/ruby/ext/google/protobuf_c/convert.c b/ruby/ext/google/protobuf_c/convert.c index 1fae52446f70..90db70ebbde8 100644 --- a/ruby/ext/google/protobuf_c/convert.c +++ b/ruby/ext/google/protobuf_c/convert.c @@ -104,6 +104,41 @@ static int32_t Convert_ToEnum(VALUE value, const char* name, rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name); } +VALUE Convert_CheckStringUtf8(VALUE str) { + VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding()); + + if (rb_obj_encoding(str) == utf8) { + // Note: Just because a string is marked as having UTF-8 encoding does + // not mean that it is *valid* UTF-8. We have to check separately + // whether it is valid. + if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) { + // TODO: For now + // we only warn for this case. We will remove the warning and throw an + // exception below in the 30.x release + + rb_warn( + "String is invalid UTF-8. This will be an error in a future " + "version."); + // VALUE exc = rb_const_get_at( + // rb_cEncoding, rb_intern("InvalidByteSequenceError")); + // rb_raise(exc, "String is invalid UTF-8"); + } + } else { + // Note: this will not duplicate underlying string data unless + // necessary. + // + // This will throw an exception if the conversion cannot be performed: + // - Encoding::UndefinedConversionError if certain characters cannot be + // converted to UTF-8. + // - Encoding::InvalidByteSequenceError if certain characters were invalid + // in the source encoding. + str = rb_str_encode(str, utf8, 0, Qnil); + PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN); + } + + return str; +} + upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name, TypeInfo type_info, upb_Arena* arena) { upb_MessageValue ret; @@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name, } break; } - case kUpb_CType_String: { - VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding()); + case kUpb_CType_String: if (rb_obj_class(value) == rb_cSymbol) { value = rb_funcall(value, rb_intern("to_s"), 0); } else if (!rb_obj_is_kind_of(value, rb_cString)) { @@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name, rb_class2name(CLASS_OF(value))); } - if (rb_obj_encoding(value) != utf8) { - // Note: this will not duplicate underlying string data unless - // necessary. - value = rb_str_encode(value, utf8, 0, Qnil); - - if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) { - rb_raise(rb_eEncodingError, "String is invalid UTF-8"); - } - } - + value = Convert_CheckStringUtf8(value); ret.str_val = Convert_StringData(value, arena); break; - } case kUpb_CType_Bytes: { VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding()); if (rb_obj_class(value) != rb_cString) { diff --git a/ruby/lib/google/protobuf/ffi/internal/convert.rb b/ruby/lib/google/protobuf/ffi/internal/convert.rb index ff59f8cd4e7b..b22184e41979 100644 --- a/ruby/lib/google/protobuf/ffi/internal/convert.rb +++ b/ruby/lib/google/protobuf/ffi/internal/convert.rb @@ -33,11 +33,18 @@ def convert_ruby_to_upb(value, arena, c_type, msg_or_enum_def) return_value[:bool_val] = value when :string raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol) - begin + value = value.to_s if value.is_a?(Symbol) + if value.encoding == Encoding::UTF_8 + unless value.valid_encoding? + # TODO: + # For now we only warn for this case. We will remove the + # warning and throw an exception below in the 30.x release + warn "String is invalid UTF-8. This will be an error in a future version." + # raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8" + end + string_value = value + else string_value = value.to_s.encode("UTF-8") - rescue Encoding::UndefinedConversionError - # TODO - why not include the field name here? - raise Encoding::UndefinedConversionError.new "String is invalid UTF-8" end return_value[:str_val][:size] = string_value.bytesize return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize) diff --git a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java index 78aa57f1ef7c..c42bfd454d5d 100644 --- a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java +++ b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java @@ -40,6 +40,7 @@ import org.jcodings.specific.ASCIIEncoding; import org.jcodings.specific.UTF8Encoding; import org.jruby.*; +import org.jruby.common.RubyWarnings; import org.jruby.exceptions.RaiseException; import org.jruby.ext.bigdecimal.RubyBigDecimal; import org.jruby.runtime.Block; @@ -389,11 +390,21 @@ private static IRubyObject validateAndEncodeString( if (!(value instanceof RubyString)) throw createInvalidTypeError(context, fieldType, fieldName, value); + RubyString string = (RubyString) value; + if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) { + if (string.isCodeRangeBroken()) { + // TODO: For now we only warn for + // this case. We will remove the warning and throw an exception in the 30.x release + context + .runtime + .getWarnings() + .warn("String is invalid UTF-8. This will be an error in a future version."); + } + } + value = - ((RubyString) value) - .encode( - context, - context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding)); + string.encode( + context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding)); value.setFrozen(true); return value; } diff --git a/ruby/tests/BUILD.bazel b/ruby/tests/BUILD.bazel index 2b45725431eb..f61982eb1cb7 100644 --- a/ruby/tests/BUILD.bazel +++ b/ruby/tests/BUILD.bazel @@ -142,6 +142,16 @@ ruby_test( ], ) +ruby_test( + name = "utf8", + srcs = ["utf8.rb"], + deps = [ + ":test_ruby_protos", + "//ruby:protobuf", + "@protobuf_bundle//:test-unit", + ], +) + ruby_test( name = "well_known_types_test", srcs = ["well_known_types_test.rb"], diff --git a/ruby/tests/utf8.proto b/ruby/tests/utf8.proto new file mode 100644 index 000000000000..ca8bc2c0844c --- /dev/null +++ b/ruby/tests/utf8.proto @@ -0,0 +1,9 @@ +syntax = "proto2"; + +package utf8_test_protos; + +message TestUtf8 { + optional string optional_string = 1; + repeated string repeated_string = 2; + map map_string_string = 3; +} diff --git a/ruby/tests/utf8.rb b/ruby/tests/utf8.rb new file mode 100644 index 000000000000..6512e18aee75 --- /dev/null +++ b/ruby/tests/utf8.rb @@ -0,0 +1,136 @@ +#!/usr/bin/ruby + +require 'google/protobuf' +require 'utf8_pb' +require 'test/unit' + +module CaptureWarnings + @@warnings = nil + + module_function + + def warn(message, category: nil, **kwargs) + if @@warnings + @@warnings << message + else + super + end + end + + def capture + @@warnings = [] + yield + @@warnings + ensure + @@warnings = nil + end +end + +Warning.extend CaptureWarnings + +module Utf8Test + def test_scalar + msg = Utf8TestProtos::TestUtf8.new + assert_bad_utf8 { msg.optional_string = bad_utf8_string() } + end + + def test_repeated + msg = Utf8TestProtos::TestUtf8.new + assert_bad_utf8 { msg.repeated_string << bad_utf8_string() } + end + + def test_map_key + msg = Utf8TestProtos::TestUtf8.new + assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" } + end + + def test_map_value + msg = Utf8TestProtos::TestUtf8.new + assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() } + end +end + +# Tests the case of string objects that are marked UTF-8, but contain invalid +# UTF-8. +# +# For now these only warn, but in the next major version they will throw an +# exception. +class MarkedUtf8Test < Test::Unit::TestCase + def assert_bad_utf8(&block) + warnings = CaptureWarnings.capture(&block) + assert_equal 1, warnings.length + assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0]) + end + + def bad_utf8_string + str = "\x80" + assert_false str.valid_encoding? + str + end + + include Utf8Test +end + +# This test doesn't work in JRuby because JRuby appears to have a bug where +# the "valid" bit on a string's data is not invalidated properly when the +# string is modified: https://github.com/jruby/jruby/issues/8316 +if !defined? JRUBY_VERSION + # Tests the case of string objects that are marked UTF-8, and initially contain + # valid UTF-8, but are later modified to be invalid UTF-8. This may put the + # string into an state of "unknown" validity. + # + # For now these only warn, but in the next major version they will throw an + # exception. + class MarkedModifiedUtf8Test < Test::Unit::TestCase + def assert_bad_utf8(&block) + warnings = CaptureWarnings.capture(&block) + assert_equal 1, warnings.length + assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0]) + end + + def bad_utf8_string + str = " " + assert_true str.valid_encoding? + str[0] = "\x80" + str + end + + include Utf8Test + end +end + +# Tests the case of string objects that are marked with a non-UTF-8 encoding, +# but contain invalid UTF-8. +# +# This case will raise Encoding::UndefinedConversionError. +class MarkedNonUtf8Test < Test::Unit::TestCase + def assert_bad_utf8 + assert_raises(Encoding::UndefinedConversionError) { yield } + end + + def bad_utf8_string + str = "\x80".force_encoding(Encoding::ASCII_8BIT) + assert_true str.valid_encoding? + str + end + + include Utf8Test +end + +# Tests the case of string objects that are marked with a non-UTF-8 encoding, +# but are invalid even in their source encoding. +# +# This case will raise Encoding::InvalidByteSequenceError +class MarkedNonUtf8Test < Test::Unit::TestCase + def assert_bad_utf8(&block) + assert_raises(Encoding::InvalidByteSequenceError, &block) + end + + def bad_utf8_string + str = "\x80".force_encoding(Encoding::ASCII) + assert_false str.valid_encoding? + str + end + + include Utf8Test +end