From 37949d20920225972b51f4cc69c8fbec775cc9fd Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 10 Feb 2021 09:51:44 +0100 Subject: [PATCH] Assume symbols are either ASCII or UTF-8 Right now only US-ASCII are properly preserved. Any other encoding comes back with `ASCII-8BIT` (binary) encoding. After this change UTF-8 symbols are properly preserved as well. Other encoding cause an EncodingError. Since UTF-8 is the only encoding handled by msgpack strings, I think this change makes sense. --- ext/msgpack/unpacker.c | 2 +- lib/msgpack/symbol.rb | 9 ++++++- spec/factory_spec.rb | 61 ++++++++++++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 16 deletions(-) diff --git a/ext/msgpack/unpacker.c b/ext/msgpack/unpacker.c index 0d396c78..0cb20f9c 100644 --- a/ext/msgpack/unpacker.c +++ b/ext/msgpack/unpacker.c @@ -286,7 +286,7 @@ static inline int read_raw_body_begin(msgpack_unpacker_t* uk, int raw_type) if(length <= msgpack_buffer_top_readable_size(UNPACKER_BUFFER_(uk))) { int ret; if ((uk->optimized_symbol_ext_type && uk->symbol_ext_type == raw_type) || (uk->symbolize_keys && is_reading_map_key(uk))) { - VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type == RAW_TYPE_STRING); + VALUE symbol = msgpack_buffer_read_top_as_symbol(UNPACKER_BUFFER_(uk), length, raw_type != RAW_TYPE_BINARY); ret = object_complete_symbol(uk, symbol); } else { /* don't use zerocopy for hash keys but get a frozen string directly diff --git a/lib/msgpack/symbol.rb b/lib/msgpack/symbol.rb index 5f98c786..ebe77a29 100644 --- a/lib/msgpack/symbol.rb +++ b/lib/msgpack/symbol.rb @@ -14,6 +14,13 @@ def self.from_msgpack_ext(data) # The canonical way to do it for symbols would be: # data.unpack1('A*').to_sym # However in this instance we can take a shortcut - data.to_sym + + # We assume the string encoding is UTF-8, and let Ruby create either + # an ASCII symbol or UTF-8 symbol. + data.force_encoding(Encoding::UTF_8).to_sym + rescue EncodingError + # If somehow the string wasn't valid UTF-8 not valid ASCII, we fallback + # to what has been the historical behavior of creating a binary symbol + data.force_encoding(Encoding::BINARY).to_sym end end diff --git a/spec/factory_spec.rb b/spec/factory_spec.rb index b3234666..40ba1a25 100644 --- a/spec/factory_spec.rb +++ b/spec/factory_spec.rb @@ -1,4 +1,3 @@ -# encoding: ascii-8bit require 'spec_helper' describe MessagePack::Factory do @@ -255,29 +254,25 @@ def to_msgpack_ext subject { factory.packer.pack(value).to_s } before { stub_const('Value', Class.new{ include Mod }) } let(:value) { Value.new } - it { is_expected.to eq "\xC7\x0F\x01value_msgpacked" } + it { is_expected.to eq "\xC7\x0F\x01value_msgpacked".force_encoding(Encoding::BINARY) } end describe "packing an object which has been extended by the module" do subject { factory.packer.pack(object).to_s } let(:object) { Object.new.extend Mod } - it { is_expected.to eq "\xC7\x0F\x01value_msgpacked" } + it { is_expected.to eq "\xC7\x0F\x01value_msgpacked".force_encoding(Encoding::BINARY) } end describe "unpacking with the module" do - subject { factory.unpacker.feed("\xC7\x06\x01module").unpack } + subject { factory.unpacker.feed("\xC7\x06\x01module".force_encoding(Encoding::BINARY)).unpack } it { is_expected.to eq "unpacked module" } end end end describe 'the special treatment of symbols with ext type' do - let(:packer) { subject.packer } - let(:unpacker) { subject.unpacker } - - def symbol_after_roundtrip - packed_symbol = packer.pack(:symbol).to_s - unpacker.feed(packed_symbol).unpack + def roundtrip(object) + subject.load(subject.dump(object)) end context 'using the optimized symbol unpacker' do @@ -293,13 +288,25 @@ def symbol_after_roundtrip end it 'lets symbols survive a roundtrip' do - expect(symbol_after_roundtrip).to be :symbol + expect(roundtrip(:symbol)).to be :symbol + end + + it 'preserves encoding for ASCII symbols' do + expect(:symbol.encoding).to be Encoding::US_ASCII + expect(roundtrip(:symbol)).to be :symbol + expect(roundtrip(:symbol).encoding).to be Encoding::US_ASCII + end + + it 'preserves encoding for UTF-8 symbols' do + expect(:"fée".encoding).to be Encoding::UTF_8 + expect(roundtrip(:"fée").encoding).to be Encoding::UTF_8 + expect(roundtrip(:"fée")).to be :"fée" end end context 'if no ext type is registered for symbols' do it 'converts symbols to string' do - expect(symbol_after_roundtrip).to eq 'symbol' + expect(roundtrip(:symbol)).to eq 'symbol' end end @@ -308,7 +315,33 @@ def symbol_after_roundtrip before { subject.register_type(0x00, ::Symbol) } it 'lets symbols survive a roundtrip' do - expect(symbol_after_roundtrip).to be :symbol + expect(roundtrip(:symbol)).to be :symbol + end + + it 'preserves encoding for ASCII symbols' do + expect(:symbol.encoding).to be Encoding::US_ASCII + expect(roundtrip(:symbol)).to be :symbol + expect(roundtrip(:symbol).encoding).to be Encoding::US_ASCII + end + + it 'preserves encoding for UTF-8 symbols' do + expect(:"fée".encoding).to be Encoding::UTF_8 + expect(roundtrip(:"fée")).to be :"fée" + expect(roundtrip(:"fée").encoding).to be Encoding::UTF_8 + end + + it 'does not handle symbols in other encodings' do + symbol = "fàe".encode(Encoding::ISO_8859_1).to_sym + expect(symbol.encoding).to be Encoding::ISO_8859_1 + + if IS_JRUBY + # JRuby doesn't quite behave like MRI here. + # "fàe".force_encoding(Encoding::BINARY).to_sym is able to lookup the existing ISO-8859-1 symbol + # It likely is a JRuby bug. + expect(roundtrip(symbol).encoding).to be Encoding::ISO_8859_1 + else + expect(roundtrip(symbol).encoding).to be Encoding::BINARY + end end end @@ -332,7 +365,7 @@ def from_msgpack_ext(data) before { subject.register_type(0x00, ::Symbol) } it 'lets symbols survive a roundtrip' do - expect(symbol_after_roundtrip).to be :symbol + expect(roundtrip(:symbol)).to be :symbol end after do