Skip to content

Commit

Permalink
encoding: make TextDecoder handle BOM correctly
Browse files Browse the repository at this point in the history
Do not accept the BOM if it comes from a different encoding, and
only discard the BOM after it has actually been read (including
when it is spread over multiple chunks in streaming mode).

Fixes: nodejs#25315
  • Loading branch information
addaleax committed Nov 1, 2019
1 parent 6858c7e commit a30272b
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 34 deletions.
27 changes: 12 additions & 15 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -484,25 +484,22 @@ function makeTextDecoderJS() {
this[kFlags] |= CONVERTER_FLAGS_FLUSH;
}

if (!this[kBOMSeen] && !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
if (this[kEncoding] === 'utf-8') {
if (input.length >= 3 &&
input[0] === 0xEF && input[1] === 0xBB && input[2] === 0xBF) {
input = input.slice(3);
}
} else if (this[kEncoding] === 'utf-16le') {
if (input.length >= 2 && input[0] === 0xFF && input[1] === 0xFE) {
input = input.slice(2);
}
let result = this[kFlags] & CONVERTER_FLAGS_FLUSH ?
this[kHandle].end(input) :
this[kHandle].write(input);

if (result.length > 0 &&
!this[kBOMSeen] &&
!(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we discard it.
if (result[0] === '\ufeff') {
result = result.slice(1);
}
this[kBOMSeen] = true;
}

if (this[kFlags] & CONVERTER_FLAGS_FLUSH) {
return this[kHandle].end(input);
}

return this[kHandle].write(input);
return result;
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/node_buffer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,10 +219,10 @@ size_t Length(Local<Object> obj) {
}


inline MaybeLocal<Uint8Array> New(Environment* env,
Local<ArrayBuffer> ab,
size_t byte_offset,
size_t length) {
MaybeLocal<Uint8Array> New(Environment* env,
Local<ArrayBuffer> ab,
size_t byte_offset,
size_t length) {
CHECK(!env->buffer_prototype_object().IsEmpty());
Local<Uint8Array> ui = Uint8Array::New(ab, byte_offset, length);
Maybe<bool> mb =
Expand Down
37 changes: 27 additions & 10 deletions src/node_i18n.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ using v8::NewStringType;
using v8::Object;
using v8::ObjectTemplate;
using v8::String;
using v8::Uint8Array;
using v8::Value;

namespace i18n {
Expand Down Expand Up @@ -227,25 +228,41 @@ class ConverterObject : public BaseObject, Converter {
const char* source = input.data();
size_t source_length = input.length();

if (converter->unicode_ && !converter->ignoreBOM_ && !converter->bomSeen_) {
int32_t bomOffset = 0;
ucnv_detectUnicodeSignature(source, source_length, &bomOffset, &status);
source += bomOffset;
source_length -= bomOffset;
converter->bomSeen_ = true;
}

UChar* target = *result;
ucnv_toUnicode(converter->conv,
&target, target + (limit * sizeof(UChar)),
&source, source + source_length,
nullptr, flush, &status);

if (U_SUCCESS(status)) {
if (limit > 0)
bool omit_initial_bom = false;
if (limit > 0) {
result.SetLength(target - &result[0]);
if (result.length() > 0 &&
converter->unicode_ &&
!converter->ignoreBOM_ &&
!converter->bomSeen_) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we mark it for discarding.
if (result[0] == 0xFEFF) {
omit_initial_bom = true;
}
converter->bomSeen_ = true;
}
}
ret = ToBufferEndian(env, &result);
args.GetReturnValue().Set(ret.ToLocalChecked());
if (omit_initial_bom && !ret.IsEmpty()) {
// Peform `ret = ret.slice(2)`.
CHECK(ret.ToLocalChecked()->IsUint8Array());
Local<Uint8Array> orig_ret = ret.ToLocalChecked().As<Uint8Array>();
ret = Buffer::New(env,
orig_ret->Buffer(),
orig_ret->ByteOffset() + 2,
orig_ret->ByteLength() - 2)
.FromMaybe(Local<Uint8Array>());
}
if (!ret.IsEmpty())
args.GetReturnValue().Set(ret.ToLocalChecked());
return;
}

Expand Down
6 changes: 5 additions & 1 deletion src/node_internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,11 @@ v8::MaybeLocal<v8::Object> New(Environment* env,
char* data,
size_t length,
bool uses_malloc);

// Creates a Buffer instance over an existing Uint8Array.
v8::MaybeLocal<v8::Uint8Array> New(Environment* env,
v8::Local<v8::ArrayBuffer> ab,
size_t byte_offset,
size_t length);
// Construct a Buffer from a MaybeStackBuffer (and also its subclasses like
// Utf8Value and TwoByteValue).
// If |buf| is invalidated, an empty MaybeLocal is returned, and nothing is
Expand Down
5 changes: 1 addition & 4 deletions test/wpt/status/encoding.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@
"fail": "iso-2022-jp decoder state handling bug: https://encoding.spec.whatwg.org/#iso-2022-jp-decoder"
},
"textdecoder-byte-order-marks.any.js": {
"fail": "Mismatching BOM should not be ignored"
},
"textdecoder-copy.any.js": {
"fail": "Should not have output BOM: https://encoding.spec.whatwg.org/#concept-td-serialize"
"requires": ["small-icu"]
},
"textdecoder-fatal-single-byte.any.js": {
"requires": ["full-icu"],
Expand Down

0 comments on commit a30272b

Please sign in to comment.