From f9d1690bf3933059afb27ee401f17e12b6adbd90 Mon Sep 17 00:00:00 2001 From: inikulin Date: Thu, 1 Jun 2017 00:04:44 +0300 Subject: [PATCH 1/5] non-unicode-character-in-input-stream -> surrogate-in-input-stream --- lib/common/error_codes.js | 2 +- lib/tokenizer/preprocessor.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js index 9689ca7fe..7eaf5d283 100644 --- a/lib/common/error_codes.js +++ b/lib/common/error_codes.js @@ -3,7 +3,7 @@ module.exports = { controlCharacterInInputStream: 'control-character-in-input-stream', undefinedCharacterInInputStream: 'undefined-character-in-input-stream', - nonUnicodeCharacterInInputStream: 'non-unicode-character-in-input-stream', + surrogateInInputStream: 'surrogate-in-input-stream', nonVoidHtmlElementStartTagWithTrailingSolidus: 'non-void-html-element-start-tag-with-trailing-solidus', endTagWithAttributes: 'end-tag-with-attributes', endTagWithTrailingSolidus: 'end-tag-with-trailing-solidus', diff --git a/lib/tokenizer/preprocessor.js b/lib/tokenizer/preprocessor.js index 65d8e107d..3dff8bc0c 100644 --- a/lib/tokenizer/preprocessor.js +++ b/lib/tokenizer/preprocessor.js @@ -60,7 +60,7 @@ Preprocessor.prototype._processSurrogate = function (cp) { } //NOTE: isolated surrogate - this._err(ERR.nonUnicodeCharacterInInputStream); + this._err(ERR.surrogateInInputStream); return cp; }; From 79bf8ed741c37b3a03bc2e5571d1819b2927804f Mon Sep 17 00:00:00 2001 From: inikulin Date: Thu, 1 Jun 2017 00:08:14 +0300 Subject: [PATCH 2/5] undefined-character-in-input-stream -> noncharacter-in-input-stream --- lib/common/error_codes.js | 2 +- lib/tokenizer/preprocessor.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js index 7eaf5d283..cb3350983 100644 --- a/lib/common/error_codes.js +++ b/lib/common/error_codes.js @@ -2,7 +2,7 @@ module.exports = { controlCharacterInInputStream: 'control-character-in-input-stream', - undefinedCharacterInInputStream: 'undefined-character-in-input-stream', + noncharacterInInputStream: 'noncharacter-in-input-stream', surrogateInInputStream: 'surrogate-in-input-stream', nonVoidHtmlElementStartTagWithTrailingSolidus: 'non-void-html-element-start-tag-with-trailing-solidus', endTagWithAttributes: 'end-tag-with-attributes', diff --git a/lib/tokenizer/preprocessor.js b/lib/tokenizer/preprocessor.js index 3dff8bc0c..cb02c9c21 100644 --- a/lib/tokenizer/preprocessor.js +++ b/lib/tokenizer/preprocessor.js @@ -141,7 +141,7 @@ Preprocessor.prototype._checkForProblematicCharacters = function (cp) { this._err(ERR.controlCharacterInInputStream); else if (unicode.isUndefinedCodePoint(cp)) - this._err(ERR.undefinedCharacterInInputStream); + this._err(ERR.noncharacterInInputStream); }; Preprocessor.prototype.retreat = function () { From ee88bc7bfc08c9c30718ba807c1841a295abae2f Mon Sep 17 00:00:00 2001 From: inikulin Date: Thu, 1 Jun 2017 00:12:00 +0300 Subject: [PATCH 3/5] undefined-character-reference -> noncharacter-character-reference --- lib/common/error_codes.js | 2 +- lib/tokenizer/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js index cb3350983..50317b07d 100644 --- a/lib/common/error_codes.js +++ b/lib/common/error_codes.js @@ -44,7 +44,7 @@ module.exports = { nullCharacterReference: 'null-character-reference', nonUnicodeCharacterReference: 'non-unicode-character-reference', controlCharacterReference: 'control-character-reference', - undefinedCharacterReference: 'undefined-character-reference', + noncharacterCharacterReference: 'noncharacter-character-reference', missingWhitespaceBeforeDoctypeName: 'missing-whitespace-before-doctype-name', missingDoctypeName: 'missing-doctype-name', invalidCharacterSequenceAfterDoctypeName: 'invalid-character-sequence-after-doctype-name', diff --git a/lib/tokenizer/index.js b/lib/tokenizer/index.js index 7c6aa8efc..dd6675ee7 100644 --- a/lib/tokenizer/index.js +++ b/lib/tokenizer/index.js @@ -2521,7 +2521,7 @@ _[NUMERIC_CHARACTER_REFERENCE_END_STATE] = function numericCharacterReferenceEnd } else if (unicode.isUndefinedCodePoint(this.charRefCode)) - this._err(ERR.undefinedCharacterReference); + this._err(ERR.noncharacterCharacterReference); else if (unicode.isControlCodePoint(this.charRefCode) || this.charRefCode === $.CARRIAGE_RETURN) { this._err(ERR.controlCharacterReference); From d66df99f525b645994983d491bb46bc59e697797 Mon Sep 17 00:00:00 2001 From: inikulin Date: Thu, 1 Jun 2017 00:15:32 +0300 Subject: [PATCH 4/5] non-unicode-character-reference -> surrogate-character-reference --- lib/common/error_codes.js | 2 +- lib/tokenizer/index.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js index 50317b07d..d4e3c8c6d 100644 --- a/lib/common/error_codes.js +++ b/lib/common/error_codes.js @@ -42,7 +42,7 @@ module.exports = { eofInCdata: 'eof-in-cdata', absenceOfDigitsInNumericCharacterReference: 'absence-of-digits-in-numeric-character-reference', nullCharacterReference: 'null-character-reference', - nonUnicodeCharacterReference: 'non-unicode-character-reference', + surrogateCharacterReference: 'surrogate-character-reference', controlCharacterReference: 'control-character-reference', noncharacterCharacterReference: 'noncharacter-character-reference', missingWhitespaceBeforeDoctypeName: 'missing-whitespace-before-doctype-name', diff --git a/lib/tokenizer/index.js b/lib/tokenizer/index.js index dd6675ee7..d2e689c88 100644 --- a/lib/tokenizer/index.js +++ b/lib/tokenizer/index.js @@ -2516,7 +2516,7 @@ _[NUMERIC_CHARACTER_REFERENCE_END_STATE] = function numericCharacterReferenceEnd } else if (unicode.isNonUnicodeCodePoint(this.charRefCode)) { - this._err(ERR.nonUnicodeCharacterReference); + this._err(ERR.surrogateCharacterReference); this.charRefCode = $.REPLACEMENT_CHARACTER; } From 5d2483fb5e78db600c629d092b3aeac238bd9b1f Mon Sep 17 00:00:00 2001 From: inikulin Date: Thu, 1 Jun 2017 00:29:35 +0300 Subject: [PATCH 5/5] character-reference-outside-unicode-range --- lib/common/error_codes.js | 1 + lib/common/unicode.js | 6 +----- lib/tokenizer/index.js | 7 ++++++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js index d4e3c8c6d..6f4027a19 100644 --- a/lib/common/error_codes.js +++ b/lib/common/error_codes.js @@ -43,6 +43,7 @@ module.exports = { absenceOfDigitsInNumericCharacterReference: 'absence-of-digits-in-numeric-character-reference', nullCharacterReference: 'null-character-reference', surrogateCharacterReference: 'surrogate-character-reference', + characterReferenceOutsideUnicodeRange: 'character-reference-outside-unicode-range', controlCharacterReference: 'control-character-reference', noncharacterCharacterReference: 'noncharacter-character-reference', missingWhitespaceBeforeDoctypeName: 'missing-whitespace-before-doctype-name', diff --git a/lib/common/unicode.js b/lib/common/unicode.js index 4f1821772..5c8a2708c 100644 --- a/lib/common/unicode.js +++ b/lib/common/unicode.js @@ -55,7 +55,7 @@ exports.CODE_POINT_SEQUENCES = { //Surrogates -var isSurrogate = exports.isSurrogate = function (cp) { +exports.isSurrogate = function (cp) { return cp >= 0xD800 && cp <= 0xDFFF; }; @@ -77,7 +77,3 @@ exports.isControlCodePoint = function (cp) { exports.isUndefinedCodePoint = function (cp) { return cp >= 0xFDD0 && cp <= 0xFDEF || UNDEFINED_CODE_POINTS.indexOf(cp) > -1; }; - -exports.isNonUnicodeCodePoint = function (cp) { - return isSurrogate(cp) || cp > 0x10FFFF; -}; diff --git a/lib/tokenizer/index.js b/lib/tokenizer/index.js index d2e689c88..4fd0123f7 100644 --- a/lib/tokenizer/index.js +++ b/lib/tokenizer/index.js @@ -2515,7 +2515,12 @@ _[NUMERIC_CHARACTER_REFERENCE_END_STATE] = function numericCharacterReferenceEnd this.charRefCode = $.REPLACEMENT_CHARACTER; } - else if (unicode.isNonUnicodeCodePoint(this.charRefCode)) { + else if (this.charRefCode > 0x10FFFF) { + this._err(ERR.characterReferenceOutsideUnicodeRange); + this.charRefCode = $.REPLACEMENT_CHARACTER; + } + + else if (unicode.isSurrogate(this.charRefCode)) { this._err(ERR.surrogateCharacterReference); this.charRefCode = $.REPLACEMENT_CHARACTER; }