Merge pull request #30 from HTMLParseErrorWG/before-head-errs

Parse errors for the insertion modes that have the word "head" in their names
inikulin · Dec 10, 2017 · db23e29
2 parents 41f08b5 + a3e2c2b
commit db23e29
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 18 deletions.
diff --git a/lib/common/error_codes.js b/lib/common/error_codes.js
@@ -51,5 +51,13 @@ module.exports = {
     invalidCharacterSequenceAfterDoctypeName: 'invalid-character-sequence-after-doctype-name',
     duplicateAttribute: 'duplicate-attribute',
     nonConformingDoctype: 'non-conforming-doctype',
-    missingDoctype: 'missing-doctype'
+    missingDoctype: 'missing-doctype',
+    misplacedDoctype: 'misplaced-doctype',
+    endTagWithoutMatchingOpenElement: 'end-tag-without-matching-open-element',
+    closingOfElementWithOpenChildElements: 'closing-of-element-with-open-child-elements',
+    disallowedContentInNoscriptInHead: 'disallowed-content-in-noscript-in-head',
+    openElementsLeftAfterEof: 'open-elements-left-after-eof',
+    abandonedHeadElementChild: 'abandoned-head-element-child',
+    misplacedStartTagForHeadElement: 'misplaced-start-tag-for-head-element',
+    nestedNoscriptInHead: 'nested-noscript-in-head'
 };
diff --git a/lib/extensions/location_info/tokenizer_mixin.js b/lib/extensions/location_info/tokenizer_mixin.js
@@ -99,9 +99,17 @@ LocationInfoTokenizerMixin.prototype._getOverriddenMethods = function (mxn, orig
                 this.currentCharacterToken.location.endOffset = ctLoc.startOffset;
             }
 
-            ctLoc.endLine = mxn.posTracker.line;
-            ctLoc.endCol = mxn.posTracker.col + 1;
-            ctLoc.endOffset = mxn.posTracker.offset + 1;
+            if (this.currentToken.type === Tokenizer.EOF_TOKEN) {
+                ctLoc.endLine = ctLoc.startLine;
+                ctLoc.endCol = ctLoc.startCol;
+                ctLoc.endOffset = ctLoc.startOffset;
+            }
+
+            else {
+                ctLoc.endLine = mxn.posTracker.line;
+                ctLoc.endCol = mxn.posTracker.col + 1;
+                ctLoc.endOffset = mxn.posTracker.offset + 1;
+            }
 
             orig._emitCurrentToken.call(this);
         },

diff --git a/lib/parser/index.js b/lib/parser/index.js
@@ -112,7 +112,7 @@ _[BEFORE_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] =
 _[BEFORE_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenBeforeHead;
 _[BEFORE_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = ignoreToken;
 _[BEFORE_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment;
-_[BEFORE_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken;
+_[BEFORE_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = misplacedDoctype;
 _[BEFORE_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagBeforeHead;
 _[BEFORE_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagBeforeHead;
 _[BEFORE_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenBeforeHead;
@@ -122,7 +122,7 @@ _[IN_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] =
 _[IN_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInHead;
 _[IN_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters;
 _[IN_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment;
-_[IN_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken;
+_[IN_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = misplacedDoctype;
 _[IN_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagInHead;
 _[IN_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagInHead;
 _[IN_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenInHead;
@@ -132,7 +132,7 @@ _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.CHARACTER_TOKEN] =
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenInHeadNoScript;
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters;
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.COMMENT_TOKEN] = appendComment;
-_[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken;
+_[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.DOCTYPE_TOKEN] = misplacedDoctype;
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.START_TAG_TOKEN] = startTagInHeadNoScript;
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.END_TAG_TOKEN] = endTagInHeadNoScript;
 _[IN_HEAD_NO_SCRIPT_MODE][Tokenizer.EOF_TOKEN] = tokenInHeadNoScript;
@@ -142,7 +142,7 @@ _[AFTER_HEAD_MODE][Tokenizer.CHARACTER_TOKEN] =
 _[AFTER_HEAD_MODE][Tokenizer.NULL_CHARACTER_TOKEN] = tokenAfterHead;
 _[AFTER_HEAD_MODE][Tokenizer.WHITESPACE_CHARACTER_TOKEN] = insertCharacters;
 _[AFTER_HEAD_MODE][Tokenizer.COMMENT_TOKEN] = appendComment;
-_[AFTER_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = ignoreToken;
+_[AFTER_HEAD_MODE][Tokenizer.DOCTYPE_TOKEN] = misplacedDoctype;
 _[AFTER_HEAD_MODE][Tokenizer.START_TAG_TOKEN] = startTagAfterHead;
 _[AFTER_HEAD_MODE][Tokenizer.END_TAG_TOKEN] = endTagAfterHead;
 _[AFTER_HEAD_MODE][Tokenizer.EOF_TOKEN] = tokenAfterHead;
@@ -1025,6 +1025,10 @@ function ignoreToken() {
     //NOTE: do nothing =)
 }
 
+function misplacedDoctype(p) {
+    p._err(ERR.misplacedDoctype);
+}
+
 function appendComment(p, token) {
     p._appendCommentNode(token, p.openElements.currentTmplContent || p.openElements.current);
 }
@@ -1117,6 +1121,8 @@ function endTagBeforeHead(p, token) {
 
     if (tn === $.HEAD || tn === $.BODY || tn === $.HTML || tn === $.BR)
         tokenBeforeHead(p, token);
+    else
+        p._err(ERR.endTagWithoutMatchingOpenElement);
 }
 
 function tokenBeforeHead(p, token) {
@@ -1167,7 +1173,10 @@ function startTagInHead(p, token) {
         p._pushTmplInsertionMode(IN_TEMPLATE_MODE);
     }
 
-    else if (tn !== $.HEAD)
+    else if (tn === $.HEAD)
+        p._err(ERR.misplacedStartTagForHeadElement);
+
+    else
         tokenInHead(p, token);
 }
 
@@ -1182,13 +1191,25 @@ function endTagInHead(p, token) {
     else if (tn === $.BODY || tn === $.BR || tn === $.HTML)
         tokenInHead(p, token);
 
-    else if (tn === $.TEMPLATE && p.openElements.tmplCount > 0) {
-        p.openElements.generateImpliedEndTags();
-        p.openElements.popUntilTagNamePopped($.TEMPLATE);
-        p.activeFormattingElements.clearToLastMarker();
-        p._popTmplInsertionMode();
-        p._resetInsertionMode();
+    else if (tn === $.TEMPLATE) {
+        if (p.openElements.tmplCount > 0) {
+            p.openElements.generateImpliedEndTagsThoroughly();
+
+            if (p.openElements.currentTagName !== $.TEMPLATE)
+                p._err(ERR.closingOfElementWithOpenChildElements);
+
+            p.openElements.popUntilTagNamePopped($.TEMPLATE);
+            p.activeFormattingElements.clearToLastMarker();
+            p._popTmplInsertionMode();
+            p._resetInsertionMode();
+        }
+
+        else
+            p._err(ERR.endTagWithoutMatchingOpenElement);
     }
+
+    else
+        p._err(ERR.endTagWithoutMatchingOpenElement);
 }
 
 function tokenInHead(p, token) {
@@ -1206,10 +1227,14 @@ function startTagInHeadNoScript(p, token) {
     if (tn === $.HTML)
         startTagInBody(p, token);
 
-    else if (tn === $.BASEFONT || tn === $.BGSOUND || tn === $.LINK || tn === $.META || tn === $.NOFRAMES || tn === $.STYLE)
+    else if (tn === $.BASEFONT || tn === $.BGSOUND || tn === $.HEAD || tn === $.LINK ||
+             tn === $.META || tn === $.NOFRAMES || tn === $.STYLE)
         startTagInHead(p, token);
 
-    else if (tn !== $.HEAD && tn !== $.NOSCRIPT)
+    else if (tn === $.NOSCRIPT)
+        p._err(ERR.nestedNoscriptInHead);
+
+    else
         tokenInHeadNoScript(p, token);
 }
 
@@ -1223,9 +1248,17 @@ function endTagInHeadNoScript(p, token) {
 
     else if (tn === $.BR)
         tokenInHeadNoScript(p, token);
+
+    else
+        p._err(ERR.endTagWithoutMatchingOpenElement);
 }
 
 function tokenInHeadNoScript(p, token) {
+    var errCode = token.type === Tokenizer.EOF_TOKEN ?
+                  ERR.openElementsLeftAfterEof :
+                  ERR.disallowedContentInNoscriptInHead;
+
+    p._err(errCode);
     p.openElements.pop();
     p.insertionMode = IN_HEAD_MODE;
     p._processToken(token);
@@ -1252,12 +1285,16 @@ function startTagAfterHead(p, token) {
 
     else if (tn === $.BASE || tn === $.BASEFONT || tn === $.BGSOUND || tn === $.LINK || tn === $.META ||
              tn === $.NOFRAMES || tn === $.SCRIPT || tn === $.STYLE || tn === $.TEMPLATE || tn === $.TITLE) {
+        p._err(ERR.abandonedHeadElementChild);
         p.openElements.push(p.headElement);
         startTagInHead(p, token);
         p.openElements.remove(p.headElement);
     }
 
-    else if (tn !== $.HEAD)
+    else if (tn === $.HEAD)
+        p._err(ERR.misplacedStartTagForHeadElement);
+
+    else
         tokenAfterHead(p, token);
 }
 
@@ -1269,6 +1306,9 @@ function endTagAfterHead(p, token) {
 
     else if (tn === $.TEMPLATE)
         endTagInHead(p, token);
+
+    else
+        p._err(ERR.endTagWithoutMatchingOpenElement);
 }
 
 function tokenAfterHead(p, token) {

diff --git a/lib/parser/open_element_stack.js b/lib/parser/open_element_stack.js
@@ -31,6 +31,34 @@ function isImpliedEndTagRequired(tn) {
     return false;
 }
 
+function isImpliedEndTagRequiredThoroughly(tn) {
+    switch (tn.length) {
+        case 1:
+            return tn === $.P;
+
+        case 2:
+            return tn === $.RB || tn === $.RP || tn === $.RT || tn === $.DD ||
+                   tn === $.DT || tn === $.LI || tn === $.TD || tn === $.TH || tn === $.TR;
+
+        case 3:
+            return tn === $.RTC;
+
+        case 5:
+            return tn === $.TBODY || tn === $.TFOOT || tn === $.THEAD;
+
+        case 6:
+            return tn === $.OPTION;
+
+        case 7:
+            return tn === $.CAPTION;
+
+        case 8:
+            return tn === $.OPTGROUP || tn === $.COLGROUP;
+    }
+
+    return false;
+}
+
 function isScopingElement(tn, ns) {
     switch (tn.length) {
         case 2:
@@ -389,6 +417,11 @@ OpenElementStack.prototype.generateImpliedEndTags = function () {
         this.pop();
 };
 
+OpenElementStack.prototype.generateImpliedEndTagsThoroughly = function () {
+    while (isImpliedEndTagRequiredThoroughly(this.currentTagName))
+        this.pop();
+};
+
 OpenElementStack.prototype.generateImpliedEndTagsWithExclusion = function (exclusionTagName) {
     while (isImpliedEndTagRequired(this.currentTagName) && this.currentTagName !== exclusionTagName)
         this.pop();

diff --git a/test/data/tree_construction/tests18.dat b/test/data/tree_construction/tests18.dat
@@ -56,6 +56,8 @@
 42: Bad start tag in “plaintext” in “head”.
 54: End of file seen and there were open elements.
 42: Unclosed element “plaintext”.
+#new-errors
+(1:32-1:43) disallowed-content-in-noscript-in-head
 #document
 | <!DOCTYPE html>
 | <html>