Skip to content

Commit

Permalink
XCOMMONS-2568: Improve comment handling in HTMLCleaner (#306)
Browse files Browse the repository at this point in the history
  • Loading branch information
michitux authored Nov 16, 2022
1 parent a2f20a6 commit 8ff1a9d
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom.Comment;
import org.jdom.DocType;
import org.jdom.Element;
import org.jdom.input.DOMBuilder;
Expand Down Expand Up @@ -179,6 +180,22 @@ protected void printElement(Writer out, Element element, int level, NamespaceSta
currentFormat.setExpandEmptyElements(currentFormatPolicy);
}
}

@Override
protected void printComment(Writer out, Comment comment) throws IOException
{
String commentText = comment.getText();

// TODO: remove this again when https://sourceforge.net/p/htmlcleaner/bugs/234/ has been fixed.
// Make sure that the comment text conforms to the HTML specification, in particular: "Optionally, text,
// with the additional restriction that the text must not start with the string ">", nor start with the
// string "->", nor contain the strings "<!--", "-->", or "--!>", nor end with the string "<!-"."
while (commentText.startsWith(">") || commentText.startsWith("->")) {
commentText = commentText.substring(1);
}

super.printComment(out, new Comment(commentText));
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,13 @@ private CleanerProperties getDefaultCleanerProperties(HTMLCleanerConfiguration c

defaultProperties.setDeserializeEntities(true);

// Omit comments in restricted mode to avoid any potential parser confusion.
// Any part of the filtered HTML that contains unfiltered input is potentially dangerous/a candidate for
// parser confusion. Comments, style and script elements seem to be frequently found ingredients in successful
// attacks against good sanitizers. We're already removing style and script elements, so removing comments
// seems like a good defense against future attacks.
defaultProperties.setOmitComments(isRestricted(configuration));

return defaultProperties;
}

Expand Down Expand Up @@ -314,8 +321,7 @@ private TrimAttributeCleanerTransformations getDefaultCleanerTransformations(HTM
defaultTransformations.addTransformation(tt);
}

String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
if ("true".equalsIgnoreCase(restricted)) {
if (isRestricted(configuration)) {

tt = new TagTransformation(HTMLConstants.TAG_SCRIPT, HTMLConstants.TAG_PRE, false);
defaultTransformations.addTransformation(tt);
Expand All @@ -336,6 +342,16 @@ private boolean isHTML5(HTMLCleanerConfiguration configuration)
return getHTMLVersion(configuration) == 5;
}

/**
* @param configuration the configuration to parse
* @return if the parsing should happen in restricted mode
*/
private boolean isRestricted(HTMLCleanerConfiguration configuration)
{
String restricted = configuration.getParameters().get(HTMLCleanerConfiguration.RESTRICTED);
return "true".equalsIgnoreCase(restricted);
}

/**
* @param configuration The configuration to parse.
* @return The HTML version specified in the configuration.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xwiki.component.manager.ComponentManager;
Expand Down Expand Up @@ -327,6 +329,44 @@ void restrictedAttributesAndTags() throws Exception
+ "</mi></math>");
}

/**
* Verify comment handling in restricted mode.
*/
@ParameterizedTest
@CsvSource({
"<p><strong>Hello World</strong></p>,<strong>Hello <!-- a comment --> World</strong>",
"'', <!--My favorite operators are > and <!-->",
// FIXME: Actually, just the comment should be removed but due to erroneous parsing in HTMLCleaner, the whole
// string is treated as a comment.
"'', <!--> <a href=\"#\">no comment</a>",
"'', <!---> <a href=\"#\">no comment</a>"
})
void restrictedComments(String expected, String actual)
{
Map<String, String> parameters = new HashMap<>(this.cleanerConfiguration.getParameters());
parameters.put("restricted", "true");
this.cleanerConfiguration.setParameters(parameters);

assertHTML(expected, actual);
}

@ParameterizedTest
@CsvSource({
"<!--My favorite operators are > and <!-->, <!--My favorite operators are > and <!-->",
"<!-- a comment ==!> not a comment-->, <!-- a comment --!> not a comment",
// FIXME: this is wrongly parsed as a full comment.
"<!-- <a foo=`bar`>not a comment</a>-->, <!--> <a foo=`bar`>not a comment</a>",
"<!--=>-->, <!--->",
// FIXME: according to the HTML specification, this should be a comment.
"'', <! fake comment >",
"<!-- <!== comment -->, <!-- <!-- comment -->",
"<!--My favorite operators are > and <!=-->, <!--My favorite operators are > and <!--->"
})
void comments(String expected, String actual)
{
assertHTML(expected, actual);
}

/**
* Verify that passing a fully-formed XHTML header works fine.
*/
Expand Down

0 comments on commit 8ff1a9d

Please sign in to comment.