Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request from GHSA-22p3-qrh9-cx32
Browse files Browse the repository at this point in the history
* Make _iterate_over_text easier to read by using simple data structures

* Prefer a set of tags to ignore

In my tests, it's 4x faster to check for containment in a set of this size

* Add a stack size limit to _iterate_over_text

* Continue accepting the case where there is no body element

* Use an early return instead for None

Co-authored-by: Richard van der Hoff <[email protected]>
  • Loading branch information
reivilibre and richvdh authored Jun 28, 2022
1 parent 21e6c0e commit fa13080
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 24 deletions.
63 changes: 39 additions & 24 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import codecs
import itertools
import logging
import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, List, Optional, Set, Union

if TYPE_CHECKING:
from lxml import etree
Expand Down Expand Up @@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:

from lxml import etree

TAGS_TO_REMOVE = (
TAGS_TO_REMOVE = {
"header",
"nav",
"aside",
Expand All @@ -291,31 +290,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"img",
"picture",
etree.Comment,
)
}

# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)


def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"]
tree: Optional["etree.Element"],
tags_to_ignore: Set[Union[str, "etree.Comment"]],
stack_limit: int = 1024,
) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion,
skipping text nodes inside certain tags.
Args:
tree: The parent element to iterate. Can be None if there isn't one.
tags_to_ignore: Set of tags to ignore
stack_limit: Maximum stack size limit for depth-first traversal.
Nodes will be dropped if this limit is hit, which may truncate the
textual result.
Intended to limit the maximum working memory when generating a preview.
"""
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string

if tree is None:
return

# This is a stack whose items are elements to iterate over *or* strings
# to be returned.
elements = iter([tree])
while True:
el = next(elements, None)
if el is None:
return
elements: List[Union[str, "etree.Element"]] = [tree]
while elements:
el = elements.pop()

if isinstance(el, str):
yield el
Expand All @@ -329,17 +339,22 @@ def _iterate_over_text(
if el.text:
yield el.text

# We add to the stack all the elements children, interspersed with
# each child's tail text (if it exists). The tail text of a node
# is text that comes *after* the node, so we always include it even
# if we ignore the child node.
elements = itertools.chain(
itertools.chain.from_iterable( # Basically a flatmap
[child, child.tail] if child.tail else [child]
for child in el.iterchildren()
),
elements,
)
# We add to the stack all the element's children, interspersed with
# each child's tail text (if it exists).
#
# We iterate in reverse order so that earlier pieces of text appear
# closer to the top of the stack.
for child in el.iterchildren(reversed=True):
if len(elements) > stack_limit:
# We've hit our limit for working memory
break

if child.tail:
# The tail text of a node is text that comes *after* the node,
# so we always include it even if we ignore the child node.
elements.append(child.tail)

elements.append(child)


def summarize_paragraphs(
Expand Down
17 changes: 17 additions & 0 deletions tests/rest/media/v1/test_html_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,23 @@ def test_windows_1252(self) -> None:
og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})

def test_nested_nodes(self) -> None:
"""A body with some nested nodes. Tests that we iterate over children
in the right order (and don't reverse the order of the text)."""
html = b"""
<a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(
og,
{
"og:title": None,
"og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
},
)


class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None:
Expand Down

0 comments on commit fa13080

Please sign in to comment.