Skip to content

Commit

Permalink
replace lxml.html.Cleaner (#104)
Browse files Browse the repository at this point in the history
* replace lxml.html.Cleaner

* lint code

* syntax

* re-activate test

* clean code
  • Loading branch information
adbar authored Oct 16, 2023
1 parent 567189b commit 5ba8f70
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 41 deletions.
6 changes: 3 additions & 3 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@
THREE_COMP_REGEX_B,
TWO_COMP_REGEX,
)
from .settings import CACHE_SIZE, HTML_CLEANER, MAX_POSSIBLE_CANDIDATES
from .utils import load_html
from .settings import CACHE_SIZE, CLEANING_LIST, MAX_POSSIBLE_CANDIDATES
from .utils import clean_html, load_html
from .validators import (
check_extracted_reference,
compare_values,
Expand Down Expand Up @@ -1068,7 +1068,7 @@ def find_date(

# clean before string search
try:
cleaned_html = HTML_CLEANER.clean_html(tree)
cleaned_html = clean_html(tree, CLEANING_LIST)
# rare LXML error: no NULL bytes or control characters
except ValueError: # pragma: no cover
cleaned_html = tree
Expand Down
59 changes: 21 additions & 38 deletions htmldate/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@

from datetime import datetime

from lxml.html.clean import Cleaner # type: ignore


# Function cache
CACHE_SIZE: int = 8192

Expand All @@ -25,38 +22,24 @@
# set an upper limit to the number of candidates
MAX_POSSIBLE_CANDIDATES: int = 1000

# HTML_CLEANER config
# https://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER: Cleaner = Cleaner(
annoying_tags=False,
comments=False,
embedded=True, # affects recall?
forms=False,
frames=True,
javascript=False,
links=False,
meta=False,
page_structure=True,
processing_instructions=False,
remove_unknown_tags=False,
safe_attrs_only=False,
scripts=False,
style=False,
kill_tags=[
"applet",
"audio",
"canvas",
"datalist",
"embed",
"figure",
"label",
"map",
"math",
"object",
"picture",
"rdf",
"svg",
"video",
],
)
# Tag names of the elements deleted from the tree (via utils.clean_html)
# before the string search in find_date.
CLEANING_LIST = [
"applet",
"audio",
"canvas",
"datalist",
"embed",
"frame",
"frameset",
"figure",
"label",
"map",
"math",
"noframes",
"object",
"picture",
"rdf",
"svg",
"track",
"video",
]
# NOTE(review): the tags below are not part of the list — presumably further
# candidates that were considered; confirm before enabling them.
# "iframe", "input", "layer", "param", "source"
10 changes: 10 additions & 0 deletions htmldate/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,13 @@ def load_html(htmlobject: Union[bytes, str, HtmlElement]) -> Optional[HtmlElemen
)
tree = None
return tree


def clean_html(tree: HtmlElement, elemlist: List[str]) -> HtmlElement:
    """Delete selected elements from the tree, in place.

    Args:
        tree: parsed document (or subtree) to prune.
        elemlist: tag names of the elements to delete, subtrees included.

    Returns:
        The same tree object that was passed in, after modification.
    """
    # Collect all matches before touching the tree: removing a node while
    # tree.iter() is still running can end the traversal early, e.g. when a
    # dropped element contains another match, so later matches would be
    # left in place.
    for element in list(tree.iter(elemlist)):
        if element.getparent() is None:
            # a parentless element (the root) cannot be detached from
            # anything, so it is left untouched instead of raising
            continue
        try:
            element.drop_tree()
        except AttributeError:  # pragma: no cover
            # element type without drop_tree(): plain removal from the parent
            element.getparent().remove(element)
    return tree

0 comments on commit 5ba8f70

Please sign in to comment.