Skip to content

Commit

Permalink
extraction patterns more strict (#103)
Browse files Browse the repository at this point in the history
* stricter extractors: discard xmlns attributes

* add further test case, make code stricter and simpler

* further specify German regex

* review test
  • Loading branch information
adbar authored Oct 13, 2023
1 parent a4f6b63 commit 567189b
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 22 deletions.
28 changes: 10 additions & 18 deletions htmldate/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from copy import deepcopy
from datetime import datetime
from functools import lru_cache, partial
from typing import Match, Optional, Pattern, Tuple, Union, Counter as Counter_Type
from typing import Match, Optional, Pattern, Union, Counter as Counter_Type

from lxml.html import HtmlElement, tostring # type: ignore

Expand Down Expand Up @@ -637,15 +637,13 @@ def examine_time_elements(
return None


def normalize_match(match: Optional[Match[str]]) -> Tuple[str, str]:
"""Normalize string output by adding "0" if necessary."""
day = match[1] # type: ignore[index]
if len(day) == 1:
day = "0" + day
month = match[2] # type: ignore[index]
if len(month) == 1:
month = "0" + month
return day, month
def normalize_match(match: Optional[Match[str]]) -> str:
    """Build a YYYY-MM-DD string from a regex match.

    Pads single-digit day/month components with a leading "0" and,
    when the year has only two digits, expands it to four (years
    starting with "9" map to 19xx, everything else to 20xx).
    """
    # alternation branches leave unused groups as None, so keep only
    # the populated ones; each component is zero-padded to two chars
    parts = [group.zfill(2) for group in match.groups() if group]  # type: ignore[union-attr]
    day, month, year = parts
    if len(year) == 2:
        century = "19" if year.startswith("9") else "20"
        year = century + year
    return f"{year}-{month}-{day}"


def search_page(
Expand Down Expand Up @@ -760,8 +758,7 @@ def search_page(
replacement = {}
for item in candidates:
match = THREE_COMP_REGEX_A.match(item)
day, month = normalize_match(match)
candidate = "-".join([match[3], month, day]) # type: ignore[index]
candidate = normalize_match(match)
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
# select
Expand Down Expand Up @@ -815,12 +812,7 @@ def search_page(
replacement = {}
for item in candidates:
match = THREE_COMP_REGEX_B.match(item)
day, month = normalize_match(match)
if match[3][0] == "9": # type: ignore[index]
year = "19" + match[3] # type: ignore[index]
else:
year = "20" + match[3] # type: ignore[index]
candidate = "-".join([year, month, day])
candidate = normalize_match(match)
replacement[candidate] = candidates[item]
candidates = Counter(replacement)
bestmatch = select_candidate(
Expand Down
12 changes: 8 additions & 4 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,17 @@
# use of regex module for speed?
TEXT_PATTERNS = re.compile(
r'(?:date[^0-9"]{,20}|updated|published) *?(?:in)? *?:? *?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand): ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r"(?:Datum|Stand|[Vv]eröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR
re.I,
)

# core patterns
THREE_COMP_REGEX_A = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{4})")
THREE_COMP_REGEX_B = re.compile(r"([0-3]?[0-9])[/.-]([01]?[0-9])[/.-]([0-9]{2})")
THREE_COMP_REGEX_B = re.compile(
r"([0-3]?[0-9])/([01]?[0-9])/([0-9]{2})|([0-3][0-9])[.-]([01][0-9])[.-]([0-9]{2})"
)
TWO_COMP_REGEX = re.compile(r"([0-3]?[0-9])[/.-]([0-9]{4})")

# extensive search patterns
Expand All @@ -205,13 +207,15 @@
r"(\D19[0-9]{2}[01][0-9][0-3][0-9]\D|\D20[0-9]{2}[01][0-9][0-3][0-9]\D)"
)
DATESTRINGS_CATCH = re.compile(r"([12][0-9]{3})([01][0-9])([0-3][0-9])")
SLASHES_PATTERN = re.compile(r"\D([0-3]?[0-9][/.][01]?[0-9][/.][0129][0-9])\D")
SLASHES_PATTERN = re.compile(
r"\D([0-3]?[0-9]/[01]?[0-9]/[0129][0-9]|[0-3][0-9]\.[01][0-9]\.[0129][0-9])\D"
)
SLASHES_YEAR = re.compile(r"([0-9]{2})$")
YYYYMM_PATTERN = re.compile(r"\D([12][0-9]{3}[/.-][01][0-9])\D")
YYYYMM_CATCH = re.compile(r"([12][0-9]{3})[/.-]([01][0-9])")
MMYYYY_PATTERN = re.compile(r"\D([01]?[0-9][/.-][12][0-9]{3})\D")
MMYYYY_YEAR = re.compile(r"([12][0-9]{3})\D?$")
SIMPLE_PATTERN = re.compile(r"\D(199[0-9]|20[0-9]{2})\D")
SIMPLE_PATTERN = re.compile(r"(?<!w3.org)\D(199[0-9]|20[0-9]{2})\D")


def discard_unwanted(tree: HtmlElement) -> Tuple[HtmlElement, List[HtmlElement]]:
Expand Down
26 changes: 26 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1529,6 +1529,26 @@ def test_search_html(original_date=False, min_date=MIN_DATE, max_date=LATEST_POS
)
== "2019-01-01"
)
assert (
search_page(
'<html><head><link xmlns="http://www.w3.org/1999/xhtml"/></head></html>',
OUTPUTFORMAT,
original_date,
min_date,
max_date,
)
is None
)
assert (
search_page(
'<html><body><link href="//homepagedesigner.telekom.de/.cm4all/res/static/beng-editor/5.1.98/css/deploy.css"/></body></html>',
OUTPUTFORMAT,
original_date,
min_date,
max_date,
)
is None
)


def test_idiosyncrasies():
Expand Down Expand Up @@ -1616,6 +1636,12 @@ def test_idiosyncrasies():
)
== "2020-05-05"
)
assert (
find_date(
"<html><body><p>veröffentlicht am 6.12.06</p></body></html>",
)
== "2006-12-06"
)


def test_parser():
Expand Down

0 comments on commit 567189b

Please sign in to comment.