Skip to content

Commit

Permalink
dates: Add and use fuzzy skeleton matching
Browse files Browse the repository at this point in the history
  • Loading branch information
akx committed Jan 23, 2016
1 parent 796c3d2 commit cd70395
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 12 deletions.
114 changes: 107 additions & 7 deletions babel/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@ def format_time(time=None, format='medium', tzinfo=None, locale=LC_TIME):
return parse_pattern(format).apply(time, locale)


def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
def format_skeleton(skeleton, datetime=None, tzinfo=None, fuzzy=True, locale=LC_TIME):
r"""Return a time and/or date formatted according to the given pattern.
The skeletons are defined in the CLDR data and provide more flexibility
Expand All @@ -754,6 +754,12 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
u'dim. 1 avr.'
>>> format_skeleton('MMMEd', t, locale='en')
u'Sun, Apr 1'
>>> format_skeleton('yMMd', t, locale='fi') # yMMd is not in the Finnish locale; yMd gets used
u'1.4.2007'
>>> format_skeleton('yMMd', t, fuzzy=False, locale='fi') # yMMd is not in the Finnish locale, an error is thrown
Traceback (most recent call last):
...
KeyError: yMMd
After the skeleton is resolved to a pattern `format_datetime` is called so
all timezone processing etc is the same as for that.
Expand All @@ -762,9 +768,13 @@ def format_skeleton(skeleton, datetime=None, tzinfo=None, locale=LC_TIME):
:param datetime: the ``time`` or ``datetime`` object; if `None`, the current
time in UTC is used
:param tzinfo: the time-zone to apply to the time for display
:param fuzzy: If the skeleton is not found, allow choosing a skeleton that's
close enough to it.
:param locale: a `Locale` object or a locale identifier
"""
locale = Locale.parse(locale)
if fuzzy and skeleton not in locale.datetime_skeletons:
skeleton = match_skeleton(skeleton, locale.datetime_skeletons)
format = locale.datetime_skeletons[skeleton]
return format_datetime(datetime, format, tzinfo, locale)

Expand Down Expand Up @@ -905,7 +915,7 @@ def _format_fallback_interval(start, end, skeleton, tzinfo, locale):
)


def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
def format_interval(start, end, skeleton=None, tzinfo=None, fuzzy=True, locale=LC_TIME):
"""
Format an interval between two instants according to the locale's rules.
Expand All @@ -926,10 +936,23 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
>>> format_interval(time(16, 18), time(16, 18), "Hm", locale="it")
'16:18'
Unknown skeletons fall back to "default" formatting.
>>> format_interval(date(2015, 1, 1), date(2017, 1, 1), "wzq", locale="ja")
'2015/01/01\uff5e2017/01/01'
>>> format_interval(time(16, 18), time(16, 24), "xxx", locale="ja")
'16:18:00\uff5e16:24:00'
>>> format_interval(date(2016, 1, 15), date(2016, 1, 17), "xxx", locale="de")
'15.01.2016 \u2013 17.01.2016'
:param start: First instant (datetime/date/time)
:param end: Second instant (datetime/date/time)
:param skeleton: The "skeleton format" to use for formatting.
:param tzinfo: tzinfo to use (if none is already attached)
:param fuzzy: If the skeleton is not found, allow choosing a skeleton that's
close enough to it.
:param locale: A locale object or identifier.
:return: Formatted interval
"""
Expand All @@ -942,19 +965,26 @@ def format_interval(start, end, skeleton, tzinfo=None, locale=LC_TIME):
# > starting in the current locale and then following the locale fallback
# > chain up to, but not including root.

if skeleton not in locale.interval_formats:
interval_formats = locale.interval_formats

if skeleton not in interval_formats or not skeleton:
# > If no match was found from the previous step, check what the closest
# > match is in the fallback locale chain, as in availableFormats. That
# > is, this allows for adjusting the string value field's width,
# > including adjusting between "MMM" and "MMMM", and using different
# > variants of the same field, such as 'v' and 'z'.
# TODO: Implement closest-match instead of immediately falling back
return _format_fallback_interval(start, end, skeleton, tzinfo, locale)
if skeleton and fuzzy:
skeleton = match_skeleton(skeleton, interval_formats)
else:
skeleton = None
if not skeleton: # Still no match whatsoever?
# > Otherwise, format the start and end datetime using the fallback pattern.
return _format_fallback_interval(start, end, skeleton, tzinfo, locale)

skel_formats = locale.interval_formats[skeleton]
skel_formats = interval_formats[skeleton]

if start == end:
return format_skeleton(skeleton, start, tzinfo, locale)
return format_skeleton(skeleton, start, tzinfo, fuzzy=fuzzy, locale=locale)

start = _ensure_datetime_tzinfo(_get_datetime(start), tzinfo=tzinfo)
end = _ensure_datetime_tzinfo(_get_datetime(end), tzinfo=tzinfo)
Expand Down Expand Up @@ -1489,3 +1519,73 @@ def split_interval_pattern(pattern):
parts[-1].append((tok_type, tok_value))

return [untokenize_pattern(tokens) for tokens in parts]


def match_skeleton(skeleton, options, allow_different_fields=False):
"""
Find the closest match for the given datetime skeleton among the options given.
This uses the rules outlined in the TR35 document.
>>> match_skeleton('yMMd', ('yMd', 'yMMMd'))
'yMd'
>>> match_skeleton('yMMd', ('jyMMd',), allow_different_fields=True)
'jyMMd'
>>> match_skeleton('yMMd', ('qyMMd',), allow_different_fields=False)
>>> match_skeleton('hmz', ('hmv',))
'hmv'
:param skeleton: The skeleton to match
:type skeleton: str
:param options: An iterable of other skeletons to match against
:type options: Iterable[str]
:return: The closest skeleton match, or if no match was found, None.
:rtype: str|None
"""

# TODO: maybe implement pattern expansion?

# Based on the implementation in
# http://source.icu-project.org/repos/icu/icu4j/trunk/main/classes/core/src/com/ibm/icu/text/DateIntervalInfo.java

# Filter out falsy values and sort for stability; when `interval_formats` is passed in, there may be a None key.
options = sorted(option for option in options if option)

if 'z' in skeleton and not any('z' in option for option in options):
skeleton = skeleton.replace('z', 'v')

get_input_field_width = dict(t[1] for t in tokenize_pattern(skeleton) if t[0] == "field").get
best_skeleton = None
best_distance = None
for option in options:
get_opt_field_width = dict(t[1] for t in tokenize_pattern(option) if t[0] == "field").get
distance = 0
for field in PATTERN_CHARS:
input_width = get_input_field_width(field, 0)
opt_width = get_opt_field_width(field, 0)
if input_width == opt_width:
continue
if opt_width == 0 or input_width == 0:
if not allow_different_fields: # This one is not okay
option = None
break
distance += 0x1000 # Magic weight constant for "entirely different fields"
elif field == 'M' and ((input_width > 2 and opt_width <= 2) or (input_width <= 2 and opt_width > 2)):
distance += 0x100 # Magic weight for "text turns into a number"
else:
distance += abs(input_width - opt_width)

if not option: # We lost the option along the way (probably due to "allow_different_fields")
continue

if not best_skeleton or distance < best_distance:
best_skeleton = option
best_distance = distance

if distance == 0: # Found a perfect match!
break

return best_skeleton
17 changes: 12 additions & 5 deletions tests/test_date_intervals.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,25 @@


def test_format_interval_same_instant_1():
assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", locale="fi") == "8. tammikuuta 2016"
assert dates.format_interval(TEST_DT, TEST_DT, "yMMMd", fuzzy=False, locale="fi") == "8. tammikuuta 2016"


def test_format_interval_same_instant_2():
assert dates.format_interval(TEST_DT, TEST_DT, "xxx", locale="fi") == "8.1.2016 klo 11.46.15"
assert dates.format_interval(TEST_DT, TEST_DT, "xxx", fuzzy=False, locale="fi") == "8.1.2016 klo 11.46.15"


def test_format_interval_same_instant_3():
assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", locale="fi") == "11.46.15"
assert dates.format_interval(TEST_TIME, TEST_TIME, "xxx", fuzzy=False, locale="fi") == "11.46.15"


def test_format_interval_same_instant_4():
assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", locale="fi") == "8.1.2016"
assert dates.format_interval(TEST_DATE, TEST_DATE, "xxx", fuzzy=False, locale="fi") == "8.1.2016"


def test_format_interval_no_difference():
t1 = TEST_DT
t2 = t1 + datetime.timedelta(minutes=8)
assert dates.format_interval(t1, t2, "yMd", locale="fi") == "8.1.2016"
assert dates.format_interval(t1, t2, "yMd", fuzzy=False, locale="fi") == "8.1.2016"


def test_format_interval_in_tz():
Expand All @@ -45,3 +45,10 @@ def test_format_interval_12_hour():
t2 = TEST_DT
t1 = t2 - datetime.timedelta(hours=1)
assert dates.format_interval(t1, t2, "hm", locale="en") == "10:46 \u2013 11:46 AM"


def test_format_interval_invalid_skeleton():
t1 = TEST_DATE
t2 = TEST_DATE + datetime.timedelta(days=1)
assert dates.format_interval(t1, t2, "mumumu", fuzzy=False, locale="fi") == u"8.1.2016\u20139.1.2016"
assert dates.format_interval(t1, t2, fuzzy=False, locale="fi") == u"8.1.2016\u20139.1.2016"

0 comments on commit cd70395

Please sign in to comment.