From 10f34e450d643fcea018d2994fbce89fc1c7ac0f Mon Sep 17 00:00:00 2001 From: Derek Gulbranson Date: Fri, 31 Aug 2018 13:50:25 -0700 Subject: [PATCH] refactor prefix handling to correctly parse Portuguese prefixes #72 while continuing to support multiple names after a prefix #23 --- docs/customize.rst | 16 +++--- nameparser/config/prefixes.py | 10 +++- nameparser/parser.py | 101 ++++++++++++++++++---------------- tests.py | 48 ++++++++++++---- 4 files changed, 108 insertions(+), 67 deletions(-) diff --git a/docs/customize.rst b/docs/customize.rst index b4c45ca..46a60c9 100644 --- a/docs/customize.rst +++ b/docs/customize.rst @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below). Editable attributes of nameparser.config.CONSTANTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. -* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". -* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". -* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". -* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. -* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. -* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". 
-* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. +* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names. +* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David". +* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.". +* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.". +* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece. +* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name. +* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D". +* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc. Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning the constants for your project. These methods automatically lower case and diff --git a/nameparser/config/prefixes.py b/nameparser/config/prefixes.py index b2a9386..fbcc3f2 100644 --- a/nameparser/config/prefixes.py +++ b/nameparser/config/prefixes.py @@ -1,7 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece. 
+#: Name pieces that appear before a last name. Prefixes join to the piece +# that follows them to make one new piece. They can be chained together, e.g. +# "von der" and "de la". Because they only appear in middle or last names, +# they also signify that all following name pieces should be in the same name +# part, for example, "von" will be joined to all following pieces that are not +# prefixes or suffixes, allowing recognition of double last names when they +# appear after a prefix. So in "pennie von bergen wessels MD", "von" will +# join with all following name pieces until the suffix "MD", resulting in the +# correct parsing of the last name "von bergen wessels". PREFIXES = set([ 'abu', 'bin', diff --git a/nameparser/parser.py b/nameparser/parser.py index 0b45efe..1b20018 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -501,14 +501,6 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list += pieces[i+1:] break - if piece in self.prefix_joins: - last_piece = pieces[-1:][0] - if self.is_suffix(last_piece): - self.last_list += pieces[i:-1] - self.suffix = last_piece - else: - self.last_list += pieces[i:] - break if not nxt: self.last_list.append(piece) continue @@ -548,14 +540,6 @@ def parse_full_name(self): self.last_list.append(piece) self.suffix_list = pieces[i+1:] + self.suffix_list break - if piece in self.prefix_joins: - last_piece = pieces[-1:][0] - if self.is_suffix(last_piece): - self.last_list += pieces[i:-1] - self.suffix_list.insert(0, last_piece) - else: - self.last_list += pieces[i:] - break if not nxt: self.last_list.append(piece) continue @@ -596,9 +580,6 @@ def parse_full_name(self): if self.is_suffix(piece): self.suffix_list.append(piece) continue - if piece in self.prefix_joins: - self.last_list += pieces[i:] - break self.middle_list.append(piece) try: if parts[2]: @@ -685,16 +666,16 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # don't join on conjunctions if there's only 2 parts if 
length < 3: return pieces - + rootname_pieces = [p for p in pieces if self.is_rootname(p)] total_length = len(rootname_pieces) + additional_parts_count - + # find all the conjunctions, join any conjunctions that are next to each # other, then join those newly joined conjunctions and any single # conjunctions to the piece before and after it - conj_index = [i for i, piece in enumerate(pieces) + conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] - + contiguous_conj_i = [] for i, val in enumerate(conj_index): try: @@ -702,10 +683,10 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): contiguous_conj_i += [val] except IndexError: pass - + contiguous_conj_i = group_contiguous_integers(conj_index) - - delete_i = [] + + delete_i = [] for i in contiguous_conj_i: if type(i) == tuple: new_piece = " ".join(pieces[ i[0] : i[1]+1] ) @@ -717,7 +698,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): pieces[i] = new_piece #add newly joined conjunctions to constants to be found later self.C.conjunctions.add(new_piece) - + for i in reversed(delete_i): # delete pieces in reverse order or the index changes on each delete del pieces[i] @@ -728,7 +709,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # refresh conjunction index locations conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)] - + for i in conj_index: if len(pieces[i]) == 1 and total_length < 4: # if there are only 3 total parts (minus known titles, suffixes @@ -736,7 +717,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): # treating it as an initial rather than a conjunction. 
# http://code.google.com/p/python-nameparser/issues/detail?id=11 continue - + if i is 0: new_piece = " ".join(pieces[i:i+2]) if self.is_title(pieces[i+1]): @@ -748,8 +729,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for j,val in enumerate(conj_index): if val > i: conj_index[j]=val-1 - - else: + + else: new_piece = " ".join(pieces[i-1:i+2]) if self.is_title(pieces[i-1]): # when joining to a title, make new_piece a title too @@ -767,23 +748,51 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0): for j,val in enumerate(conj_index): if val > i: conj_index[j] = val - rm_count - - - # join prefixes to following lastnames: ['de la Vega'], ['van Buren'] + + + # join prefixes to following lastnames: ['de la Vega'], ['van Buren III'] prefixes = list(filter(self.is_prefix, pieces)) if prefixes: - i = pieces.index(prefixes[0]) - # join everything after the prefix until the next non prefix - # store joined pieces in prefix_joins. When a prefix occurs in a last name, - # I think it means the rest of the name is part of the last name, so prefix_joins - # lets us do that in the parser flow. - non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:])) - if non_suffixes: - j = pieces.index(non_suffixes[0]) - new_piece = ' '.join(pieces[i:j + 1]) - self.prefix_joins += [new_piece] - pieces = pieces[:i] + [new_piece] + pieces[j + 1:] - + for prefix in prefixes: + try: + i = pieces.index(prefix) + except ValueError: + # If the prefix is no longer in pieces, it's because it has been + # combined with the prefix that appears right before (or before that when + # chained together) in the last loop, so the index of that newly created + # piece is the same as in the last loop, i==i still, and we want to join + # it to the next piece. + pass + + new_piece = '' + + # join everything after the prefix until the next non prefix + # store joined pieces in prefix_joins. 
When a prefix occurs in a last name, + # I think it means the rest of the name is part of the last name, so prefix_joins + # lets us do that in the parser flow. + # for prefix in prefixes: + + try: + next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:]))) + j = pieces.index(next_prefix) + if j == i + 1: + # if there are two prefixes in sequence, join to the following piece + j += 1 + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + try: + # if there are no more prefixes, look for a suffix to stop at + stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:]))) + j = pieces.index(stop_at) + new_piece = ' '.join(pieces[i:j]) + pieces = pieces[:i] + [new_piece] + pieces[j:] + except StopIteration: + # if there were no suffixes, nothing to stop at so join all + # remaining pieces + new_piece = ' '.join(pieces[i:]) + pieces = pieces[:i] + [new_piece] + log.debug("pieces: {0}".format(pieces)) return pieces diff --git a/tests.py b/tests.py index f2979d8..fb15674 100644 --- a/tests.py +++ b/tests.py @@ -1247,18 +1247,6 @@ def test_name_is_conjunctions(self): hn = HumanName("e and e") self.m(hn.first, "e and e", hn) - def test_portuguese_dos(self): - hn = HumanName("Rafael Sousa dos Anjos") - self.m(hn.first, "Rafael", hn) - self.m(hn.middle, "Sousa", hn) - self.m(hn.last, "dos Anjos", hn) - - def test_portuguese_prefixes(self): - hn = HumanName("Joao da Silva do Amaral de Souza") - self.m(hn.first, "Joao", hn) - self.m(hn.middle, "", hn) - self.m(hn.last, "da Silva do Amaral de Souza", hn) - class ConstantsCustomization(HumanNameTestBase): @@ -1518,6 +1506,42 @@ def test_title_two_part_last_name_with_suffix_in_first_part(self): self.m(hn.last, "von bergen wessels", hn) self.m(hn.suffix, "MD, III", hn) + def test_portuguese_dos(self): + hn = HumanName("Rafael Sousa dos Anjos") + self.m(hn.first, "Rafael", hn) + self.m(hn.middle, "Sousa", hn) + self.m(hn.last, "dos Anjos", hn) + + def 
test_portuguese_prefixes(self): + hn = HumanName("Joao da Silva do Amaral de Souza") + self.m(hn.first, "Joao", hn) + self.m(hn.middle, "da Silva do Amaral", hn) + self.m(hn.last, "de Souza", hn) + + def test_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_lastname_three_conjunctions(self): + hn = HumanName("de la dos Vega, Dr. Juan Q. Xavier III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + + def test_comma_three_conjunctions(self): + hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III") + self.m(hn.first, "Juan", hn) + self.m(hn.last, "de la dos Vega", hn) + self.m(hn.title, "Dr.", hn) + self.m(hn.middle, "Q. Xavier", hn) + self.m(hn.suffix, "III", hn) + class SuffixesTestCase(HumanNameTestBase):