Skip to content

Commit

Permalink
refactor prefix handling to correctly parse Portuguese prefixes #72
Browse files Browse the repository at this point in the history
while continuing to support multiple names after a prefix #23
  • Loading branch information
derek73 committed Aug 31, 2018
1 parent e9fd11e commit 10f34e4
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 67 deletions.
16 changes: 8 additions & 8 deletions docs/customize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ instantiate the :py:class:`~nameparser.parser.HumanName` class (see below).
Editable attributes of nameparser.config.CONSTANTS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* :py:class:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
* :py:class:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
* :py:class:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
* :py:class:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
* :py:class:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
* :py:class:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
* :py:class:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
* :py:class:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.
* :py:obj:`~nameparser.config.CONSTANTS.titles` - Pieces that come before the name. Includes all `first_name_titles`. Cannot include things that may be first names.
* :py:obj:`~nameparser.config.CONSTANTS.first_name_titles` - Titles that, when followed by a single name, that name is a first name, e.g. "King David".
* :py:obj:`~nameparser.config.CONSTANTS.suffix_acronyms` - Pieces that come at the end of the name that may or may not have periods separating the letters, e.g. "m.d.".
* :py:obj:`~nameparser.config.CONSTANTS.suffix_not_acronyms` - Pieces that come at the end of the name that never have periods separating the letters, e.g. "Jr.".
* :py:obj:`~nameparser.config.CONSTANTS.conjunctions` - Connectors like "and" that join the preceding piece to the following piece.
* :py:obj:`~nameparser.config.CONSTANTS.prefixes` - Connectors like "del" and "bin" that join to the following piece but not the preceding, similar to titles but can appear anywhere in the name.
* :py:obj:`~nameparser.config.CONSTANTS.capitalization_exceptions` - Dictionary of pieces that do not capitalize the first letter, e.g. "Ph.D".
* :py:obj:`~nameparser.config.CONSTANTS.regexes` - Regular expressions used to find words, initials, nicknames, etc.

Each set of constants comes with :py:func:`~nameparser.config.SetManager.add` and :py:func:`~nameparser.config.SetManager.remove` methods for tuning
the constants for your project. These methods automatically lower case and
Expand Down
10 changes: 9 additions & 1 deletion nameparser/config/prefixes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

#: Name pieces that appear before a last name. They join to the piece that follows them to make one new piece.
#: Name pieces that appear before a last name. Prefixes join to the piece
#: that follows them to make one new piece. They can be chained together, e.g.
#: "von der" and "de la". Because they only appear in middle or last names,
#: they also signify that all following name pieces should be in the same name
#: part. For example, "von" will be joined to all following pieces that are not
#: prefixes or suffixes, allowing recognition of double last names when they
#: appear after a prefix. So in "pennie von bergen wessels MD", "von" will
#: join with all following name pieces until the suffix "MD", resulting in the
#: correct parsing of the last name "von bergen wessels".
PREFIXES = set([
'abu',
'bin',
Expand Down
101 changes: 55 additions & 46 deletions nameparser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,14 +501,6 @@ def parse_full_name(self):
self.last_list.append(piece)
self.suffix_list += pieces[i+1:]
break
if piece in self.prefix_joins:
last_piece = pieces[-1:][0]
if self.is_suffix(last_piece):
self.last_list += pieces[i:-1]
self.suffix = last_piece
else:
self.last_list += pieces[i:]
break
if not nxt:
self.last_list.append(piece)
continue
Expand Down Expand Up @@ -548,14 +540,6 @@ def parse_full_name(self):
self.last_list.append(piece)
self.suffix_list = pieces[i+1:] + self.suffix_list
break
if piece in self.prefix_joins:
last_piece = pieces[-1:][0]
if self.is_suffix(last_piece):
self.last_list += pieces[i:-1]
self.suffix_list.insert(0, last_piece)
else:
self.last_list += pieces[i:]
break
if not nxt:
self.last_list.append(piece)
continue
Expand Down Expand Up @@ -596,9 +580,6 @@ def parse_full_name(self):
if self.is_suffix(piece):
self.suffix_list.append(piece)
continue
if piece in self.prefix_joins:
self.last_list += pieces[i:]
break
self.middle_list.append(piece)
try:
if parts[2]:
Expand Down Expand Up @@ -685,27 +666,27 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
# don't join on conjunctions if there's only 2 parts
if length < 3:
return pieces

rootname_pieces = [p for p in pieces if self.is_rootname(p)]
total_length = len(rootname_pieces) + additional_parts_count

# find all the conjunctions, join any conjunctions that are next to each
# other, then join those newly joined conjunctions and any single
# conjunctions to the piece before and after it
conj_index = [i for i, piece in enumerate(pieces)
conj_index = [i for i, piece in enumerate(pieces)
if self.is_conjunction(piece)]

contiguous_conj_i = []
for i, val in enumerate(conj_index):
try:
if conj_index[i+1] == val+1:
contiguous_conj_i += [val]
except IndexError:
pass

contiguous_conj_i = group_contiguous_integers(conj_index)
delete_i = []

delete_i = []
for i in contiguous_conj_i:
if type(i) == tuple:
new_piece = " ".join(pieces[ i[0] : i[1]+1] )
Expand All @@ -717,7 +698,7 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
pieces[i] = new_piece
#add newly joined conjunctions to constants to be found later
self.C.conjunctions.add(new_piece)

for i in reversed(delete_i):
# delete pieces in reverse order or the index changes on each delete
del pieces[i]
Expand All @@ -728,15 +709,15 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):

# refresh conjunction index locations
conj_index = [i for i, piece in enumerate(pieces) if self.is_conjunction(piece)]

for i in conj_index:
if len(pieces[i]) == 1 and total_length < 4:
# if there are only 3 total parts (minus known titles, suffixes
# and prefixes) and this conjunction is a single letter, prefer
# treating it as an initial rather than a conjunction.
# http://code.google.com/p/python-nameparser/issues/detail?id=11
continue

if i is 0:
new_piece = " ".join(pieces[i:i+2])
if self.is_title(pieces[i+1]):
Expand All @@ -748,8 +729,8 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
for j,val in enumerate(conj_index):
if val > i:
conj_index[j]=val-1
else:

else:
new_piece = " ".join(pieces[i-1:i+2])
if self.is_title(pieces[i-1]):
# when joining to a title, make new_piece a title too
Expand All @@ -767,23 +748,51 @@ def join_on_conjunctions(self, pieces, additional_parts_count=0):
for j,val in enumerate(conj_index):
if val > i:
conj_index[j] = val - rm_count
# join prefixes to following lastnames: ['de la Vega'], ['van Buren']


# join prefixes to following lastnames: ['de la Vega'], ['van Buren III']
prefixes = list(filter(self.is_prefix, pieces))
if prefixes:
i = pieces.index(prefixes[0])
# join everything after the prefix until the next non prefix
# store joined pieces in prefix_joins. When a prefix occurs in a last name,
# I think it means the rest of the name is part of the last name, so prefix_joins
# lets us do that in the parser flow.
non_suffixes = list(filter(lambda x: not self.is_prefix(x), pieces[i:]))
if non_suffixes:
j = pieces.index(non_suffixes[0])
new_piece = ' '.join(pieces[i:j + 1])
self.prefix_joins += [new_piece]
pieces = pieces[:i] + [new_piece] + pieces[j + 1:]

for prefix in prefixes:
try:
i = pieces.index(prefix)
except ValueError:
# If the prefix is no longer in pieces, it has already been combined
# with the prefix that appears right before it (or earlier, when several
# prefixes are chained together) during a previous iteration. The newly
# created piece sits at the same index, so `i` keeps its value from the
# previous iteration and we want to join that piece to the next piece.
pass

new_piece = ''

# join everything after the prefix until the next non prefix
# store joined pieces in prefix_joins. When a prefix occurs in a last name,
# I think it means the rest of the name is part of the last name, so prefix_joins
# lets us do that in the parser flow.
# for prefix in prefixes:

try:
next_prefix = next(iter(filter(self.is_prefix, pieces[i + 1:])))
j = pieces.index(next_prefix)
if j == i + 1:
# if there are two prefixes in sequence, join to the following piece
j += 1
new_piece = ' '.join(pieces[i:j])
pieces = pieces[:i] + [new_piece] + pieces[j:]
except StopIteration:
try:
# if there are no more prefixes, look for a suffix to stop at
stop_at = next(iter(filter(self.is_suffix, pieces[i + 1:])))
j = pieces.index(stop_at)
new_piece = ' '.join(pieces[i:j])
pieces = pieces[:i] + [new_piece] + pieces[j:]
except StopIteration:
# if there were no suffixes, nothing to stop at so join all
# remaining pieces
new_piece = ' '.join(pieces[i:])
pieces = pieces[:i] + [new_piece]

log.debug("pieces: {0}".format(pieces))
return pieces

Expand Down
48 changes: 36 additions & 12 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1247,18 +1247,6 @@ def test_name_is_conjunctions(self):
hn = HumanName("e and e")
self.m(hn.first, "e and e", hn)

def test_portuguese_dos(self):
hn = HumanName("Rafael Sousa dos Anjos")
self.m(hn.first, "Rafael", hn)
self.m(hn.middle, "Sousa", hn)
self.m(hn.last, "dos Anjos", hn)

def test_portuguese_prefixes(self):
hn = HumanName("Joao da Silva do Amaral de Souza")
self.m(hn.first, "Joao", hn)
self.m(hn.middle, "", hn)
self.m(hn.last, "da Silva do Amaral de Souza", hn)


class ConstantsCustomization(HumanNameTestBase):

Expand Down Expand Up @@ -1518,6 +1506,42 @@ def test_title_two_part_last_name_with_suffix_in_first_part(self):
self.m(hn.last, "von bergen wessels", hn)
self.m(hn.suffix, "MD, III", hn)

def test_portuguese_dos(self):
hn = HumanName("Rafael Sousa dos Anjos")
self.m(hn.first, "Rafael", hn)
self.m(hn.middle, "Sousa", hn)
self.m(hn.last, "dos Anjos", hn)

def test_portuguese_prefixes(self):
hn = HumanName("Joao da Silva do Amaral de Souza")
self.m(hn.first, "Joao", hn)
self.m(hn.middle, "da Silva do Amaral", hn)
self.m(hn.last, "de Souza", hn)

def test_three_conjunctions(self):
hn = HumanName("Dr. Juan Q. Xavier de la dos Vega III")
self.m(hn.first, "Juan", hn)
self.m(hn.last, "de la dos Vega", hn)
self.m(hn.title, "Dr.", hn)
self.m(hn.middle, "Q. Xavier", hn)
self.m(hn.suffix, "III", hn)

def test_lastname_three_conjunctions(self):
hn = HumanName("de la dos Vega, Dr. Juan Q. Xavier III")
self.m(hn.first, "Juan", hn)
self.m(hn.last, "de la dos Vega", hn)
self.m(hn.title, "Dr.", hn)
self.m(hn.middle, "Q. Xavier", hn)
self.m(hn.suffix, "III", hn)

def test_comma_three_conjunctions(self):
hn = HumanName("Dr. Juan Q. Xavier de la dos Vega, III")
self.m(hn.first, "Juan", hn)
self.m(hn.last, "de la dos Vega", hn)
self.m(hn.title, "Dr.", hn)
self.m(hn.middle, "Q. Xavier", hn)
self.m(hn.suffix, "III", hn)


class SuffixesTestCase(HumanNameTestBase):

Expand Down

0 comments on commit 10f34e4

Please sign in to comment.