URL: http://github.com/derek73/python-nameparser/pull/122.patch
ss. Subclass of ``dict``. Makes the tuple constants - more friendly. - ''' - def __getattr__(self, attr): - return self.get(attr) - __setattr__= dict.__setitem__ - __delattr__= dict.__delitem__ - - def __getstate__(self): - return dict(self) - - def __setstate__(self, state): - self.__init__(state) - - def __reduce__(self): - return (TupleManager, (), self.__getstate__()) - -REGEXES = [ - ("spaces", re.compile(r"\s+", re.U)), - ("word", re.compile(r"(\w|\.)+", re.U)), - ("mac", re.compile(r'^(ma?c)(\w{2,})', re.I | re.U)), - ("initial", re.compile(r'^(\w\.|[A-Z])?$', re.U)), - ("quoted_word", re.compile(r'(? Date: Mon, 22 Mar 2021 14:11:19 -0400 Subject: [PATCH 3/4] added nickname tests * test for adding nickname * test for multiple nicknames --- nameparser/config/regexes.py | 1 + nameparser/config/suffixes.py | 2 + nameparser/parser.py | 75 +++++++++++++++++++++++++++++------ tests.py | 45 +++++++++++++++++++++ 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/nameparser/config/regexes.py b/nameparser/config/regexes.py index e520169..ab2e8bf 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -36,6 +36,7 @@ ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), + ("nn_sep_safe", re.compile(r'[^ ,]', re.U)), ] """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/config/suffixes.py b/nameparser/config/suffixes.py index 9765b92..7af82b8 100644 --- a/nameparser/config/suffixes.py +++ b/nameparser/config/suffixes.py @@ -6,6 +6,7 @@ 'esq', 'esquire', 'jr', + 'jr.', 'jnr', 'junior', 'sr', @@ -25,6 +26,7 @@ """ SUFFIX_ACRONYMS = set([ '(ret)', + '(ret.)', '(vet)', '8-vsb', 'aas', diff --git a/nameparser/parser.py b/nameparser/parser.py index e49fd8f..4b9abd9 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -254,7 +254,11 @@ def nickname(self): The person's nicknames. Any text found inside of quotes (``""``) or parenthesis (``()``) """ - return " ".join(self.nickname_list) or self.C.empty_attribute_default + if len(self.nickname_list) <= 1: + f_string = '{0}' + else: + f_string = '"{0}"' + return ", ".join([f_string.format(nn) for nn in self.nickname_list]) or self.C.empty_attribute_default @property def surnames_list(self): @@ -408,11 +412,14 @@ def pre_process(self): def post_process(self): """ This happens at the end of the :py:func:`parse_full_name` after - all other processing has taken place. Runs :py:func:`handle_firstnames` - and :py:func:`handle_capitalization`. + all other processing has taken place. Runs + :py:func:`handle_firstnames` + :py:func:`handle_capitalization` + :py:func:`check_suffixes_in_nicknames` #skipping this feature """ self.handle_firstnames() self.handle_capitalization() + #self.check_suffixes_in_nicknames() def fix_phd(self): _re = self.C.regexes.phd @@ -423,21 +430,49 @@ def fix_phd(self): def parse_nicknames(self): """ - The content of parenthesis or quotes in the name will be added to the + The content of defined nickname regex patterns in the name will be added to the nicknames list. This happens before any other processing of the name. - - Single quotes cannot span white space characters and must border - white space to allow for quotes in names like O'Connor and Kawai'ae'a. - Double quotes and parenthesis can span white space. - + + Some basic rules for nickname processing: + * Nicknames must begin with a word character. + * Nickname patterns should include an outer (not processed) + delimiter that excludes word characters. + Loops through :py:data:`~nameparser.config.regexes.REGEXES` with label/tag like "nickname" """ - + #ToDo: + # * create a list of matches + # * sort the list by span + # * check inter-match strings for commas + # * remove the commas if safe to remove + # safe = character(s) between matches are ONLY + # spaces and commas + # * iterate the matches, collecting the nicknames + # and removing the matches from self._full_name + nn_matches = [] + nn_sep = self.C.regexes.nn_sep_safe + _fn = self._full_name for _re in self._nickname_regexes: - if _re.search(self._full_name): - self.nickname_list += [x for x in _re.findall(self._full_name)] - self._full_name = _re.sub(' ', self._full_name) + if _re.search(_fn): + nn_matches.extend( _re.finditer(_fn) ) + #remove matches from string + for _match in _re.finditer(_fn): + _fn = (' ' * (_match.end() - _match.start())).join([_fn[:_match.start()], _fn[_match.end():]]) + + if len(nn_matches) == 0: + return #"empty matches" + + nn_matches.sort(key=lambda x: x.span()) + + #remove any inter-match commas, if safe to do so + for low, high in zip(nn_matches[0:-1], nn_matches[1:]): + if nn_sep.search(self._full_name[low.span()[1]:high.span()[0]]) is None: + self._full_name = ' '.join([self._full_name[:low.span()[1]], self._full_name[high.span()[0]:] ]) + + for nn_match in nn_matches: + self.nickname_list.append( nn_match.groups(0)[0] ) + self._full_name = nn_match.re.sub(' ', self._full_name, 1) def squash_emoji(self): """ @@ -459,6 +494,20 @@ def handle_firstnames(self): and not lc(self.title) in self.C.first_name_titles: self.last, self.first = self.first, self.last + def check_suffixes_in_nicknames(self): + """ + Iterate the nicknames, testing whether any of them are suffixes. + If there isn't (also) an identical suffix, then move that nickname + to the suffix_list + """ + for _nn in self.nickname_list: + if (_nn.lower() in self.C.suffix_acronyms or \ + _nn.lower() in self.C.suffix_not_acronyms) and \ + _nn not in self.suffix_list: + self.suffix_list.append(_nn) + self.nickname_list.remove(_nn) + + def parse_full_name(self): """ diff --git a/tests.py b/tests.py index 5f976b8..5c88c6a 100644 --- a/tests.py +++ b/tests.py @@ -27,6 +27,7 @@ from nameparser import HumanName from nameparser.util import u from nameparser.config import Constants +import re log = logging.getLogger('HumanName') @@ -1491,7 +1492,36 @@ def test_nickname_and_last_name_with_title(self): self.m(hn.last, "Edmonds", hn) self.m(hn.nickname, "Rick", hn) + def test_append_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.append(new_rgx) + self.assertEqual(hn._nickname_regexes[-1], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, ":close", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "_openBen", hn) + def test_prepend_nickname(self): + hn = HumanName() + new_rgx = re.compile(r'(?!\w)\(_open(\w[^)]*?)\):close(?!\w)', re.UNICODE) + hn._nickname_regexes.insert(0, new_rgx) + self.assertEqual(hn._nickname_regexes[0], new_rgx) + hn.full_name = r"Benjamin (_openBen):close Franklin" + self.m(hn.first, "Benjamin", hn) + self.m(hn.middle, "", hn) + self.m(hn.last, "Franklin", hn) + self.m(hn.nickname, "Ben", hn) + + def test_multiple_nicknames(self): + hn = HumanName('Chief Justice John (JR), "No Glove, No Love" Glover Roberts, Jr.') + self.m(hn.title, 'Chief Justice', hn) + self.m(hn.first, "John", hn) + self.m(hn.middle, "Glover", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "Jr.", hn) + self.m(hn.nickname, '"JR", "No Glove, No Love"', hn) # class MaidenNameTestCase(HumanNameTestBase): # @@ -1766,6 +1796,21 @@ def test_suffix_with_periods_with_lastname_comma(self): self.m(hn.last, "Doe", hn) self.m(hn.suffix, "Msc.Ed.", hn) + @unittest.SkipTest + def test_suffix_in_nickname_dup(self): + hn = HumanName("John (JR) Roberts, JR") + self.m(hn.first, "John", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "JR", hn) + self.m(hn.nickname, "JR", hn) + + @unittest.SkipTest + def test_suffix_in_nickname_solo(self): + hn = HumanName("John (JR) Roberts") + self.m(hn.first, "John", hn) + self.m(hn.last, "Roberts", hn) + self.m(hn.suffix, "JR", hn) + self.m(hn.nickname, "", hn) class TitleTestCase(HumanNameTestBase): From 129667cbfb893d9f2439ca4245ce0c075bf1501a Mon Sep 17 00:00:00 2001 From: aikimarkNote: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: