Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid recursive suffix stripping in wordnet morphy #3225

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
52 changes: 16 additions & 36 deletions nltk/corpus/reader/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,6 +1423,7 @@ def _next_token():
# map lemmas and parts of speech to synsets
self._lemma_pos_offset_map[lemma][pos] = synset_offsets
if pos == ADJ:
# NOTE: every adjective is also registered as a satellite adjective (is this indiscriminate duplication intended?):
self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets

def _load_exception_map(self):
Expand Down Expand Up @@ -2018,8 +2019,9 @@ def morphy(self, form, pos=None, check_exceptions=True):
"""
Find a possible base form for the given form, with the given
part of speech, by checking WordNet's list of exceptional
forms, and by recursively stripping affixes for this part of
speech until a form in WordNet is found.
forms, or by substituting suffixes for this part of speech.
If pos is None, try each part of speech in turn until one yields a lemma.
Return the first form found in WordNet, or None if no form is found.

>>> from nltk.corpus import wordnet as wn
>>> print(wn.morphy('dogs'))
Expand All @@ -2035,19 +2037,11 @@ def morphy(self, form, pos=None, check_exceptions=True):
book
>>> wn.morphy('book', wn.ADJ)
"""

if pos is None:
morphy = self._morphy
analyses = chain(a for p in POS_LIST for a in morphy(form, p))
else:
for pos in [pos] if pos else POS_LIST:
analyses = self._morphy(form, pos, check_exceptions)

# get the first one we find
first = list(islice(analyses, 1))
if len(first) == 1:
return first[0]
else:
return None
if analyses:
# Stop (don't try more parts of speech):
return analyses[0]

MORPHOLOGICAL_SUBSTITUTIONS = {
NOUN: [
Expand Down Expand Up @@ -2082,8 +2076,7 @@ def _morphy(self, form, pos, check_exceptions=True):
# Given an original string x
# 1. Apply rules once to the input to get y1, y2, y3, etc.
# 2. Return all that are in the database
# 3. If there are no matches, keep applying rules until you either
# find a match or you can't go any further
# (edited by ekaf) If there are no matches, return an empty list.

exceptions = self._exception_map[pos]
substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
Expand All @@ -2107,28 +2100,15 @@ def filter_forms(forms):
seen.add(form)
return result

# 0. Check the exception lists
if check_exceptions:
if form in exceptions:
return filter_forms([form] + exceptions[form])

# 1. Apply rules once to the input to get y1, y2, y3, etc.
forms = apply_rules([form])
if check_exceptions and form in exceptions:
# 0. Check the exception lists
forms = exceptions[form]
else:
# 1. Apply rules once to the input to get y1, y2, y3, etc.
forms = apply_rules([form])

# 2. Return all that are in the database (and check the original too)
results = filter_forms([form] + forms)
if results:
return results

# 3. If there are no matches, keep applying rules until we find a match
while forms:
forms = apply_rules(forms)
results = filter_forms(forms)
if results:
return results

# Return an empty list if we can't find anything
return []
return filter_forms([form] + forms)

#############################################################
# Create information content from corpus
Expand Down
51 changes: 42 additions & 9 deletions nltk/stem/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <stevenbird1@gmail.com>
# Edward Loper <edloper@gmail.com>
# Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

Expand All @@ -13,8 +14,45 @@ class WordNetLemmatizer:
"""
WordNet Lemmatizer

Lemmatize using WordNet's built-in morphy function.
Returns the input word unchanged if it cannot be found in WordNet.
Provides 3 lemmatizer modes:

1. _morphy() is an alias for WordNet's _morphy lemmatizer.
It returns a list of all lemmas found in WordNet.

>>> wnl = WordNetLemmatizer()
>>> print(wnl._morphy('us', 'n'))
['us', 'u']

2. morphy() is a restrictive wrapper around _morphy().
It returns the first lemma found in WordNet,
or None if no lemma is found.

>>> print(wnl.morphy('us', 'n'))
us

>>> print(wnl.morphy('catss'))
None

3. lemmatize() is a permissive wrapper around _morphy().
It returns the shortest lemma found in WordNet,
or the input string unchanged if nothing is found.

>>> print(wnl.lemmatize('us', 'n'))
u

>>> print(wnl.lemmatize('Anythinggoeszxcv'))
Anythinggoeszxcv

"""

morphy = wn.morphy

_morphy = wn._morphy

def lemmatize(self, word: str, pos: str = "n") -> str:
"""Lemmatize `word` by picking the shortest of the possible lemmas,
using the wordnet corpus reader's built-in _morphy function.
Returns the input word unchanged if it cannot be found in WordNet.

>>> from nltk.stem import WordNetLemmatizer
>>> wnl = WordNetLemmatizer()
Expand All @@ -28,21 +66,16 @@ class WordNetLemmatizer:
abacus
>>> print(wnl.lemmatize('hardrock'))
hardrock
"""

def lemmatize(self, word: str, pos: str = "n") -> str:
"""Lemmatize `word` using WordNet's built-in morphy function.
Returns the input word unchanged if it cannot be found in WordNet.

:param word: The input word to lemmatize.
:type word: str
:param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
`"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
for satellite adjectives.
:type pos: str
:return: The lemma of `word`, for the given `pos`.
:return: The shortest lemma of `word`, for the given `pos`.
"""
lemmas = wn._morphy(word, pos)
lemmas = self._morphy(word, pos)
return min(lemmas, key=len) if lemmas else word

def __repr__(self):
Expand Down