Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issue ArabicStemmer AttributeError #1852 #1856

Merged
merged 2 commits into from Oct 21, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
11 changes: 8 additions & 3 deletions nltk/stem/snowball.py
Expand Up @@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels):

return rv

class ArabicStemmer(_LanguageSpecificStemmer):
class ArabicStemmer(_StandardStemmer):
"""
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
The Snowball Arabic light Stemmer
Expand Down Expand Up @@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token):

def __Suffix_Verb_Step2a(self, token):
for suffix in self.__suffix_verb_step2a:
if token.endswith(suffix):
if token.endswith(suffix) and len(token) > 3:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just out of curiosity, is there a linguistic reason to avoid words with 2 characters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We haven't studied the case of 2-character words yet; we've mentioned it in our list of todos too.

if suffix == '\u062a' and len(token) >= 4:
token = token[:-1]
self.suffix_verb_step2a_success = True
Expand Down Expand Up @@ -750,14 +750,19 @@ def stem(self, word):
self.__checks_1(modified_word)
# checks2
self.__checks_2(modified_word)
# Pre_Normalization
modified_word = self.__normalize_pre(modified_word)
# Avoid stopwords
if modified_word in self.stopwords or len(modified_word) <= 2:
return modified_word
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

# Start stemming
if self.is_verb:
modified_word = self.__Suffix_Verb_Step1(modified_word)
if self.suffixes_verb_step1_success:
modified_word = self.__Suffix_Verb_Step2a(modified_word)
if not self.suffix_verb_step2a_success :
modified_word = self.__Suffix_Verb_Step2c(modified_word)
#or next
#or next TODO: How to deal with or next instruction
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which cases would there be more steps that needs to be applied here? Perhaps, it'll be good to list these cases down.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're working on it and the other todos; when we solve them, we'll send a PR with the updates.
By "or next" I mean this line from the original algorithm. As you know, I've rewritten the algorithm by hand to follow the NLTK code-style guidelines and avoid the generated code from the Snowball generator.
You can take a look at this list of issues and todos.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a link to the assem-ch/arabicstemmer#1 in the github comment too? That'll be helpful for us to track later. Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've created an issue in NLTK to track the changes later and added a comment in assem-ch/arabicstemmer#1. I hope this is helpful; sorry for the late reply.

else:
modified_word = self.__Suffix_Verb_Step2b(modified_word)
if not self.suffix_verb_step2b_success:
Expand Down
16 changes: 9 additions & 7 deletions nltk/test/unit/test_stem.py
Expand Up @@ -15,14 +15,16 @@ def test_arabic(self):
this unit testing for test the snowball arabic light stemmer
this stemmer deals with prefixes and suffixes
"""
ar_stemmer = SnowballStemmer("arabic")
ar_stemmer = SnowballStemmer("arabic", True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add another test where the ignore_stopwords=False.

assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("فالطالبات") == "طالب"
assert ar_stemmer.stem("والطالبات") == "طالب"
assert ar_stemmer.stem("الطالبون") == "طالب"
assert ar_stemmer.stem("اللذان") == "اللذان"
assert ar_stemmer.stem("من") == "من"

def test_russian(self):
# Russian words both consisting of Cyrillic
Expand Down Expand Up @@ -54,11 +56,11 @@ def test_short_strings_bug(self):
assert stemmer.stem("y's") == 'y'

class PorterTest(unittest.TestCase):

def _vocabulary(self):
with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp:
return fp.read().splitlines()

def _test_against_expected_output(self, stemmer_mode, expected_stems):
stemmer = PorterStemmer(mode=stemmer_mode)
for word, true_stem in zip(self._vocabulary(), expected_stems):
Expand All @@ -68,10 +70,10 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems):
word, true_stem, stemmer_mode, our_stem
)
)

def test_vocabulary_martin_mode(self):
"""Tests all words from the test vocabulary provided by M Porter

The sample vocabulary and output were sourced from:
http://tartarus.org/martin/PorterStemmer/voc.txt
http://tartarus.org/martin/PorterStemmer/output.txt
Expand All @@ -84,14 +86,14 @@ def test_vocabulary_martin_mode(self):
PorterStemmer.MARTIN_EXTENSIONS,
fp.read().splitlines()
)

def test_vocabulary_nltk_mode(self):
with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp:
self._test_against_expected_output(
PorterStemmer.NLTK_EXTENSIONS,
fp.read().splitlines()
)

def test_vocabulary_original_mode(self):
# The list of stems for this test was generated by taking the
# Martin-blessed stemmer from
Expand Down