Skip to content

Commit

Permalink
Merge pull request #1856 from LBenzahia/fix/arabicstemmer-attributeError
Browse files Browse the repository at this point in the history
Fix issue ArabicStemmer AttributeError #1852
  • Loading branch information
stevenbird committed Oct 21, 2018
2 parents e7f4635 + c818bb5 commit e84c526
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 4 deletions.
11 changes: 8 additions & 3 deletions nltk/stem/snowball.py
Expand Up @@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels):

return rv

class ArabicStemmer(_LanguageSpecificStemmer):
class ArabicStemmer(_StandardStemmer):
"""
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
The Snowball Arabic light Stemmer
Expand Down Expand Up @@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token):

def __Suffix_Verb_Step2a(self, token):
for suffix in self.__suffix_verb_step2a:
if token.endswith(suffix):
if token.endswith(suffix) and len(token) > 3:
if suffix == '\u062a' and len(token) >= 4:
token = token[:-1]
self.suffix_verb_step2a_success = True
Expand Down Expand Up @@ -750,14 +750,19 @@ def stem(self, word):
self.__checks_1(modified_word)
# checks2
self.__checks_2(modified_word)
# Pre_Normalization
modified_word = self.__normalize_pre(modified_word)
# Avoid stopwords
if modified_word in self.stopwords or len(modified_word) <= 2:
return modified_word
# Start stemming
if self.is_verb:
modified_word = self.__Suffix_Verb_Step1(modified_word)
if self.suffixes_verb_step1_success:
modified_word = self.__Suffix_Verb_Step2a(modified_word)
if not self.suffix_verb_step2a_success :
modified_word = self.__Suffix_Verb_Step2c(modified_word)
#or next
# or next -- TODO: decide how to handle the "or next" instruction of the original Snowball algorithm
else:
modified_word = self.__Suffix_Verb_Step2b(modified_word)
if not self.suffix_verb_step2b_success:
Expand Down
17 changes: 16 additions & 1 deletion nltk/test/unit/test_stem.py
Expand Up @@ -17,14 +17,29 @@ def test_arabic(self):
Unit test for the Snowball Arabic light stemmer.
This stemmer deals with prefixes and suffixes.
"""
ar_stemmer = SnowballStemmer("arabic")
# Test where the ignore_stopwords=True.
ar_stemmer = SnowballStemmer("arabic", True)
assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("فالطالبات") == "طالب"
assert ar_stemmer.stem("والطالبات") == "طالب"
assert ar_stemmer.stem("الطالبون") == "طالب"
assert ar_stemmer.stem("اللذان") == "اللذان"
assert ar_stemmer.stem("من") == "من"
#Test where the ignore_stopwords=False.
ar_stemmer = SnowballStemmer("arabic", False)
assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"
# Test creating the Arabic stemmer without passing a value for ignore_stopwords
ar_stemmer = SnowballStemmer("arabic")
assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("الكلمات") == "كلم"

def test_russian(self):
# Russian words both consisting of Cyrillic
Expand Down

0 comments on commit e84c526

Please sign in to comment.