Skip to content

Commit

Permalink
add arabic stopwords list / fix issue ArabicStemmer AttributeError nl…
Browse files Browse the repository at this point in the history
  • Loading branch information
greenat92 committed Oct 16, 2017
1 parent 0477ceb commit 6b58c8c
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 10 deletions.
11 changes: 8 additions & 3 deletions nltk/stem/snowball.py
Expand Up @@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels):

return rv

class ArabicStemmer(_LanguageSpecificStemmer):
class ArabicStemmer(_StandardStemmer):
"""
https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
The Snowball Arabic light Stemmer
Expand Down Expand Up @@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token):

def __Suffix_Verb_Step2a(self, token):
for suffix in self.__suffix_verb_step2a:
if token.endswith(suffix):
if token.endswith(suffix) and len(token) > 3:
if suffix == '\u062a' and len(token) >= 4:
token = token[:-1]
self.suffix_verb_step2a_success = True
Expand Down Expand Up @@ -750,14 +750,19 @@ def stem(self, word):
self.__checks_1(modified_word)
# checks2
self.__checks_2(modified_word)
# Pre_Normalization
modified_word = self.__normalize_pre(modified_word)
# Avoid stopwords
if modified_word in self.stopwords or len(modified_word) <= 2:
return modified_word
# Start stemming
if self.is_verb:
modified_word = self.__Suffix_Verb_Step1(modified_word)
if self.suffixes_verb_step1_success:
modified_word = self.__Suffix_Verb_Step2a(modified_word)
if not self.suffix_verb_step2a_success :
modified_word = self.__Suffix_Verb_Step2c(modified_word)
#or next
#or next TODO: How to deal with or next instruction
else:
modified_word = self.__Suffix_Verb_Step2b(modified_word)
if not self.suffix_verb_step2b_success:
Expand Down
16 changes: 9 additions & 7 deletions nltk/test/unit/test_stem.py
Expand Up @@ -15,14 +15,16 @@ def test_arabic(self):
Unit test for the Snowball Arabic light stemmer.
This stemmer handles both prefixes and suffixes.
"""
ar_stemmer = SnowballStemmer("arabic")
ar_stemmer = SnowballStemmer("arabic", True)
assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
assert ar_stemmer.stem("العربية") == "عرب"
assert ar_stemmer.stem("فقالوا") == "قال"
assert ar_stemmer.stem("الطالبات") == "طالب"
assert ar_stemmer.stem("فالطالبات") == "طالب"
assert ar_stemmer.stem("والطالبات") == "طالب"
assert ar_stemmer.stem("الطالبون") == "طالب"
assert ar_stemmer.stem("اللذان") == "اللذان"
assert ar_stemmer.stem("من") == "من"

def test_russian(self):
# Russian words both consisting of Cyrillic
Expand Down Expand Up @@ -54,11 +56,11 @@ def test_short_strings_bug(self):
assert stemmer.stem("y's") == 'y'

class PorterTest(unittest.TestCase):

def _vocabulary(self):
with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp:
return fp.read().splitlines()

def _test_against_expected_output(self, stemmer_mode, expected_stems):
stemmer = PorterStemmer(mode=stemmer_mode)
for word, true_stem in zip(self._vocabulary(), expected_stems):
Expand All @@ -68,10 +70,10 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems):
word, true_stem, stemmer_mode, our_stem
)
)

def test_vocabulary_martin_mode(self):
"""Tests all words from the test vocabulary provided by M Porter
The sample vocabulary and output were sourced from:
http://tartarus.org/martin/PorterStemmer/voc.txt
http://tartarus.org/martin/PorterStemmer/output.txt
Expand All @@ -84,14 +86,14 @@ def test_vocabulary_martin_mode(self):
PorterStemmer.MARTIN_EXTENSIONS,
fp.read().splitlines()
)

def test_vocabulary_nltk_mode(self):
with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp:
self._test_against_expected_output(
PorterStemmer.NLTK_EXTENSIONS,
fp.read().splitlines()
)

def test_vocabulary_original_mode(self):
# The list of stems for this test was generated by taking the
# Martin-blessed stemmer from
Expand Down

0 comments on commit 6b58c8c

Please sign in to comment.