add arabic stopwords list / fix issue ArabicStemmer AttributeError nltk#1852
greenat92 · Oct 16, 2017 · 6b58c8c · 6b58c8c
1 parent 0477ceb
commit 6b58c8c
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 10 deletions.
diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py
@@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels):
 
         return rv
 
-class ArabicStemmer(_LanguageSpecificStemmer):
+class ArabicStemmer(_StandardStemmer):
     """
         https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm)
         The Snowball Arabic light Stemmer
@@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token):
 
     def __Suffix_Verb_Step2a(self, token):
         for suffix in self.__suffix_verb_step2a:
-            if token.endswith(suffix):
+            if token.endswith(suffix) and len(token) > 3:
                 if suffix == '\u062a' and len(token) >= 4:
                     token = token[:-1]
                     self.suffix_verb_step2a_success = True
@@ -750,14 +750,19 @@ def stem(self, word):
         self.__checks_1(modified_word)
         # checks2
         self.__checks_2(modified_word)
+        # Pre_Normalization
         modified_word = self.__normalize_pre(modified_word)
+        # Avoid stopwords
+        if modified_word in self.stopwords or len(modified_word) <= 2:
+            return modified_word
+        # Start stemming
         if self.is_verb:
             modified_word = self.__Suffix_Verb_Step1(modified_word)
             if  self.suffixes_verb_step1_success:
                 modified_word = self.__Suffix_Verb_Step2a(modified_word)
                 if not self.suffix_verb_step2a_success :
                     modified_word = self.__Suffix_Verb_Step2c(modified_word)
-                #or next
+                #or next TODO: How to deal with or next instruction
             else:
                 modified_word = self.__Suffix_Verb_Step2b(modified_word)
                 if not self.suffix_verb_step2b_success:

diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py
@@ -15,14 +15,16 @@ def test_arabic(self):
         this unit testing for test the snowball arabic light stemmer
         this stemmer deals with prefixes and suffixes
         """
-        ar_stemmer = SnowballStemmer("arabic")
+        ar_stemmer = SnowballStemmer("arabic", True)
         assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
         assert ar_stemmer.stem("العربية") == "عرب"
         assert ar_stemmer.stem("فقالوا") == "قال"
         assert ar_stemmer.stem("الطالبات") == "طالب"
         assert ar_stemmer.stem("فالطالبات") == "طالب"
         assert ar_stemmer.stem("والطالبات") == "طالب"
         assert ar_stemmer.stem("الطالبون") == "طالب"
+        assert ar_stemmer.stem("اللذان") == "اللذان"
+        assert ar_stemmer.stem("من") == "من"
 
     def test_russian(self):
         # Russian words both consisting of Cyrillic
@@ -54,11 +56,11 @@ def test_short_strings_bug(self):
         assert stemmer.stem("y's") == 'y'
 
 class PorterTest(unittest.TestCase):
-    
+
     def _vocabulary(self):
         with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp:
             return fp.read().splitlines()
-        
+
     def _test_against_expected_output(self, stemmer_mode, expected_stems):
         stemmer = PorterStemmer(mode=stemmer_mode)
         for word, true_stem in zip(self._vocabulary(), expected_stems):
@@ -68,10 +70,10 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems):
                     word, true_stem, stemmer_mode, our_stem
                 )
             )
-    
+
     def test_vocabulary_martin_mode(self):
         """Tests all words from the test vocabulary provided by M Porter
-        
+
         The sample vocabulary and output were sourced from:
             http://tartarus.org/martin/PorterStemmer/voc.txt
             http://tartarus.org/martin/PorterStemmer/output.txt
@@ -84,14 +86,14 @@ def test_vocabulary_martin_mode(self):
                 PorterStemmer.MARTIN_EXTENSIONS,
                 fp.read().splitlines()
             )
-        
+
     def test_vocabulary_nltk_mode(self):
         with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp:
             self._test_against_expected_output(
                 PorterStemmer.NLTK_EXTENSIONS,
                 fp.read().splitlines()
             )
-        
+
     def test_vocabulary_original_mode(self):
         # The list of stems for this test was generated by taking the
         # Martin-blessed stemmer from