From 6b58c8cc27e53fd111226cec55306fa1984dab7e Mon Sep 17 00:00:00 2001 From: LBenzahia Date: Fri, 13 Oct 2017 14:35:29 +0100 Subject: [PATCH] add arabic stopwords list / fix issue ArabicStemmer AttributeError #1852 --- nltk/stem/snowball.py | 11 ++++++++--- nltk/test/unit/test_stem.py | 16 +++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/nltk/stem/snowball.py b/nltk/stem/snowball.py index 00b511c183..3d8863fd53 100644 --- a/nltk/stem/snowball.py +++ b/nltk/stem/snowball.py @@ -294,7 +294,7 @@ def _rv_standard(self, word, vowels): return rv -class ArabicStemmer(_LanguageSpecificStemmer): +class ArabicStemmer(_StandardStemmer): """ https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) The Snowball Arabic light Stemmer @@ -516,7 +516,7 @@ def __Suffix_Verb_Step1(self, token): def __Suffix_Verb_Step2a(self, token): for suffix in self.__suffix_verb_step2a: - if token.endswith(suffix): + if token.endswith(suffix) and len(token) > 3: if suffix == '\u062a' and len(token) >= 4: token = token[:-1] self.suffix_verb_step2a_success = True @@ -750,14 +750,19 @@ def stem(self, word): self.__checks_1(modified_word) # checks2 self.__checks_2(modified_word) + # Pre_Normalization modified_word = self.__normalize_pre(modified_word) + # Avoid stopwords + if modified_word in self.stopwords or len(modified_word) <= 2: + return modified_word + # Start stemming if self.is_verb: modified_word = self.__Suffix_Verb_Step1(modified_word) if self.suffixes_verb_step1_success: modified_word = self.__Suffix_Verb_Step2a(modified_word) if not self.suffix_verb_step2a_success : modified_word = self.__Suffix_Verb_Step2c(modified_word) - #or next + #or next TODO: How to deal with or next instruction else: modified_word = self.__Suffix_Verb_Step2b(modified_word) if not self.suffix_verb_step2b_success: diff --git a/nltk/test/unit/test_stem.py b/nltk/test/unit/test_stem.py index 5f359d4900..868c35b1f2 100644 --- a/nltk/test/unit/test_stem.py +++ b/nltk/test/unit/test_stem.py @@ -15,7 +15,7 @@ def test_arabic(self): this unit testing for test the snowball arabic light stemmer this stemmer deals with prefixes and suffixes """ - ar_stemmer = SnowballStemmer("arabic") + ar_stemmer = SnowballStemmer("arabic", True) assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب" assert ar_stemmer.stem("العربية") == "عرب" assert ar_stemmer.stem("فقالوا") == "قال" @@ -23,6 +23,8 @@ def test_arabic(self): assert ar_stemmer.stem("فالطالبات") == "طالب" assert ar_stemmer.stem("والطالبات") == "طالب" assert ar_stemmer.stem("الطالبون") == "طالب" + assert ar_stemmer.stem("اللذان") == "اللذان" + assert ar_stemmer.stem("من") == "من" def test_russian(self): # Russian words both consisting of Cyrillic @@ -54,11 +56,11 @@ def test_short_strings_bug(self): assert stemmer.stem("y's") == 'y' class PorterTest(unittest.TestCase): - + def _vocabulary(self): with closing(data.find('stemmers/porter_test/porter_vocabulary.txt').open(encoding='utf-8')) as fp: return fp.read().splitlines() - + def _test_against_expected_output(self, stemmer_mode, expected_stems): stemmer = PorterStemmer(mode=stemmer_mode) for word, true_stem in zip(self._vocabulary(), expected_stems): @@ -68,10 +70,10 @@ def _test_against_expected_output(self, stemmer_mode, expected_stems): word, true_stem, stemmer_mode, our_stem ) ) - + def test_vocabulary_martin_mode(self): """Tests all words from the test vocabulary provided by M Porter - + The sample vocabulary and output were sourced from: http://tartarus.org/martin/PorterStemmer/voc.txt http://tartarus.org/martin/PorterStemmer/output.txt @@ -84,14 +86,14 @@ def test_vocabulary_martin_mode(self): PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines() ) - + def test_vocabulary_nltk_mode(self): with closing(data.find('stemmers/porter_test/porter_nltk_output.txt').open(encoding='utf-8')) as fp: self._test_against_expected_output( PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines() ) - + def test_vocabulary_original_mode(self): # The list of stems for this test was generated by taking the # Martin-blessed stemmer from