-
Notifications
You must be signed in to change notification settings - Fork 3
/
stemmer.py
59 lines (45 loc) · 2.08 KB
/
stemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from nltk.stem.porter import *
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from console_progressbar import ProgressBar
class StemTokenizer(object):
    """Callable tokenizer that turns a document into a list of Porter stems.

    An instance can be called like a function with the raw document text.
    Each call tokenises and POS-tags the text, filters out unwanted tokens,
    stems what is left and advances a console progress bar by one document.
    """

    # Number of documents processed so far. It lives on the class, so it is
    # shared by every StemTokenizer instance created in the same process.
    counter = 0

    # POS tags (as produced by nltk.pos_tag) whose tokens are discarded:
    # CD = cardinal number, RB = adverb, CC = coordinating conjunction,
    # DT = determiner.
    _STOP_POS_TAGS = frozenset(['CD', 'RB', 'CC', 'DT'])

    # Lazily-populated cache of stopwords.words('english'). Loading the
    # corpus is comparatively expensive, so it is done once rather than
    # for every token.
    _stop_words = None

    def __init__(self, num_docs=None):
        """Create a tokenizer.

        Args:
            num_docs: Total number of documents that will be processed; used
                as the progress bar total. When it is None (or 0) no progress
                is printed, because a percentage cannot be computed.
        """
        self.num_docs = num_docs
        self.pb = ProgressBar(total=self.num_docs, suffix='Pre-processing documents', decimals=0, length=50, fill='█', zfill='-')

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop word list as a frozenset, loading it once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    def __call__(self, doc):
        """
        Takes as an input a document and returns a list of stems corresponding to the
        constituent words of that document.

        Filters out:
        1) Words that are a single character long (or empty)
        2) Words whose pos_tag is contained in the stop POS tag set
        3) Words that are contained in a stop word list, i.e. stopwords.words('english')
        4) Words whose stem is contained in a stop word list, i.e. stopwords.words('english')

        Side effects: increments StemTokenizer.counter and, when num_docs was
        supplied, redraws the progress bar.
        """
        stop_words = self._english_stop_words()
        stemmer = PorterStemmer()
        stemmed_words = []

        # Tokenise, then POS-tag, the whole document in one pass each.
        for word, p_tag in pos_tag(word_tokenize(doc)):
            # Cheap rejections first, so only surviving tokens get stemmed.
            if len(word) <= 1 or p_tag in self._STOP_POS_TAGS or word in stop_words:
                continue
            stemmed_word = stemmer.stem(word)
            # A word can pass the raw stop word check and still stem to a
            # stop word, so the stem is checked as well.
            if stemmed_word not in stop_words:
                stemmed_words.append(stemmed_word)

        # One more document done.
        StemTokenizer.counter += 1
        # Without a known total the bar has nothing to compute a percentage
        # from, so progress reporting is skipped in that case.
        if self.num_docs:
            self.pb.print_progress_bar(StemTokenizer.counter)
        return stemmed_words