# -*- coding: utf-8 -*-
"""
Created on Tue Jun 11 07:08:08 2019
@author: Sreeju
"""
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
import re
import os
# read every review file in the train/pos and train/neg folders
files_pos = os.listdir('D:/sentiment/aclImdb/train/pos')
files_pos = [open('D:/sentiment/aclImdb/train/pos/' + f, 'r', encoding='utf8').read() for f in files_pos]
files_neg = os.listdir('D:/sentiment/aclImdb/train/neg')
files_neg = [open('D:/sentiment/aclImdb/train/neg/' + f, 'r', encoding='utf8').read() for f in files_neg]
print(len(files_pos), len(files_neg))
all_words = []
documents = []
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # a set makes membership tests fast
# Penn Treebank tag prefixes: J is adjective, R is adverb, V is verb
allowed_word_types = ["J"]  # keep only adjectives
for p in files_pos:
    # create a (review, label) tuple for each positive review
    documents.append((p, 'pos'))
    # remove punctuation
    cleaned = re.sub(r'[^a-zA-Z\s]', '', p)
    # tokenize
    tokenized = word_tokenize(cleaned)
    # remove stop words
    stopped = [w for w in tokenized if w not in stop_words]
    # part-of-speech tag each word
    pos = nltk.pos_tag(stopped)
    # collect every word whose tag starts with an allowed word type (adjectives)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
for p in files_neg:
    # create a (review, label) tuple for each negative review
    documents.append((p, 'neg'))
    # remove punctuation
    cleaned = re.sub(r'[^a-zA-Z\s]', '', p)
    # tokenize
    tokenized = word_tokenize(cleaned)
    # remove stop words
    stopped = [w for w in tokenized if w not in stop_words]
    # part-of-speech tag each word
    neg = nltk.pos_tag(stopped)
    # collect every word whose tag starts with an allowed word type (adjectives)
    for w in neg:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
# create a frequency distribution of all the collected words
all_words = nltk.FreqDist(all_words)
import matplotlib.pyplot as plt
all_words.plot(30, cumulative=False)
plt.show()
# keep the 1000 most frequent words as features
# (FreqDist.keys() is not ordered by count, so use most_common)
word_features = [w for (w, c) in all_words.most_common(1000)]
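# Illustrative peek (not in the original script): print a few of the
# selected feature words to confirm they look like adjectives.
print(word_features[:10])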
# build a feature dictionary for a single review: keys are the words in
# word_features, and each value is True or False depending on whether
# that word appears in the review
def find_features(document):
    words = set(word_tokenize(document))  # set for fast membership tests
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features
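# Quick sanity check (illustrative; the toy review text is an assumption):
# only the feature words that occur in the review should come back True.
sample = find_features("A wonderful movie with terrible pacing")
print([w for w, present in sample.items() if present])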
# create features for each review
featuresets = [(find_features(rev), category) for (rev, category) in documents]
# shuffle so positive and negative examples are mixed before splitting
random.shuffle(featuresets)
training_set = featuresets[:800]
testing_set = featuresets[800:]
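# Sanity check (illustrative): the full IMDB train folder holds 25,000
# reviews, so this split trains on only 800 and tests on the rest.
print(len(training_set), len(testing_set))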
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("NaiveBayes Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*1000)
classifier.show_most_informative__features(15)
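# The pickle import at the top suggests the trained model was meant to be
# saved; a minimal sketch (the file name is an assumption):
with open('naivebayes_classifier.pickle', 'wb') as save_file:
    pickle.dump(classifier, save_file)
# ...and reloaded later without retraining:
with open('naivebayes_classifier.pickle', 'rb') as load_file:
    classifier = pickle.load(load_file)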
# train the scikit-learn models through NLTK's SklearnClassifier wrapper
# (train() returns the wrapper itself, so it can be passed to accuracy())
MNB_clf = SklearnClassifier(MultinomialNB())
mnb_cls = MNB_clf.train(training_set)
print("MultinomialNB Classifier accuracy percent:", (nltk.classify.accuracy(mnb_cls, testing_set))*100)
BNB_clf = SklearnClassifier(BernoulliNB())
bnb_cls = BNB_clf.train(training_set)
print("BernoulliNB Classifier accuracy percent:", (nltk.classify.accuracy(bnb_cls, testing_set))*100)
LogReg_clf = SklearnClassifier(LogisticRegression())
logReg_cls = LogReg_clf.train(training_set)
print("LogisticRegression Classifier accuracy percent:", (nltk.classify.accuracy(logReg_cls, testing_set))*100)
SGD_clf = SklearnClassifier(SGDClassifier())
sgd_cls = SGD_clf.train(training_set)
print("SGD Classifier accuracy percent:", (nltk.classify.accuracy(sgd_cls, testing_set))*100)
SVC_clf = SklearnClassifier(SVC())
svc_cls = SVC_clf.train(training_set)
print("SVC Classifier accuracy percent:", (nltk.classify.accuracy(svc_cls, testing_set))*100)