# yelp rnn.py
# -*- coding: utf-8 -*-
"""
In this script we will work on yelp review data set and try to predict
reviews as 1 star or 5 stars using Recurrent Neural Networks
"""
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding,LSTM
# Load the Yelp reviews; 'yelp.csv' is read from the current working directory.
data = pd.read_csv('yelp.csv')

# Preview the first rows. A bare `data.head()` is only echoed in an interactive
# session, so it is printed explicitly to show the preview when run as a script.
print(data.head())
# Sample output from a recorded run (the wide frame wraps into three parts):
#   business_id date review_id stars
#   0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5
#   1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5
#   2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4
#   3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5
#   4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5
#   text type
#   0 My wife took me here on my birthday for breakf... review
#   1 I have no idea why some people give bad review... review
#   2 love the gyro plate. Rice is so good and I als... review
#   3 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review
#   4 General Manager Scott Petello is a good egg!!!... review
#   user_id cool useful funny
#   0 rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0
#   1 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0
#   2 0hT2KtfLiobPvh6cDC8JQg 0 1 0
#   3 uZetl9T0NcROGOyFfughhg 1 2 0
#   4 vYmM4KTsC8ZfQBg-j5MWkw 0 0 0

# Print the column summary (DataFrame.info writes to stdout by itself).
data.info()
# Sample output from a recorded run:
#   RangeIndex: 10000 entries, 0 to 9999
#   Data columns (total 10 columns):
#   business_id 10000 non-null object
#   date 10000 non-null object
#   review_id 10000 non-null object
#   stars 10000 non-null int64
#   text 10000 non-null object
#   type 10000 non-null object
#   user_id 10000 non-null object
#   cool 10000 non-null int64
#   useful 10000 non-null int64
#   funny 10000 non-null int64
#   dtypes: int64(4), object(6)
#
# Every column reports 10000 non-null values, so there is no missing data.

# Keep only the two extreme ratings: the task is 1 star versus 5 stars.
data_mod = data[(data['stars'] == 1) | (data['stars'] == 5)]

# Split into the feature (raw review text) and the label (star rating).
X = data_mod['text']
y = data_mod['stars']
# Encode the label for binary classification: 1 star -> 0, 5 stars -> 1.
# Any rating other than 1 maps to 1, exactly as before; after the 1-or-5 star
# filter that means 5 stars. The comparison is vectorised instead of running a
# Python lambda per row, and the result stays an int64 Series.
y = (y != 1).astype('int64')
# Turn the raw review text into fixed-length integer sequences the network can
# consume, and hold out a test set for evaluation.
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Split the raw texts BEFORE fitting the tokenizer, so the vocabulary is learnt
# from the training reviews only. Fitting on all reviews would leak test-set
# word statistics into the preprocessing. The fixed seed makes the split (and
# therefore the reported scores) repeatable from run to run.
texts_train, texts_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# num_words=20000 caps the vocabulary at the most frequent words; text is
# lower-cased, split on spaces and stripped of the punctuation in `filters`.
tz = Tokenizer(num_words=20000,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ",
               char_level=False)
tz.fit_on_texts(texts_train)


def _encode(texts):
    """Map texts to word-index sequences and force each to length 300.

    Shorter reviews are zero-padded at the front. Longer reviews are cut at
    the front, so the LAST 300 tokens are kept. These are the
    ``pad_sequences`` defaults, spelled out here because the behaviour is
    easy to misread. Words the tokenizer did not keep are dropped.
    """
    return sequence.pad_sequences(tz.texts_to_sequences(texts),
                                  maxlen=300,
                                  padding='pre',
                                  truncating='pre')


X_train = _encode(texts_train)
X_test = _encode(texts_test)
# X keeps its earlier meaning (all selected reviews, encoded and padded) for
# any code that still reads it.
X = _encode(X)
# --- Recurrent neural network ------------------------------------------------
# Layers, in order:
#   * Embedding - maps each word index to a dense 128-dimensional vector.
#     20000 is the vocabulary size and matches the tokenizer's num_words.
#   * LSTM      - Long Short-Term Memory layer with 128 units that reads the
#     review as a sequence; dropout and recurrent_dropout (both 0.2) are there
#     to limit overfitting on the training data.
#   * Dense     - single sigmoid unit, the output layer for the 0/1 label.
classifier = Sequential([
    Embedding(20000, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
classifier.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs in mini-batches of 25; the held-out set is passed as
# validation data so its loss/accuracy are reported after every epoch.
classifier.fit(
    X_train,
    y_train,
    batch_size=25,
    epochs=10,
    validation_data=(X_test, y_test),
)
# Last epoch of a recorded run:
#   Epoch 10/10
#   2737/2737 [==============================] - 58s 21ms/step
#   - loss: 0.0186 - acc: 0.9949 - val_loss: 0.4810 - val_acc: 0.9125
# i.e. an accuracy of 91.25% on the test set.
# Score the trained model on the held-out set. The model is compiled with
# metrics=['accuracy'], so the result holds the loss followed by the accuracy.
score = classifier.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)
# Output of a recorded run:
#   Test loss: 0.480989840669
#   Test accuracy: 0.912527798413