Commit bec594c

w2vcnn
1 parent 3bf4db9 commit bec594c

File tree

1 file changed: +204 -0 lines changed

W2VCNN.py

Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
import re

import gensim
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


###################################################
def remove_stopwords(data):
    # LOWERCASE, SPLIT INTO TOKENS AND DROP INDONESIAN STOPWORDS
    stop_words = set(stopwords.words('indonesian'))
    tokenized = []
    for sentence in data:
        sentence = sentence.lower().split()
        # print(sentence)
        x = [word.strip() for word in sentence if word not in stop_words]
        # print(x)
        tokenized.append(x)
    return tokenized


def remove_punc_emoji(data):
    # REMOVE EMOJI (ANY NON-ASCII CHARACTER)
    preprocess = [re.sub(r'[^\x00-\x7F]+', '', sentence) for sentence in data]
    # REMOVE ?
    preprocess2 = [re.sub(r'\?+', '', sentence) for sentence in preprocess]
    # REMOVE .
    preprocess3 = [re.sub(r'\.+', ' ', sentence) for sentence in preprocess2]
    # REMOVE HASHTAG
    preprocess4 = [re.sub(r'(#[^\s]+)+', '', sentence) for sentence in preprocess3]
    # COLLAPSE REPEATED SPACES (the original pattern r'' was empty and had no effect)
    preprocess5 = [re.sub(r'\s{2,}', ' ', sentence) for sentence in preprocess4]
    # REMOVE ,
    preprocess6 = [re.sub(r',', ' ', sentence) for sentence in preprocess5]

    return preprocess6


###################################################

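# A minimal usage sketch (hedged; the sample comment below is made up) showing the
# order in which the two helpers above are chained on a list of raw comments:
# sample = ["Halo... ini contoh komentar, dengan #tagar dan emoji"]
# print(remove_stopwords(remove_punc_emoji(sample)))
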
# OPEN TRAIN DATA USING PANDAS
train_data = pd.read_csv("Training_Jokowimundurlahbatch1.csv", sep=';', encoding='latin-1')
print(train_data.head())
print("Total rows: {0}".format(len(train_data)))
print(list(train_data))

# OPEN TEST DATA USING PANDAS
test_data = pd.read_csv('Test_data.csv', sep=';', encoding='latin-1')
print(test_data.head())
print("Total rows: {0}".format(len(test_data)))
print(list(test_data))

# Comments in train data
print("\nComments")
print("-----------")
Train_Comments = train_data.Comment
print(Train_Comments)
# print(row[0])

# Comments in test data
print("\nComments")
print("-----------")
Test_Comments = test_data.Comment
print(Test_Comments)

# Labels in train data
print("\nLabels")
print("-----------")
# labels = ['Label']
# train_labels = train_data[labels]

# train_labels = train_data.Label

labels = train_data.Label.unique()
dic = {}
for i, Label in enumerate(labels):
    dic[Label] = i
train_labels = train_data.Label.apply(lambda x: dic[x])
print(train_labels)

# PREPROCESSING TRAINING DATA
preprocessed_train_comment = remove_punc_emoji(Train_Comments)
print(preprocessed_train_comment)

# PREPROCESSING TEST DATA
preprocessed_test_comment = remove_punc_emoji(Test_Comments)
print(preprocessed_test_comment)

# LOWERCASE AND SPLIT SENTENCES INTO WORDS, ALSO REMOVE STOPWORDS
train_words = remove_stopwords(preprocessed_train_comment)  # TRAINING DATA
test_words = remove_stopwords(preprocessed_test_comment)  # TEST DATA

# INPUT TO WORD2VEC (size = embedding dimension, min_count = minimum number of times a
# word must appear to be learned, iter = number of passes over the sentences,
# sg=1 = skip-gram, window = context window size)
Embedding_dim = 300
train_word_model = gensim.models.Word2Vec(train_words, size=Embedding_dim, min_count=1, iter=10, sg=1, window=3)

# CHECK WORDS MOST SIMILAR TO 'bangsat'
# print(train_word_model.wv.most_similar(positive='bangsat'))

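# NOTE (hedged): size= and iter= are gensim 3.x parameter names; under gensim 4.x the
# same call would be written roughly as below (vocabulary lookups also move from
# wv.vocab to wv.key_to_index):
# train_word_model = gensim.models.Word2Vec(train_words, vector_size=Embedding_dim,
#                                           min_count=1, epochs=10, sg=1, window=3)
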
# how many features should the tokenizer extract
features = 500
tokenizer = Tokenizer(num_words=features)

# fit the tokenizer on our text
tokenizer.fit_on_texts(train_words)

# get all words that the tokenizer knows
word_index = tokenizer.word_index

# put the tokens in a matrix
X = tokenizer.texts_to_sequences(train_words)
X = pad_sequences(X)

# EMBED WORD2VEC VOCABULARY TO MATRIX
# Row i of the matrix must hold the vector of the word whose tokenizer id is i.
# The original loop over train_word_model.wv.vectors used Word2Vec's own ordering,
# which does not match the tokenizer's word_index, so the lookup is done by word here.

# MAX_NB_WORDS = 100000
# nb_words = min(MAX_NB_WORDS, len(train_word_model.wv.vocab))+1
# embedding_matrix = np.zeros((nb_words, Embedding_dim))

embedding_matrix = np.zeros((len(word_index) + 1, Embedding_dim))
for word, i in word_index.items():
    if word in train_word_model.wv.vocab:
        embedding_matrix[i] = train_word_model.wv[word]
print(train_word_model.wv.vocab)

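# Optional sanity check (hedged sketch): for any token the tokenizer knows, its row in
# embedding_matrix should now equal its Word2Vec vector. 'jokowi' is only an assumed
# example token; substitute any word that actually occurs in train_words.
# w = 'jokowi'
# assert np.allclose(embedding_matrix[word_index[w]], train_word_model.wv[w])
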
# prepare the labels
y = pd.get_dummies(train_labels)

# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

########################################################################################################################
model = Sequential()
# EMBEDDING LAYER INITIALISED WITH THE WORD2VEC MATRIX AND FROZEN DURING TRAINING
model.add(
    Embedding(embedding_matrix.shape[0], Embedding_dim, input_length=X.shape[1], weights=[embedding_matrix],
              trainable=False))

model.add(Dropout(0.2))

model.add(Conv1D(300, 3, padding='valid', activation='relu', strides=2))
model.add(Conv1D(150, 3, padding='valid', activation='relu', strides=2))
model.add(Conv1D(75, 3, padding='valid', activation='relu', strides=2))

# model.add(Conv1D(filters=16, kernel_size=3, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=5, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=8, activation='relu'))

# model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))

# model.add(Conv2D(filters=100, kernel_size=8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Conv2D(filters=100, kernel_size=4, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Conv2D(filters=100, kernel_size=5, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

model.add(MaxPooling1D(pool_size=2))
# model.add(MaxPooling2D(pool_size=2))

model.add(Flatten())
model.add(Dropout(0.2))
# model.add(Dense(10, activation='relu'))

model.add(Dense(150, activation='sigmoid'))

model.add(Dropout(0.2))
# model.add(Dense(1, activation='sigmoid'))

model.add(Dense(3, activation='sigmoid'))

# The output layer uses softmax to match the categorical_crossentropy loss below;
# the sigmoid variant is kept for reference.
model.add(Dense(y.shape[1], activation="softmax"))
# model.add(Dense(y.shape[1], activation="sigmoid"))

print(model.summary())
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['acc'])
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=['acc'])
# model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae', 'acc'])
#############################################

# EVALUATION TEST
batch = 128
epo = 2
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch, epochs=epo)

evaluate = model.evaluate(X_test, y_test, verbose=0)
print('\nTest Loss:', evaluate[0])
print('Test Accuracy:', evaluate[1])
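# Optional (hedged sketch): per-class precision/recall on the held-out split, assuming
# scikit-learn is available; y_test is one-hot, so both sides are argmax'ed first.
# from sklearn.metrics import classification_report
# print(classification_report(np.argmax(y_test.values, axis=1),
#                             np.argmax(model.predict(X_test), axis=1)))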
###############################################

# REAL DATA TEST
sequences_test = tokenizer.texts_to_sequences(test_words)
X_real_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])
# X_real_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
y_pred = model.predict(X_real_test)
# Map the most probable class index back to its original label name and write that out
# (dic has no 'Label' key, so the original y_pred[:, dic['Label']] lookup would fail).
inv_dic = {i: Label for Label, i in dic.items()}
predicted_labels = [inv_dic[i] for i in np.argmax(y_pred, axis=1)]
to_submit = pd.DataFrame(index=test_data.Id, data={'Label': predicted_labels})
to_submit.to_csv('submit.csv')
##############################################
