import re

import gensim
import numpy as np
import pandas as pd
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
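
# NOTE: stopwords.words('indonesian') requires the NLTK stopwords corpus to be available locally;
# it can be fetched once beforehand with nltk.download('stopwords').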


###################################################
def remove_stopwords(data):
    # Lower-case each sentence, split it into words and drop Indonesian stopwords.
    stop_words = set(stopwords.words('indonesian'))
    filtered_sentences = []
    for sentence in data:
        words = sentence.lower().split()
        filtered_sentences.append([word.strip() for word in words if word not in stop_words])
    return filtered_sentences


def remove_punc_emoji(data):
    # Clean each sentence: strip emoji/non-ASCII characters, question marks, hashtags and quotes,
    # and turn '.' and ',' into spaces.
    cleaned = []
    for sentence in data:
        sentence = re.sub(r'[^\x00-\x7F]+', '', sentence)  # remove emoji / non-ASCII characters
        sentence = re.sub(r'\?+', '', sentence)            # remove question marks
        sentence = re.sub(r'\.+', ' ', sentence)           # replace dots with a space
        sentence = re.sub(r'(#[^\s]+)+', '', sentence)     # remove hashtags
        sentence = re.sub(r"'", '', sentence)              # remove single quotes
        sentence = re.sub(r',', ' ', sentence)             # replace commas with a space
        cleaned.append(sentence)
    return cleaned
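
# Illustrative example with a made-up sentence (not taken from the dataset):
#   remove_punc_emoji(['Halo, apa kabar? #tes'])  ->  ['Halo  apa kabar ']
#   remove_stopwords(['Halo  apa kabar'])         ->  [['halo', 'kabar']], assuming 'apa' is in
#   NLTK's Indonesian stopword list; otherwise the word is kept.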


###################################################

# OPEN TRAIN DATA USING PANDAS
train_data = pd.read_csv("Training_Jokowimundurlahbatch1.csv", sep=';', encoding='latin-1')
print(train_data.head())
print("Total rows: {0}".format(len(train_data)))
print(list(train_data))

# OPEN TEST DATA USING PANDAS
test_data = pd.read_csv('Test_data.csv', sep=';', encoding='latin-1')
print(test_data.head())
print("Total rows: {0}".format(len(test_data)))
print(list(test_data))
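
# Both CSV files are assumed to be ';'-separated; the code below expects at least a 'Comment' and a
# 'Label' column in the training file and an 'Id' and a 'Comment' column in the test file.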

# Comment column in the train data
print("\nComments")
print("-----------")
Train_Comments = train_data.Comment
print(Train_Comments)

# Comment column in the test data
print("\nComments")
print("-----------")
Test_Comments = test_data.Comment
print(Test_Comments)

# Labels in the train data: map each distinct label value to an integer code
print("\nLabels")
print("-----------")
labels = train_data.Label.unique()
dic = {}
for i, Label in enumerate(labels):
    dic[Label] = i
train_labels = train_data.Label.apply(lambda x: dic[x])
print(train_labels)
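# The mapping depends on the order in which labels first appear in the CSV, e.g. something like
# {'negatif': 0, 'positif': 1, 'netral': 2}; the actual label names come from the file and are not known here.
print(dic)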

# PREPROCESSING TRAINING DATA
preprocessed_train_comment = remove_punc_emoji(Train_Comments)
print(preprocessed_train_comment)

# PREPROCESSING TEST DATA
preprocessed_test_comment = remove_punc_emoji(Test_Comments)
print(preprocessed_test_comment)

# LOWERCASE, SPLIT SENTENCES INTO WORDS AND REMOVE STOPWORDS
train_words = remove_stopwords(preprocessed_train_comment)  # TRAINING DATA
test_words = remove_stopwords(preprocessed_test_comment)    # TEST DATA

# TRAIN WORD2VEC (size = embedding dimension, min_count = minimum number of occurrences a word needs
# in order to be learned, iter = number of passes over the sentences, sg = 1 selects skip-gram).
# This uses the gensim 3.x API; in gensim 4.x these arguments are called vector_size and epochs.
Embedding_dim = 300
train_word_model = gensim.models.Word2Vec(train_words, size=Embedding_dim, min_count=1, iter=10, sg=1, window=3)

# CHECK SIMILARITY FOR THE WORD 'bangsat'
# print(train_word_model.wv.most_similar(positive='bangsat'))
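# Guarded version of the check above, so it does not raise if the word never appears in the corpus:
if 'bangsat' in train_word_model.wv.vocab:
    print(train_word_model.wv.most_similar(positive='bangsat'))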

# EMBED WORD2VEC VOCABULARY TO MATRIX
# The matrix itself is filled in right after the Keras tokenizer is fitted (below), so that row i holds
# the vector of the word whose tokenizer index is i; filling rows in Word2Vec's internal vocabulary
# order would not line up with the integer sequences produced by the tokenizer.
print(train_word_model.wv.vocab)


# how many of the most frequent words the tokenizer should keep
features = 500
tokenizer = Tokenizer(num_words=features)

# fit the tokenizer on the training words
tokenizer.fit_on_texts(train_words)

# all words the tokenizer knows, mapped to integer indices
word_index = tokenizer.word_index

# build the embedding matrix: row i holds the Word2Vec vector of the word with tokenizer index i
embedding_matrix = np.zeros((len(word_index) + 1, Embedding_dim))
for word, i in word_index.items():
    if word in train_word_model.wv.vocab:
        embedding_matrix[i] = train_word_model.wv[word]

# turn the token lists into padded integer sequences
X = tokenizer.texts_to_sequences(train_words)
X = pad_sequences(X)

# one-hot encode the integer labels
y = pd.get_dummies(train_labels)
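# y has one column per integer label code (0..n-1); the column order matches the codes in dic, which
# matters when decoding a predicted class index back to a label name further below.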

# split into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
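# With shuffle=False the last 10% of rows (in file order) become the validation set; if the CSV happens
# to be ordered by label, passing shuffle=True (with a fixed random_state) would give a more balanced split.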

########################################################################################################################
model = Sequential()
model.add(Embedding(embedding_matrix.shape[0], Embedding_dim, input_length=X.shape[1],
                    weights=[embedding_matrix], trainable=False))

model.add(Dropout(0.2))

model.add(Conv1D(300, 3, padding='valid', activation='relu', strides=2))
model.add(Conv1D(150, 3, padding='valid', activation='relu', strides=2))
model.add(Conv1D(75, 3, padding='valid', activation='relu', strides=2))

# model.add(Conv1D(filters=16, kernel_size=3, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=4, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=5, activation='relu'))
# model.add(Conv1D(filters=16, kernel_size=8, activation='relu'))

# model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=4, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))

# model.add(Conv2D(filters=100, kernel_size=8, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Conv2D(filters=100, kernel_size=4, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# model.add(Conv2D(filters=100, kernel_size=5, activation='relu', kernel_regularizer=regularizers.l2(0.01)))

model.add(MaxPooling1D(pool_size=2))
# model.add(MaxPooling2D(pool_size=2))

model.add(Flatten())
model.add(Dropout(0.2))
# model.add(Dense(10, activation='relu'))

model.add(Dense(150, activation='sigmoid'))

model.add(Dropout(0.2))
# model.add(Dense(1, activation='sigmoid'))

# Single output layer: one unit per class with softmax, which is what categorical_crossentropy expects.
model.add(Dense(y.shape[1], activation="softmax"))

print(model.summary())
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['acc'])
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=['acc'])
# model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mae', 'acc'])
#############################################

# TRAIN AND EVALUATE
batch = 128
epo = 2
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=batch, epochs=epo)

evaluate = model.evaluate(X_test, y_test, verbose=0)
print('\nTest Loss:', evaluate[0])
print('Test Accuracy:', evaluate[1])
###############################################

# PREDICT ON THE UNLABELLED TEST DATA
# Tokenize the preprocessed test words (not the raw DataFrame) and pad to the training sequence length.
sequences_test = tokenizer.texts_to_sequences(test_words)
X_real_test = pad_sequences(sequences_test, maxlen=X_train.shape[1])
y_pred = model.predict(X_real_test)

# Map each predicted class index back to its original label value and write the submission file.
inv_dic = {index: label for label, index in dic.items()}
predicted_labels = [inv_dic[i] for i in np.argmax(y_pred, axis=1)]
to_submit = pd.DataFrame(index=test_data.Id, data={'Label': predicted_labels})
to_submit.to_csv('submit.csv')
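
# Quick sanity check of the submission: the index should hold the test Ids and the Label column the
# decoded label values.
print(to_submit.head())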
##############################################