#classifyingTextENSEMBLE.py

#written by Viktor Zenkov in 2018

#this file classifies using the Ensemble model, making predictions based on the predictions from the text and hex models

from __future__ import print_function
from __future__ import absolute_import

import sys

import numpy as np
from numpy.random import seed
from tensorflow import set_random_seed

import matplotlib.pyplot as plt
import math
import time
import datetime

from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.models import load_model
from keras.layers import Input, Dense, Embedding
from keras.layers import LSTM, Dropout, Activation
from keras.layers import concatenate
from keras.utils import to_categorical, print_summary
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix
import itertools

#show how the script was invoked (sys.argv[0]) and the first entry of the module search path (sys.path[0])
for banner, value in zip(('sys.argv[0]: {0!r}', 'sys.path[0]: {0!r}'),
                         (sys.argv[0], sys.path[0])):
    print(banner.format(value))
 
#this plots a confusion matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix `cm` as an annotated heat map on the current pyplot figure.

    The (optionally normalized) matrix is also printed to stdout. This function
    only draws; it does not create a figure and does not call plt.show().

    Args:
        cm: 2-D NumPy array of counts, indexed as cm[true_label, predicted_label].
        classes: sequence of tick labels, one per row/column of `cm`.
        normalize: if True, every row is divided by its row total so that each
            row shows fractions instead of counts.
        title: title drawn above the plot.
        cmap: matplotlib colormap used for the heat map.

    Returns:
        None.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        #a class without any true samples has a row total of 0; such rows are left at 0
        #rather than divided by zero, which would fill them with NaN and break cm.max() below
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    #every cell is labelled with two decimals, whether or not the matrix was normalized
    fmt = '.2f'
    #cells darker than half of the maximum get white text so the number stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color='white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)


import classifyingFunctions


#settings used below for both the number (text) data and the hex data
batch_size = 16
maxlen = 15000  # the sequence length: how many integers from each file are kept
max_features = 5000  # how many unique integers are kept

#the two previously trained base models: modelN for the number (text) data, modelH for the hex data
text_model_file = 'model2018-11-021541168475.h5'
hex_model_file = 'model2018-10-301540939919.h5'
modelN = load_model(text_model_file)
modelH = load_model(hex_model_file)

#Load the number (text) data set, pad every sequence to maxlen, and one-hot encode the labels.
print('Loading number data...')

#num_words' default is None, which is taken to mean that all unique integers should be kept.
#Per the original author's note, all integers greater than max_features get turned into 2's,
#so there's max_features number of unique "words" (behavior of classifyingFunctions.load_data; not visible in this file).
#numOrHex=True presumably selects the number data rather than the hex data - TODO confirm against classifyingFunctions.
(x_train_N, y_train_N), (x_test_N, y_test_N) = classifyingFunctions.load_data(num_words=max_features,numOrHex=True)
print(len(x_train_N), 'train sequences')
print(len(x_test_N), 'test sequences')

#this makes all the sequences be exactly maxlen integers long
print('Pad sequences (samples x time)')
x_train_N = sequence.pad_sequences(x_train_N, maxlen=maxlen)
x_test_N = sequence.pad_sequences(x_test_N, maxlen=maxlen)
print('x_train_N shape:', x_train_N.shape)
print('x_test_N shape:', x_test_N.shape)

#NOTE(review): this binds a second name to the SAME object as y_test_N; it is not a copy.
#Assuming load_data returns NumPy arrays (TODO confirm), the in-place "-= 1" below also shifts
#y_test_N_ld to 0..8. The confusion matrix at the end of the file compares y_test_N_ld with
#0-based argmax predictions, so do not turn this into a copy or move it without adjusting that code.
y_test_N_ld = y_test_N

#We have classes from 1 to 9, which is 9 classes, but to_categorical will make an array with spots from 0 to max_class, so we subtract 1 such that our classes are 0 to 8 and we can use 9 classes.
y_train_N -= 1
y_test_N -= 1

#to_categorical replaces each number between 0 and 8 with a 9-length array of 0's except a 1 in the place of the number.
#These assignments rebind y_train_N / y_test_N to new arrays; y_test_N_ld keeps pointing at the integer labels.
y_train_N = to_categorical(y_train_N, 9)
y_test_N = to_categorical(y_test_N, 9)

print('y_train_N shape:', y_train_N.shape)
print('y_test_N shape:', y_test_N.shape)


#we also need the hex data: load it, pad every sequence to maxlen, and one-hot encode the labels
print('Loading hex data...')

#numOrHex=False presumably selects the hex data rather than the number data - TODO confirm against classifyingFunctions
(x_train_H, y_train_H), (x_test_H, y_test_H) = classifyingFunctions.load_data(num_words=max_features,numOrHex=False)
print(len(x_train_H), 'train sequences')
print(len(x_test_H), 'test sequences')

#this makes all the hex sequences be exactly maxlen integers long
print('Pad sequences (samples x time)')
x_train_H = sequence.pad_sequences(x_train_H, maxlen=maxlen)
x_test_H = sequence.pad_sequences(x_test_H, maxlen=maxlen)
#report the shapes of the hex arrays that were just padded (these lines used to print the number-data shapes again)
print('x_train_H shape:', x_train_H.shape)
print('x_test_H shape:', x_test_H.shape)

#NOTE(review): this is a second name for the same object as y_test_H, not a copy; assuming NumPy
#arrays (TODO confirm), the in-place "-= 1" below shifts it to 0..8 as well
y_test_H_ld = y_test_H

#shift the classes from 1..9 down to 0..8 so that to_categorical can use exactly 9 classes
y_train_H -= 1
y_test_H -= 1

#one-hot encode: each label becomes a 9-length array of 0's with a 1 in the place of the label
y_train_H = to_categorical(y_train_H, 9)
y_test_H = to_categorical(y_test_H, 9)

print('y_train_H shape:', y_train_H.shape)
print('y_test_H shape:', y_test_H.shape)


#evaluate each base model on its own test data and print the two values evaluate() returns
#(reported here as score and accuracy)
text_results = modelN.evaluate(x_test_N, y_test_N, batch_size=batch_size)
score_N, acc_N = text_results
print('Text Test score:', score_N)
print('Text Test accuracy:', acc_N)

hex_results = modelH.evaluate(x_test_H, y_test_H, batch_size=batch_size)
score_H, acc_H = hex_results
print('Hex Test score:', score_H)
print('Hex Test accuracy:', acc_H)


#predictions of the text model on its train and test data
y_probs_N = modelN.predict(x_train_N)
y_probs_testN = modelN.predict(x_test_N)

#predictions of the hex model on its train and test data
y_probs_H = modelH.predict(x_train_H)
y_probs_testH = modelH.predict(x_test_H)

#the ensemble's input: for every sample, the text probabilities followed by the hex probabilities,
#joined along axis 1
train_parts = [y_probs_N, y_probs_H]
test_parts = [y_probs_testN, y_probs_testH]
y_probs_C = np.concatenate(train_parts, axis=1)
y_probs_testC = np.concatenate(test_parts, axis=1)

#total number of entries in the combined training input
print(np.size(y_probs_C))


#the ensemble is a single softmax layer: 18 inputs (9 text and 9 hex probabilities) to 9 output probabilities
ensemble_layers = [
    Dense(9, activation='softmax', input_shape=(18,), name='aux_output'),
]
model = Sequential(ensemble_layers)

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

#stop training once val_acc has not improved by at least 0.005 for 28 epochs
es = EarlyStopping(monitor="val_acc", mode='max', min_delta=0.005, patience=28)

#we train. This usually runs very quickly
#NOTE(review): the targets are the hex labels (y_train_H / y_test_H); this presumably relies on the
#text and hex data sets listing the same samples in the same order - TODO confirm
print('Train...')
fit_options = dict(batch_size=batch_size,
                   epochs=100,
                   validation_split=0.15,
                   callbacks=[es])
model.fit(y_probs_C, y_train_H, **fit_options)

#the saved file name is 'model' + today's date + the current Unix time in whole seconds + '.h5'
date_stamp = str(datetime.date.today())
epoch_stamp = str(math.floor(time.time()))
model.save('model' + date_stamp + epoch_stamp + '.h5')

ensemble_results = model.evaluate(y_probs_testC, y_test_H, batch_size=batch_size)
score, acc = ensemble_results
print('Test score:', score)
print('Test accuracy:', acc)


#the rest of this is for the confusion matrix

y_probs = model.predict(y_probs_testC)

#the predicted label of each test sample is the index of its largest probability;
#this is the 1D list of predicted labels that confusion_matrix takes as input
y_pred_ld = list(np.argmax(y_probs, axis=1))

#NOTE(review): the predictions are 0-based indices, so y_test_N_ld must hold 0-based labels too.
#It does only because it aliases the array that "y_test_N -= 1" shifted in place earlier in this file - confirm before changing either.
cnf_matrix = confusion_matrix(y_test_N_ld, y_pred_ld)
print(cnf_matrix)

#normalize every row to fractions of that true class; a row whose total is 0 (a label that only
#appears among the predictions) is left at 0 instead of being divided by zero into NaN
row_totals = cnf_matrix.sum(axis=1)[:, np.newaxis]
cnf_matrix = np.divide(cnf_matrix.astype('float'), row_totals,
                       out=np.zeros(cnf_matrix.shape, dtype='float'),
                       where=row_totals != 0)
print(cnf_matrix)

print_summary(model)


#optional plotting of the confusion matrix, left disabled by the original author
#plt.figure(figsize=(24,20))
#plt.subplot(221)
#plot_confusion_matrix(cnf_matrix,classes=range(1,10),normalize=False,title="Confusion matrix")
#plt.subplot(222)
#plot_confusion_matrix(cnf_matrix,classes=range(0,10),normalize=True,title="Confusion matrix")
#plt.show()
print("end of file")