-
Notifications
You must be signed in to change notification settings - Fork 10
/
Retrain_Sklearn.py
139 lines (110 loc) · 3.86 KB
/
Retrain_Sklearn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from sklearn.externals import joblib
import config
import time
import random
import numpy as np
from sklearn.tree import DecisionTreeClassifier
def extract_inputs(filename):
    """Read a comma-separated data file into feature rows and +/-1 labels.

    The first line of the file is always treated as a header and skipped.
    Every remaining non-blank line must consist of integer fields; the last
    field is the class label and the preceding fields are the features.

    Args:
        filename: Path of the comma-separated text file to read.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is a list of feature rows (each a list
        of ints) and ``Y`` is a list of labels: ``-1`` when the last field of
        the row is ``0``, otherwise ``1``.

    Raises:
        ValueError: If a field on a non-blank data line is not an integer.
    """
    X = []
    Y = []
    with open(filename, "r") as ins:
        # Skip the header row; the default keeps an empty file from raising.
        next(ins, None)
        for line in ins:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) would otherwise crash on int('').
                continue
            fields = line.split(',')
            # Build a real list so rows can be indexed and turned into a
            # 2-D numpy array on both Python 2 and Python 3.
            X.append([int(value) for value in fields[:-1]])
            Y.append(-1 if int(fields[-1]) == 0 else 1)
    return X, Y
# Load the labelled training set once. The plain-list form (X, Y) and the
# ndarray form (X_original, Y_original) are both read later in this module.
X, Y = extract_inputs("cleaned_train")
X_original, Y_original = np.array(X), np.array(Y)

# Size of the discrimination estimate: `num_trials` independent trials of
# `samples` random inputs each.
num_trials, samples = 100, 100

# Experiment settings come from the `config` module.
classifier_name = config.classifier_name
# NOTE(review): joblib.load unpickles the file at `classifier_name`; only
# point this at model files you trust.
current_model = joblib.load(classifier_name)
input_bounds = config.input_bounds
params = config.params
sensitive_param = config.sensitive_param
retraining_inputs = config.retraining_inputs
def retrain(X_original, Y_original, X_additional, Y_additional, model=None):
    """Fit a copy of a model on the original data plus additional samples.

    The model passed in (or the module-level ``current_model`` when ``model``
    is omitted) is never modified: a deep copy is fitted and returned. This
    lets a caller compare the retrained model against the previous one and
    keep the previous one if retraining made things worse.

    Args:
        X_original: 2-D array of the original training features.
        Y_original: 1-D array of the original training labels.
        X_additional: Array of extra feature rows to append; may be empty.
        Y_additional: Array of labels for ``X_additional``; may be empty.
        model: Optional estimator exposing ``fit(X, Y)``. Defaults to the
            module-level ``current_model``.

    Returns:
        A new estimator object fitted on the combined data.
    """
    import copy  # local import so this block does not depend on other edits

    if model is None:
        model = current_model

    X_additional = np.asarray(X_additional)
    Y_additional = np.asarray(Y_additional)
    if X_additional.size == 0:
        # np.concatenate rejects a 1-D empty array next to a 2-D feature
        # matrix, so with nothing to add just train on the original data.
        X, Y = X_original, Y_original
    else:
        X = np.concatenate((X_original, X_additional), axis=0)
        Y = np.concatenate((Y_original, Y_additional), axis=0)

    # Fit a copy: fitting `model` itself would overwrite the caller's
    # current best model in place.
    candidate = copy.deepcopy(model)
    candidate.fit(X, Y)
    return candidate
def get_random_input():
    """Draw one random input vector within the configured bounds.

    Each of the ``params`` features is drawn uniformly (inclusive) from
    ``input_bounds[i][0]`` .. ``input_bounds[i][1]``. The sensitive feature,
    at 1-based position ``sensitive_param``, is then forced to ``0``.

    Returns:
        A list of ``params`` ints.
    """
    # The generator is deliberately NOT reseeded here. Reseeding from
    # time.time() before every draw makes draws within the same clock tick
    # identical; the `random` module already seeds itself on import.
    x = [
        random.randint(input_bounds[i][0], input_bounds[i][1])
        for i in range(params)
    ]
    x[sensitive_param - 1] = 0
    return x
def evaluate_input(inp, model):
    """Tell whether flipping the sensitive feature changes the prediction.

    Two copies of ``inp`` are built that differ only in the sensitive
    feature (1-based position ``sensitive_param``), set to ``0`` in one and
    ``1`` in the other, and both are passed to ``model.predict``.

    Args:
        inp: Sequence of feature values; each is converted with ``int``.
        model: Object exposing ``predict`` on a 2-D array of samples.

    Returns:
        ``True`` if the two predictions differ, ``False`` otherwise.
    """
    base = [int(value) for value in inp]
    # Row 0 has the sensitive feature set to 0, row 1 has it set to 1.
    pair = np.array([base, base])
    pair[0, sensitive_param - 1] = 0
    pair[1, sensitive_param - 1] = 1

    out0, out1 = model.predict(pair)
    # Compare the predictions directly. The `abs(out0 + out1) == 0` form
    # is only right for -1/+1 labels and would report two identical `0`
    # predictions as different.
    return bool(np.all(out0 != out1))
def get_estimate(model):
    """Estimate how often the model's prediction depends on the sensitive feature.

    Runs ``num_trials`` trials. Each trial draws ``samples`` random inputs
    with ``get_random_input`` and records the fraction for which
    ``evaluate_input`` is true. The mean of the per-trial fractions is
    printed and returned.

    Args:
        model: Model object forwarded to ``evaluate_input``.

    Returns:
        The average of the per-trial fractions, as computed by ``np.average``.

    Raises:
        ZeroDivisionError: If ``samples`` is 0.
    """
    estimates = []
    for _trial in range(num_trials):
        disc_count = sum(
            1
            for _sample in range(samples)
            if evaluate_input(get_random_input(), model)
        )
        estimates.append(float(disc_count) / samples)

    # Compute the mean once; the parenthesised single-argument print
    # behaves the same on Python 2 and Python 3.
    average = np.average(estimates)
    print(average)
    return average
# Baseline: the estimate for the model as loaded, computed before any
# retraining. retrain_search() compares retrained models against this value.
current_estimate = get_estimate(current_model)
def retrain_search():
    """Retrain with growing amounts of extra data while the estimate improves.

    For each of 7 rounds, a percentage is drawn uniformly from
    ``[2**round, 2**(round + 1)]``; that percentage of the original training
    set size gives the number of rows sampled (without replacement) from
    the retraining file. A model is retrained on the original data plus
    those rows and its estimate is measured with ``get_estimate``. If the
    estimate is higher than the best so far, the search stops and the
    previous model is kept; otherwise the retrained model becomes the
    current one.

    Updates the module-level ``current_model`` and ``current_estimate``.

    Returns:
        The model with the lowest estimate found before the search stopped.

    Raises:
        ValueError: If a round needs more rows than the retraining file has.
    """
    global current_estimate
    global current_model
    import copy  # local import so this block does not depend on other edits

    X_retrain, Y_retrain = extract_inputs(retraining_inputs)
    retrain_len = len(X_retrain)

    for step in range(7):
        additive_percentage = random.uniform(pow(2, step), pow(2, step + 1))
        num_inputs_for_retrain = int((additive_percentage * len(X)) / 100)
        if num_inputs_for_retrain > retrain_len:
            raise ValueError('Number of inputs in retraining are not enough. Please add more inputs')
        if num_inputs_for_retrain == 0:
            # Nothing would be added this round (small training set); an
            # empty additional array cannot be concatenated with the 2-D
            # feature matrix, so move on to the next, larger percentage.
            continue

        # Sample distinct row indices in one call.
        chosen = random.sample(range(retrain_len), num_inputs_for_retrain)
        X_additional = [X_retrain[k] for k in chosen]
        Y_additional = [Y_retrain[k] for k in chosen]

        # Snapshot the best model so far: retrain() may refit the shared
        # model object in place, which would leave nothing to fall back to.
        best_so_far = copy.deepcopy(current_model)
        retrained_model = retrain(
            X_original, Y_original,
            np.array(X_additional), np.array(Y_additional),
        )
        retrained_estimate = get_estimate(retrained_model)

        if retrained_estimate > current_estimate:
            # Retraining made things worse: restore and return the snapshot.
            current_model = best_so_far
            return current_model

        current_model = retrained_model
        current_estimate = retrained_estimate

    return current_model
# Run the search as soon as the module is executed.
# NOTE(review): the returned model is discarded and nothing in the visible
# code saves it to disk -- confirm whether the result should be persisted.
retrain_search()