-
Notifications
You must be signed in to change notification settings - Fork 11
/
idf_regression.py
executable file
·276 lines (256 loc) · 13.9 KB
/
idf_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
from copy import deepcopy
from database.make_patent_db import load_session, Patent
from corpus_utils import PatentCorpus
from plot_utils import plot_score_distr, calc_simcoef_distr, group_combis, calc_auc
from nlputils.preprocessing import FeatureTransform, features2mat
from nlputils.dict_utils import norm_dict
from nlputils.simcoefs import compute_sim
from sklearn.metrics import auc, roc_curve
def pointwise_dict_multiply(dict1, dict2):
    """
    Multiply two {key: float} dicts entry by entry.

    Inputs:
        - dict1, dict2: dicts with {key: float}
    Returns:
        - dict with {key: dict1[key]*dict2[key]} for every key that occurs
          in both dict1 and dict2; keys found in only one of them are dropped
    """
    product = {}
    # only keys shared by both inputs contribute to the result
    for key in set(dict1).intersection(dict2):
        product[key] = dict1[key] * dict2[key]
    return product
def postprocess_weights(idf_weights, zero, sqrt):
    """
    Post-process regression weights word by word.

    Inputs:
        - idf_weights: dict {word: weight} with the weights calculated
                       in the regression
        - zero: flag indicating if negative weights should be set to zero
        - sqrt: flag indicating if the square root should be taken
                of non-negative weights
    Returns:
        - weights: new dict {word: postprocessed weight}; the input dict
                   is not modified
    """
    weights = {}
    for word, weight in idf_weights.items():
        # logical and/or instead of bitwise &: a truthy non-bool flag such as
        # zero=2 would otherwise evaluate `True & 2 == 0` and be ignored
        if weight < 0 and zero:
            weights[word] = 0.
        elif weight >= 0 and sqrt:
            weights[word] = np.sqrt(weight)
        else:
            # negative weight with zero disabled, or sqrt disabled: keep as is
            weights[word] = weight
    return weights
# the four weight post-processing variants compared during model selection
_WEIGHT_TYPES = ('idf_weights', 'idf_weights_sqrt', 'idf_weights_zeroed', 'idf_weights_zeroed_sqrt')


def _collect_pairs(pair_ids, patfeats_pairs):
    """Return {(target_id, pid): pair features} restricted to the given id pairs."""
    subset = {}
    for target_id, pid in pair_ids:
        subset[(target_id, pid)] = patfeats_pairs[(target_id, pid)]
    return subset


def _regress_and_score(label, alpha, featmat_train, y_train, featurenames,
                       single_pat_corpus, testids, y_true, auc_lists):
    """Fit a Lasso on the training pairs and append one test AUC per weight variant to auc_lists."""
    clf = lm.Lasso(alpha=alpha, fit_intercept=True, random_state=13)
    clf.fit(featmat_train, y_train)
    # the fitted coefficients are now our word weights
    idf_weights = norm_dict(dict(zip(featurenames, clf.coef_)))
    weights = {
        'idf_weights': idf_weights,
        'idf_weights_zeroed': postprocess_weights(idf_weights, zero=True, sqrt=False),
        # sqrt=True here: with sqrt=False this variant was identical to 'idf_weights'
        'idf_weights_sqrt': postprocess_weights(idf_weights, zero=False, sqrt=True),
        'idf_weights_zeroed_sqrt': postprocess_weights(idf_weights, zero=True, sqrt=True),
    }
    for wtype in _WEIGHT_TYPES:
        # multiply patfeats with the learned weights
        ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
        ft.Dw = weights[wtype]
        patfeats_idf = ft.texts2features(single_pat_corpus)
        # similarity of every test pair is the prediction score
        y_pred = [compute_sim(patfeats_idf[combi[0]], patfeats_idf[combi[1]], 'linear')
                  for combi in testids]
        fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1)
        auc_val = auc(fpr, tpr)
        print("%s, alpha: %.5f, AUC: %.4f" % (label, alpha, auc_val))
        auc_lists['%.5f' % alpha][wtype].append(auc_val)


def model_selection(combis, patfeats_pairs, single_pat_corpus, binary_label_pairs, human_label_pairs):
    """
    Compare Lasso regularization strengths and weight post-processings by test AUC.

    For 5 random 70/30 train/test splits of the patent pairs, a Lasso regression
    is fitted on the training pairs for every alpha in np.arange(10)/100000.
    (note that this grid starts at alpha=0.0), once with the binary
    (cited/not cited) labels and once with the human labels. The fitted
    coefficients are used as word weights in each of the four post-processing
    variants, and the AUC of the resulting 'linear' similarity score is
    computed on the test pairs.

    Inputs:
        - combis: sequence of (target_id, pid) pairs
        - patfeats_pairs: dict {(target_id, pid): pairwise features}
        - single_pat_corpus: corpus passed to FeatureTransform.texts2features;
                             the result is indexed by the ids in combis
        - binary_label_pairs: dict {(target_id, pid): binary label}
        - human_label_pairs: dict {(target_id, pid): human score}; scores >= 0.5
                             count as positive when computing the AUC
    Returns:
        - nothing; the nested dict {'cited'|'human': {'%.5f' % alpha: {wtype: [AUC per split]}}}
          is saved to 'human_eval/regression/param_auc_dict.npy'
    """
    alphas = np.arange(10)/100000.
    param_auc_dict = {'cited': {}, 'human': {}}
    for alpha in alphas:
        for label in ('cited', 'human'):
            param_auc_dict[label]['%.5f' % alpha] = dict((wtype, []) for wtype in _WEIGHT_TYPES)
    n_train = int(np.ceil(len(combis)*0.7))
    ## model selection
    for n in range(5):
        print("testing for the %ith time" % n)
        # train/test split
        combis_perm = np.random.permutation(combis)
        trainids = combis_perm[:n_train]
        testids = combis_perm[n_train:]
        patfeats_pairs_train = _collect_pairs(trainids, patfeats_pairs)
        train_pair_ids = list(patfeats_pairs_train.keys())
        # transform into feature matrix (number of pairs) x (bow-dim)
        print("make feature matrix train")
        featmat_train, featurenames = features2mat(patfeats_pairs_train, train_pair_ids)
        # same for test set
        patfeats_pairs_test = _collect_pairs(testids, patfeats_pairs)
        test_pair_ids = list(patfeats_pairs_test.keys())
        print("make feature matrix test")
        # NOTE(review): the test matrix itself is never used below; the call is
        # kept only because it rebinds featurenames and features2mat is defined
        # elsewhere - presumably it returns the names unchanged, confirm and drop
        _, featurenames = features2mat(patfeats_pairs_test, test_pair_ids, featurenames)
        # get the corresponding label vectors for training
        y_binary_train = [binary_label_pairs[tid] for tid in train_pair_ids]
        y_human_train = [human_label_pairs[tid] for tid in train_pair_ids]
        # ground truth for the test pairs is the same for every alpha / weight type
        y_true_binary = [binary_label_pairs[(combi[0], combi[1])] for combi in testids]
        y_true_human = [int(human_label_pairs[(combi[0], combi[1])] >= 0.5) for combi in testids]
        for alpha in alphas:
            # perform the linear regression for binary (cited/not cited) labels
            print("perform regression for binary scoring")
            _regress_and_score('cited', alpha, featmat_train, y_binary_train, featurenames,
                               single_pat_corpus, testids, y_true_binary, param_auc_dict['cited'])
            # and for the human relevance scores
            print("perform regression for human scoring")
            _regress_and_score('human', alpha, featmat_train, y_human_train, featurenames,
                               single_pat_corpus, testids, y_true_human, param_auc_dict['human'])
    np.save('human_eval/regression/param_auc_dict.npy', param_auc_dict)
# Script entry point: learns word weights by regressing pairwise patent
# features on the cited/not cited and human labels, saves them together with
# the plain idf weights, and plots the weights and resulting score distributions.
if __name__ == "__main__":
    pat_corpus = PatentCorpus()
    pat_corpus.mode = 'regression'
    # ugly hack to invoke __iter__() function :-( :
    # (the attributes read below are presumably filled while iterating)
    list(pat_corpus)
    pat_ids = pat_corpus.pat_ids
    combis = pat_corpus.combis
    binary_label_pairs = pat_corpus.binary_label_pairs
    human_label_pairs = pat_corpus.human_label_pairs
    single_pat_corpus = pat_corpus.single_pat_corpus

    ## baseline: cosine similarity calculation with idf weights
    print("baseline: idf weights")
    # make features
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
    patfeats_org = ft.texts2features(single_pat_corpus)
    # save corpus and ids for training word2vec
    np.save('human_eval/corpus_info/pat_ids.npy', pat_ids)
    np.save('human_eval/corpus_info/combis.npy', combis)
    np.save('human_eval/corpus_info/binary_label_pairs.npy', binary_label_pairs)
    np.save('human_eval/corpus_info/human_label_pairs.npy', human_label_pairs)
    np.save('human_eval/corpus_info/patfeats_human_eval.npy', patfeats_org)
    np.save('human_eval/corpus_info/single_pat_corpus.npy', single_pat_corpus)
    # save the idf weights (copied, since ft is replaced by a new transform below)
    Dw_all = {}
    Dw_all['idf'] = deepcopy(ft.Dw)

    ## our case: weights learned by regression
    # transform into very basic features, i.e. w/o idf weights
    print("making patent pair features")
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=False, renorm=None)
    patfeats = ft.texts2features(single_pat_corpus)
    # make pairwise feature matrix: one row per (target_id, pid) pair, built from
    # the length-normalized pointwise product of the two patents' features
    print("making feature matrix")
    patfeats_pairs = {}
    binary_labels = []
    human_labels = []
    for combi in combis:
        target_id, pid = combi
        binary_labels.append(binary_label_pairs[(target_id, pid)])
        #if performing logistic regression, uncomment the following line and comment the one after that:
        #human_labels.append(int(human_label_pairs[(target_id, pid)] >=0.5))
        human_labels.append(human_label_pairs[(target_id, pid)])
        patfeats_pairs[(target_id, pid)] = norm_dict(pointwise_dict_multiply(patfeats[target_id], patfeats[pid]), 'length')
    featmat, featurenames = features2mat(patfeats_pairs, combis)

    # perform model selection
    #print "performing model selection"
    #model_selection(combis, patfeats_pairs, single_pat_corpus, binary_label_pairs, human_label_pairs)

    # perform regression for cited/not cited labels
    print("performing regression for cited/not cited")
    # if logistic regression should be performed, uncomment the following line
    #clf = lm.LogisticRegression(C=1.5, fit_intercept=True, solver='liblinear', random_state=13)
    clf = lm.Lasso(alpha=0.00005, fit_intercept=True, random_state=13)
    clf.fit(featmat, binary_labels)
    # the fitted coefficients are the word weights; negative ones are zeroed
    idf_weights = norm_dict(dict(zip(featurenames, clf.coef_)))
    weights = postprocess_weights(idf_weights, zero=True, sqrt=False)
    Dw_all['binary'] = weights

    # perform regression for relevant/not relevant (human) labels
    print("performing regression for relevant/not relevant")
    # if logistic regression should be performed, uncomment the following line
    #clf = lm.LogisticRegression(C=1.5, fit_intercept=True, solver='liblinear', random_state=13)
    clf = lm.Lasso(alpha=0.00006, fit_intercept=True, random_state=13)
    clf.fit(featmat, human_labels)
    idf_weights = norm_dict(dict(zip(featurenames, clf.coef_)))
    weights = postprocess_weights(idf_weights, zero=True, sqrt=False)
    Dw_all['human'] = weights

    # combine normally calculated idf weights with regression idf weights
    Dw_all['human_idf'] = pointwise_dict_multiply(Dw_all['idf'], Dw_all['human'])
    Dw_all['binary_idf'] = pointwise_dict_multiply(Dw_all['idf'], Dw_all['binary'])
    np.save('human_eval/regression/Dw_all.npy', Dw_all)

    ## plot all the word weights against each other to see the differences
    #assert set(Dw_all['idf'].keys()) == set(Dw_all['reg_sanity'].keys())
    wordlist = sorted(set(Dw_all['idf'].keys()) & set(Dw_all['binary'].keys()))
    assert wordlist == sorted(set(Dw_all['human'].keys()))
    idf_weights = [Dw_all['idf'][w] for w in wordlist]
    binary_weights = [Dw_all['binary'][w] for w in wordlist]
    human_weights = [Dw_all['human'][w] for w in wordlist]
    # (x values, y values, x label, y label, output file) for each scatter plot
    scatter_specs = [
        (idf_weights, human_weights, 'idf weights', 'human weights',
         'human_eval/regression/idf_human.pdf'),
        (idf_weights, binary_weights, 'idf weights', 'binary weights',
         'human_eval/regression/idf_binary.pdf'),
        # x axis is the human weights, matching the label and the file name
        # (this plot previously re-used the idf weights on the x axis)
        (human_weights, binary_weights, 'human weights', 'binary weights',
         'human_eval/regression/human_binary.pdf'),
    ]
    for x_vals, y_vals, x_name, y_name, out_path in scatter_specs:
        plt.figure()
        plt.scatter(x_vals, y_vals)
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.savefig(out_path)
        plt.close()

    # multiply patfeats with binary idf weights
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
    ft.Dw = Dw_all['binary_idf']
    patfeats_cited = ft.texts2features(single_pat_corpus)
    # multiply patfeats with human idf weights
    ft = FeatureTransform(identify_bigrams=False, norm=None, weight=True, renorm='length')
    ft.Dw = Dw_all['human_idf']
    patfeats_human = ft.texts2features(single_pat_corpus)

    # plot the distributions
    binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
    human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
    for simcoef in ['linear', 'jaccard']:
        binary_scores = calc_simcoef_distr(patfeats_cited, ['cited', 'random'],
                                           {'cited': binary_sim_combis, 'random': binary_diff_combis},
                                           simcoef)
        human_scores = calc_simcoef_distr(patfeats_human, ['relevant', 'not relevant'],
                                          {'relevant': human_sim_combis, 'not relevant': human_diff_combis},
                                          simcoef)
        # the third element of calc_auc's return value is used as the AUC
        binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
        human_auc = calc_auc(human_scores['relevant'], human_scores['not relevant'])[2]
        plot_score_distr('human_eval', simcoef, ['cited', 'random'],
                         {'cited': binary_scores['cited'], 'random': binary_scores['random']},
                         binary_auc, ['cited'], histdir='reg_idf', bins=50)
        plot_score_distr('human_eval', simcoef, ['relevant', 'not relevant'],
                         {'relevant': human_scores['relevant'], 'not relevant': human_scores['not relevant']},
                         human_auc, ['relevant'], histdir='reg_idf', bins=50)