-
Notifications
You must be signed in to change notification settings - Fork 11
/
doc2vec.py
executable file
·174 lines (161 loc) · 8.45 KB
/
doc2vec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# import modules and set up logging
import gensim
import logging
import numpy as np
import cPickle as pkl
from corpus_utils import PatentCorpus, make_d2v_corpus
from plot_utils import plot_score_distr, calc_simcoef_distr, group_combis, calc_auc, make_combis
from nlputils.dict_utils import norm_dict
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def train_doc2vec(doc_corpus, corpus='patents', size=50):
# train the model
model = gensim.models.doc2vec.Doc2Vec(size=size, min_count=5, iter=18, seed=1, workers=1)
model.build_vocab(doc_corpus)
model.train(doc_corpus)
# pickle the entire model to disk, so we can load&resume training later
saven = "%s_dm_%i_min5_iter18.model" % (corpus, size)
print "saving model"
#pkl.dump(model, open("/home/lhelmers/Documents/doc2vec/models/%s"%saven,'wb'), -1)
pkl.dump(model, open("human_eval/models/%s_no_target_pats"%saven,'wb'), -1)
return model
def make_doc2vec_corpus(model, target_pat_corpus=False):
patfeats_d2v = {}
vecsize = len(model.docvecs[0])
# get doc vecs for training documents
for pid in model.docvecs.doctags.keys():
patfeats_d2v[pid] = norm_dict(dict(zip(range(vecsize), model.docvecs[pid])), 'length')
if target_pat_corpus:
# infer doc vecs for target patents
for pid, pat in target_pat_corpus.items():
patfeats_d2v[pid] = norm_dict(dict(zip(range(vecsize), model.infer_vector(pat))), 'length')
return patfeats_d2v
def corpus_to_patfeats(model, corpus, target_ids):
'''
Transform corpus into doc2vec feature vectors
Checks if the patents in the corpus are contained in the model
If so: take the learned document vector
otherwise: infer the vector
'''
patfeats_d2v = {}
vecsize = len(model.docvecs[0])
cont = 0
not_cont = 0
for pid, pat in corpus.items():
# check if the patents in the corpus are contained in the model
if pid in model.docvecs.doctags.keys():
patfeats_d2v[pid] = norm_dict(dict(zip(range(vecsize), model.docvecs[pid])), 'length')
cont+=1
else:
not_cont+=1
patfeats_d2v[pid] = norm_dict(dict(zip(range(vecsize), model.infer_vector(pat.lower().split()))), 'length')
for tid in target_ids:
patfeats_d2v[tid] = norm_dict(dict(zip(range(vecsize), model.infer_vector(corpus[tid].lower().split()))), 'length')
print cont, not_cont
return patfeats_d2v
def infer_patfeats(corpus, model):
patfeats_d2v = {}
vecsize = len(model.docvecs[0])
for pid, pat in corpus.items():
patfeats_d2v[pid] = norm_dict(dict(zip(range(vecsize), model.infer_vector(pat.lower().split()))), 'length')
return patfeats_d2v
def apply_d2v_sectionwise():
'''
Evaluate doc2vec for comparison of patent sections
'''
# define embedding size
size=50
# load data
model = pkl.load(open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model"))
corpus = np.load('../corpus/corpus_claims.npy').item()
target_ids = np.load('../corpus/target_ids.npy')
random_ids = np.load('../corpus/random_ids.npy')
dupl_ids = np.load('../corpus/dupl_ids.npy').item()
cited_ids = np.load('../corpus/cited_ids.npy').item()
id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
patfeats_d2v = infer_patfeats(corpus, model)
scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
id_dict, 'linear')
auc = calc_auc(scores['cited'], scores['random'])[2]
'''
# guarantee that scores range between 0 and 1
for label, vals in scores.items():
scores[label] = scores[label] - np.min(scores[label])
scores[label] = scores[label]/np.max(scores[label])
'''
plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
{'cited': scores['cited'], 'random': scores['random'], 'duplicate': scores['duplicate']},
auc, ['cited'], histdir='doc2vec_full%i_claims' %size, bins=50)
def apply_d2v_full_corpus():
target_ids = np.load('../corpus/target_ids.npy')
random_ids = np.load('../corpus/random_ids.npy')
dupl_ids = np.load('../corpus/dupl_ids.npy').item()
cited_ids = np.load('../corpus/cited_ids.npy').item()
id_dict = make_combis(target_ids, random_ids, cited_ids, dupl_ids)
pat_corpus = np.load('../corpus/corpus.npy').item()
for size in [50]:
pat_corpus, target_pat_corpus = make_d2v_corpus(target_ids)
#train model
model = pkl.load(open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model"))
#load model
#model = pkl.load(open('human_eval/patents_dm_50_min5_iter10.model'))
#patfeats_d2v = infer_patfeats(pat_corpus, model)
#patfeats_d2v = corpus_to_patfeats(model, pat_corpus, target_ids)
patfeats_d2v = make_doc2vec_corpus(model, target_pat_corpus)
#np.save('../doc2vec/patfeats_d2v%i.npy' %size, patfeats_d2v)
scores = calc_simcoef_distr(patfeats_d2v, ['cited', 'duplicate', 'random'],
id_dict, 'linear')
auc = calc_auc(scores['cited'], scores['random'])[2]
'''
# guarantee that scores range between 0 and 1
for label, vals in scores.items():
scores[label] = scores[label] - np.min(scores[label])
scores[label] = scores[label]/np.max(scores[label])
'''
plot_score_distr('human_eval', 'linear', ['random', 'cited', 'duplicate'],
{'cited': scores['cited'], 'random': scores['random'], 'duplicate': scores['duplicate']},
auc, ['cited'], histdir='doc2vec_full%i_no_target' %size, bins=50)
def apply_d2v_rel_corpus():
"""
Evaluate the doc2vec feature vectors on the smaller corpus for
cited/random and relevant/irrelevant labellings
"""
# load text
#pat_corpus = PatentCorpus()
#pat_corpus.mode = 'd2v'
#list(pat_corpus)
combis = np.load('human_eval/corpus_info/combis.npy')
target_ids = list(set([comb[0] for comb in combis]))
#pat_corpus = np.load('human_eval/doc2vec/corpus.npy')
#pat_corpus = [gensim.models.doc2vec.TaggedDocument(a[0], a[1]) for a in pat_corpus if a[1][0] not in target_ids]
## Plot AUC
# load model trained on entire patent corpus
model = pkl.load(open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model"))
#model = pkl.load(open("../doc2vec/models/full_pat_corpus_dm_50_min5_iter18.model"))
#model = train_doc2vec(pat_corpus)
# get doc2vec feature vectors
single_pat_corpus = np.load('human_eval/corpus_info/single_pat_corpus.npy').item()
patfeats_d2v = infer_patfeats(single_pat_corpus, model)
#patfeats_d2v = corpus_to_patfeats(model, single_pat_corpus, [])
#patfeats_d2v = make_doc2vec_corpus(model, single_pat_corpus, target_ids)
pat_ids = np.load('human_eval/corpus_info/pat_ids.npy')
binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
for simcoef in ['linear']:
binary_scores = calc_simcoef_distr(patfeats_d2v, ['random', 'cited'],
{'cited': binary_sim_combis, 'random': binary_diff_combis},
simcoef)
human_scores = calc_simcoef_distr(patfeats_d2v, ['irrelevant', 'relevant'],
{'relevant': human_sim_combis, 'irrelevant': human_diff_combis},
simcoef)
binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
plot_score_distr('human_eval', simcoef, ['random', 'cited'],
{'cited': binary_scores['cited'], 'random': binary_scores['random']},
binary_auc, ['cited'], histdir='doc2vec_full50_rel_corp', bins=20)
plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
{'relevant': human_scores['relevant'], 'irrelevant': human_scores['irrelevant']},
human_auc, ['relevant'], histdir='doc2vec_full50_rel_corp', bins=20)
if __name__ == "__main__":
apply_d2v_rel_corpus()