-
Notifications
You must be signed in to change notification settings - Fork 11
/
word2vec_app.py
executable file
·101 lines (93 loc) · 4.94 KB
/
word2vec_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# import modules and set up logging
import word2vec
import logging
import numpy as np
import cPickle as pkl
import gensim
from corpus_utils import PatentCorpus, make_w2v_corpus
#from plot_utils import plot_score_distr, calc_simcoef_distr, group_combis, calc_auc
from nlputils.preprocessing import features2mat
from nlputils.simcoefs import compute_sim
from sklearn.preprocessing import normalize
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def train_word2vec(pat_corpus, corpus='patents', seed=1, embed_dim=300):
"""Train the word2vec model"""
# train the skipgram model; default window=5
model = word2vec.Word2Vec(pat_corpus, mtype='sg', hs=1, neg=13, embed_dim=embed_dim, seed=seed)
# delete the huge stupid table again
model.table = None
# pickle the entire model to disk, so we can load&resume training later
saven = "%s_sg_%i_hs0_neg13_seed%i.model" % (corpus, embed_dim, seed)
print "saving model"
pkl.dump(model, open("human_eval/models/%s"%saven,'wb'), -1)
return model
def embed_features(model, patfeats, pat_ids):
"""
Combine BOW-features with word2vec embedding
Input:
- model: name of the previously trained w2v model
where: row 10 of model contains word x --> x = model.index2word[10]
row i contains embedding for word x --> i = model.vocab[x].index
- patfeats: BOW-features calculated during regression
- pat_ids: IDs of the patents in the corpus
Returns:
- featmat_norm: normalized dot product of BOW-features and w2v-embeddings
- featurenames: the embedded words
"""
# normalized or not normalized matrix containing w2v embeddings: nwords x 200
#w2v_embeddings = model.syn0
w2v_embeddings = model.syn0norm
# make feature matrix: ndocs x nwords
featmat, featurenames = features2mat(patfeats, pat_ids, featurenames=model.index2word)
# make normalized dot product of BOW- and w2v-matrix: ndocs x 200
featmat_norm = normalize(featmat.dot(w2v_embeddings), axis=1, norm='l2')
return featmat_norm, featurenames
if __name__ == "__main__":
# load text
sentences = make_w2v_corpus()
#sentences.mode = 'w2v'
#train_word2vec(sentences, 'all_patents')
#load model
model = gensim.models.Word2Vec(
sentences,
size=300,
window=10,
min_count=1)
model.train(sentences, total_examples=len(sentences), epochs=10)
pkl.dump(model, open("human_eval/models/patents_all_sg_200_hs0_neg13_seed1.model",'wb'), -1)
#model = pkl.load(open('human_eval/models/patents_sg_200_hs0_neg13_seed1.model'))
# load patfeats and ids saved while performing regression
pat_ids = np.load('human_eval/corpus_info/pat_ids.npy')
patfeats = np.load('human_eval/corpus_info/patfeats_human_eval.npy').item()
#patfeats = np.load('human_eval/corpus_info/patfeats_abstract.npy').item()
combis = np.load('human_eval/corpus_info/combis.npy')
binary_label_pairs = np.load('human_eval/corpus_info/binary_label_pairs.npy').item()
human_label_pairs = np.load('human_eval/corpus_info/human_label_pairs.npy').item()
# make word2vec embedded feature matrix
featmat_w2v, featurenames = embed_features(model, patfeats, pat_ids)
#transform feature matrix into dict
patfeats_w2v = {}
for i, pid in enumerate(pat_ids):
patfeats_w2v[pid] = dict(zip(featurenames, featmat_w2v[i,:]))
## Plot AUC
binary_sim_combis, binary_diff_combis = group_combis(binary_label_pairs)
human_sim_combis, human_diff_combis = group_combis(human_label_pairs)
for simcoef in ['linear', 'jaccard']:
binary_scores = calc_simcoef_distr(patfeats_w2v, ['random', 'cited'],
{'cited': binary_sim_combis, 'random': binary_diff_combis},
simcoef)
human_scores = calc_simcoef_distr(patfeats_w2v, ['irrelevant', 'relevant'],
{'relevant': human_sim_combis, 'irrelevant': human_diff_combis},
simcoef)
binary_auc = calc_auc(binary_scores['cited'], binary_scores['random'])[2]
human_auc = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[2]
binary_aps = calc_auc(binary_scores['cited'], binary_scores['random'])[3]
human_aps = calc_auc(human_scores['relevant'], human_scores['irrelevant'])[3]
print(binary_aps, human_aps)
print(binary_auc, human_auc)
plot_score_distr('human_eval', simcoef, ['random', 'cited'],
{'cited': binary_scores['cited'], 'random': binary_scores['random']},
binary_auc, ['cited'], histdir='w2v_sg_200', bins=10)
plot_score_distr('human_eval', simcoef, ['irrelevant', 'relevant'],
{'relevant': human_scores['relevant'], 'irrelevant': human_scores['irrelevant']},
human_auc, ['relevant'], histdir='w2v_sg_200', bins=10)