forked from Jodiechou/L2norm-of-sense-embeddings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train-glove-or-word2vec-for-sense.py
122 lines (105 loc) · 3.8 KB
/
train-glove-or-word2vec-for-sense.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from glove import Corpus, Glove # creating a corpus object
from gensim.models import Word2Vec
import lxml.etree
import logging
def load_instances(train_path, keys_path):
    """Parse the XML of a split set and return its sentences as instance dicts.

    Args:
        train_path: Path to the corpus XML file; it is read sentence by
            sentence through ``read_xml_sents``.
        keys_path: Path to the gold key file; it is loaded through
            ``get_sense_mapping``.

    Returns:
        A list with one dict per sentence, in file order. Each dict holds:

        * ``'tokens_mw'``: the text of every child element of the sentence
          (one entry may contain several whitespace-separated words).
        * ``'lemmas'``, ``'pos'``, ``'id'``: the ``lemma``, ``pos`` and ``id``
          attributes of every child element (``None`` when absent).
        * ``'senses'``: for every child element, the sense-key list mapped to
          its ``id``, or ``None`` when the element has no ``id`` attribute.
        * ``'tokens'``: ``'tokens_mw'`` flattened into single words.
        * ``'tokenized_sentence'``: ``'tokens'`` joined with single spaces.
        * ``'idx_map_abs'``: ``[element_index, [token_index, ...]]`` pairs
          linking each child element to its positions in ``'tokens'``.
        * ``'idx'``: the 0-based index of the sentence in the file.

    Raises:
        KeyError: If an element carries an ``id`` that is missing from the
            key file.
    """
    sense_mapping = get_sense_mapping(keys_path)
    train_instances = []

    for sent_idx, sentence in enumerate(read_xml_sents(train_path)):
        inst = {'tokens': [], 'tokens_mw': [], 'lemmas': [], 'senses': [], 'pos': [], 'id': []}

        for e in sentence:
            elem_id = e.get('id')
            inst['tokens_mw'].append(e.text)
            inst['lemmas'].append(e.get('lemma'))
            inst['id'].append(elem_id)
            inst['pos'].append(e.get('pos'))
            # Only elements that carry an id are sense-annotated.
            inst['senses'].append(None if elem_id is None else sense_mapping[elem_id])

        # Split every (possibly multi-word) element once and reuse the result
        # both for the flat token list and for the index mapping below.
        split_tokens = [t.split() for t in inst['tokens_mw']]
        inst['tokens'] = [tok for parts in split_tokens for tok in parts]

        # Handling multi-word expressions: the mapping allows matching flat
        # token positions with the per-element features. Token positions are
        # absolute, i.e. offsets into inst['tokens'].
        idx_map_abs = []
        token_counter = 0
        for idx_group, parts in enumerate(split_tokens):
            next_counter = token_counter + len(parts)
            idx_map_abs.append([idx_group, list(range(token_counter, next_counter))])
            token_counter = next_counter

        inst['tokenized_sentence'] = ' '.join(inst['tokens'])
        inst['idx_map_abs'] = idx_map_abs
        inst['idx'] = sent_idx
        train_instances.append(inst)

    return train_instances
def read_xml_sents(xml_path):
    """Yield every ``<sentence>`` of a corpus XML file as a parsed element.

    The file is scanned line by line rather than parsed as a whole. Only
    lines that (after stripping whitespace) start with ``<sentence ``,
    ``<wf ``, ``<instance `` or ``</sentence>`` are used, so each of these
    tags has to sit on its own line. All other lines are ignored.

    Token or closing lines that appear while no sentence is open are ignored
    as well, instead of failing on an unbound buffer or being attached to a
    sentence that was already emitted.

    Args:
        xml_path: Path to the XML file. It is read as UTF-8, the default
            encoding of XML documents.

    Yields:
        The element returned by ``lxml.etree.fromstring`` for each complete
        sentence, in file order.
    """
    token_prefixes = ('<wf ', '<instance ')
    sent_elems = None  # lines of the sentence currently open; None outside one

    with open(xml_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line.startswith('<sentence '):
                sent_elems = [line]
            elif sent_elems is None:
                # Not inside a sentence: there is nothing to attach this to.
                continue
            elif line.startswith(token_prefixes):
                sent_elems.append(line)
            elif line.startswith('</sentence>'):
                sent_elems.append(line)
                yield lxml.etree.fromstring(''.join(sent_elems))
                sent_elems = None
def get_sense_mapping(keys_path):
    """Load a gold key file into a mapping from instance id to sense keys.

    Every non-blank line is split on whitespace: the first field is the
    instance id and all remaining fields are the keys assigned to it. Blank
    lines are skipped.

    Args:
        keys_path: Path to the key file (read as UTF-8).

    Returns:
        A dict mapping each id to the list of key strings found on its line
        (an empty list when the line holds only an id). If an id occurs on
        several lines, the last line wins.
    """
    sensekey_mapping = {}
    with open(keys_path, encoding='utf-8') as keys_f:
        for line in keys_f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            id_, *keys = fields
            sensekey_mapping[id_] = keys
    return sensekey_mapping
if __name__ == '__main__':
    # Script entry point: load the sense-annotated corpus, turn every sentence
    # into the sequence of its sense keys and train sense embeddings on those
    # sequences (word2vec by default; the GloVe variant is kept commented out).
    import os

    # The root logger defaults to WARNING, so without this the INFO message
    # below would never be shown.
    logging.basicConfig(level=logging.INFO)
    logging.info("Loading Data........")

    wsd_fw_path = 'external/wsd_eval/WSD_Evaluation_Framework/'
    train_path = wsd_fw_path + 'Training_Corpora/SemCor/semcor.data.xml'
    keys_path = wsd_fw_path + 'Training_Corpora/SemCor/semcor.gold.key.txt'
    instances = load_instances(train_path, keys_path)

    # instantiate the corpus (only used by the optional GloVe section below)
    corpus = Corpus()

    # One training "sentence" per corpus sentence: the sense keys of all its
    # annotated tokens, in order. Tokens without a sense (None) are skipped.
    senses_in_instances = []
    for sent in instances:
        sense_temp = []
        for sense in sent['senses']:
            if sense is not None:
                sense_temp.extend(sense)
        senses_in_instances.append(sense_temp)
    sentences = senses_in_instances

    # ******* Train GloVe sense embeddings *******
    # # this will create the word co-occurence matrix
    # corpus.fit(sentences, window=10)
    # # instantiate the model
    # glove = Glove(no_components=300, learning_rate=0.05)
    # # and fit over the corpus matrix
    # glove.fit(corpus.matrix, epochs=30, no_threads=2)
    # # finally we add the vocabulary to the model
    # glove.add_dictionary(corpus.dictionary)
    # glove.save('data/glove-sense-embeddings.model')
    # *********

    # ******* Load GloVe sense embeddings *******
    # glove = Glove.load('data/glove-sense-embeddings.model')

    # ******* Train word2vec sense embeddings *******
    # NOTE: `size` and `wv.vocab` are the gensim 3.x names; gensim 4 renamed
    # them to `vector_size` and `wv.key_to_index`.
    # train model
    word2vec_model = Word2Vec(sentences, min_count=1, size=300)
    # summarize the loaded model
    print(word2vec_model)
    # summarize vocabulary
    words = list(word2vec_model.wv.vocab)
    print(words)
    # access vector for one word (through `.wv`; indexing the model object
    # itself is deprecated)
    print(word2vec_model.wv['mastermind%2:31:00::'])
    # save model; create the output directory first so that a missing
    # directory does not discard the trained model
    os.makedirs('data', exist_ok=True)
    word2vec_model.save('data/word2vec.sense.model.bin')
    # ***********

    # ******* Load word2vec sense embeddings *******
    # # load model
    # word2vec_model = Word2Vec.load('data/word2vec.sense.model.bin')
    # print(word2vec_model)
    # ***********