aux_word2vec.py
'''
Rasterfairy code usage adapted from this example:
https://github.com/Quasimondo/RasterFairy/blob/master/
examples/Raster%20Fairy%20Demo%201.ipynb
'''
from gensim.models import KeyedVectors    # word2vec model loading
from sklearn.decomposition import IncrementalPCA  # initial reduction
from sklearn.manifold import TSNE         # final reduction
import numpy as np                        # array handling
import warnings
import matplotlib.pyplot as plt
model_filename = 'word2vec.model'  # model file to reduce (currently unused; the GoogleNews path below is hardcoded)
model_name = 'music_word_2vec' # name for exporting files
num_dimensions = 2 # final num dimensions (2D, 3D, etc)
run_init_reduction = True # run an initial reduction with PCA?
init_dimensions = 300 # initial reduction before t-SNE
# use only most common words (helpful for big data sets)
only_most_common = True
num_common = 50 # how many words to filter to? (max 50k)
tagged_pos = False # is our model tagged with parts-of-speech?
common_filename = '50musiclabels.txt'
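
# expected format for common_filename (inferred from the loading loop
# below): one word per line, most frequent first, e.g.
#   guitar
#   piano
#   rock
# (the example words are illustrative, not from the actual file)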
def normalize_list(vals):
    '''
    normalize a list of values to the range -1 to 1
    (negatives are scaled by the minimum, positives by the maximum)
    input: list of numeric values
    output: normalized list
    '''
min_val = float(min(vals))
max_val = float(max(vals))
output = []
for val in vals:
if val < 0:
val = (val / min_val) * -1
elif val > 0:
val = val / max_val
output.append(val) # note if 0, stays the same :)
return output
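
# a quick worked example of the scaling above (illustrative values, not
# part of the original script): negatives divide by the minimum, positives
# by the maximum, so both extremes land exactly on -1 and +1:
#   normalize_list([-2.0, 0.0, 1.0, 4.0])  ->  [-1.0, 0.0, 0.25, 1.0]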
# ignore unicode warnings
warnings.filterwarnings('ignore', '.*Unicode.*')

# load the pretrained GoogleNews-vectors-negative300 model
print('loading model...')
model = KeyedVectors.load_word2vec_format(
    '<path to model>/magnatagatune/GoogleNews-vectors-negative300.bin',
    binary=True)
print('- done')
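
# note: this script follows the gensim 3.x API; on gensim >= 4 the vocab
# loop below should iterate model.key_to_index instead of model.vocab
# (load_word2vec_format itself is unchanged)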
# build the list of words/vectors to reduce
# if specified, keep only the most common words
print('converting model/words to numpy array...')
if only_most_common:
    print('- loading ' + str(num_common) + ' most common words...')
    most_common = []
    with open(common_filename) as f:
        for i, line in enumerate(f):
            if i >= num_common:  # stop once num_common words are collected
                break
            most_common.append(line.strip())

print('- creating list of words/vectors for reduction...')
if only_most_common:
    print(' - keeping only ' + str(num_common) + ' most common words')
vectors = []  # positions in vector space
labels = []   # keep track of words so we can label our data again later
for word in model.vocab:
    if only_most_common:
        if tagged_pos:
            parts = word.split('_')  # split on _ for POS-tagged words
            w = parts[0].lower()
            p = parts[1]
            if w in most_common:  # only keep words that pass the filter
                word = w + '_' + p
                try:
                    vectors.append(model[word])
                    labels.append(word)
                except KeyError:
                    pass
        else:
            if word in most_common:
                try:
                    vectors.append(model[word])
                    labels.append(word)
                except KeyError:
                    pass
    else:
        vectors.append(model[word])
        labels.append(word)
print('- found ' + str(len(labels)) + ' entities x ' + str(len(vectors[0])) + ' dimensions')

# convert both lists into numpy arrays for reduction
vectors = np.asarray(vectors)
labels = np.asarray(labels)
print('- done')
# if specified, reduce with IncrementalPCA first (down to a smaller
# number of dimensions before the final t-SNE reduction)
if run_init_reduction:
    # n_components can't exceed the number of samples or input dimensions
    init_dimensions = min(init_dimensions, len(vectors), len(vectors[0]))
    print('reducing to ' + str(init_dimensions) + 'D using IncrementalPCA...')
    ipca = IncrementalPCA(n_components=init_dimensions)
    vectors = ipca.fit_transform(vectors)
    print('- done')
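
    # optional sanity check (my addition; uses scikit-learn's standard
    # explained_variance_ratio_ attribute) -- reports how much of the
    # original variance the PCA step retained:
    # print('- retained variance: ' + str(ipca.explained_variance_ratio_.sum()))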
    # save reduced vector space to file
    print('- saving as csv...')
    with open(model_name + '-' + str(init_dimensions) + 'D.csv', 'w') as f:
        for i in range(len(labels)):
            f.write(labels[i] + ',' + ','.join(map(str, vectors[i])) + '\n')
# reduce to the final dimensionality using t-SNE
print('reducing to ' + str(num_dimensions) + 'D using t-SNE...')
print('- may take a really, really (really) long time :)')
tsne = TSNE(n_components=num_dimensions, random_state=0)
vectors = tsne.fit_transform(vectors)
print('- done')
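
# tuning note (my addition, not in the original script): scikit-learn's
# TSNE defaults to perplexity=30, which must stay below the number of
# samples -- with a heavily filtered vocabulary you may need e.g.
# TSNE(n_components=num_dimensions, random_state=0, perplexity=15)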
# save reduced vector space to file
print('saving as csv...')
x_vals = [v[0] for v in vectors]
y_vals = [v[1] for v in vectors]
# z_vals = [v[2] for v in vectors]
# w_vals = [v[3] for v in vectors]
with open(model_name + '-' + str(num_dimensions) + 'D.csv', 'w') as f:
    for i in range(len(labels)):
        label = labels[i]
        x = x_vals[i]
        y = y_vals[i]
        # z = z_vals[i]
        # w = w_vals[i]
        f.write(label + ',' + str(x) + ',' + str(y)
                # + ',' + str(z) + ',' + str(w)
                + '\n')
print('- done')
# normalize values to the range -1 to 1, save to file
print('normalizing position values...')
x_vals = normalize_list(x_vals)
y_vals = normalize_list(y_vals)
# z_vals = normalize_list(z_vals)
# w_vals = normalize_list(w_vals)
print('- saving as csv...')
with open(model_name + '-' + str(num_dimensions) + 'D-NORMALIZED.csv', 'w') as f:
    for i in range(len(labels)):
        label = labels[i]
        x = x_vals[i]
        y = y_vals[i]
        # z = z_vals[i]
        # w = w_vals[i]
        f.write(label + ',' + str(x) + ',' + str(y)
                # + ',' + str(z) + ',' + str(w)
                + '\n')
print('- done')
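
# if you go on to snap these points to a grid with RasterFairy (the header
# links the demo this code adapts), a minimal sketch -- assuming the
# rasterfairy package and its transformPointCloud2D function, per that
# demo notebook -- would be:
#   import rasterfairy
#   xy = np.column_stack((x_vals, y_vals))
#   grid_xy, (grid_w, grid_h) = rasterfairy.transformPointCloud2D(xy)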
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    '''
    scatter-plot a 2D embedding (assumes num_dimensions == 2), annotating
    each point with its word, then save the figure to filename
    '''
    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
# finally, plot and save the figure
plot_with_labels(vectors, labels)