# -*- coding: utf-8 -*-
"""Word-embedding helpers: builds a pretrained GloVe lookup matrix and
serves contextual ELMo embeddings from a TensorFlow Hub module."""
import numpy as np
import tensorflow as tf
from keras import backend as K
class word_embed:
    """Wraps a GloVe lookup-matrix builder and ELMo embedding calls."""
    def __init__(self, max_len, embed_size, batch_size, elmo=None):
        self.max_len = max_len
        self.embed_size = embed_size
        self.batch_size = batch_size
        # The ELMo TF Hub module is optional; it is only needed by the
        # elmo_embedding methods below.
        if elmo is not None:
            self.elmo = elmo
    def glove_embedding(self, file_path, embed_size, word2index):
        # Build a word -> vector dictionary from the GloVe text file.
        embeddings_dic = {}
        with open(file_path, "r", encoding="utf8") as file_obj:
            for line in file_obj:
                splitted_values = line.split()
                word = splitted_values[0]
                coefficients = np.asarray(splitted_values[1:], dtype="float32")
                embeddings_dic[word] = coefficients
        # Generate the embedding matrix; rows for words missing from GloVe keep
        # their random initialisation.
        # len(word2index) = number of unique words + 2 (padding + unknown)
        embedding_matrix = np.random.random((len(word2index), embed_size))
        for word, i in word2index.items():
            embedding_vector = embeddings_dic.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    def elmo_embedding(self, x):
        # "tokens" signature: x is a [batch_size, max_len] tensor of token
        # strings; the module also needs the sequence length of each example.
        # Returns [batch_size, max_len, 1024] ELMo vectors.
        return self.elmo(inputs={
                             "tokens": tf.squeeze(tf.cast(x, tf.string)),
                             "sequence_len": tf.constant(self.batch_size * [self.max_len])
                         },
                         signature="tokens",
                         as_dict=True)["elmo"]
    def elmo_embedding2(self, x):
        # "default" signature: x holds whole, untokenised sentences; the hub
        # module performs its own tokenisation.
        return self.elmo(inputs=K.cast(x, tf.string),
                         signature="default",
                         as_dict=True)["elmo"]
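

# ---------------------------------------------------------------------------
# Example: seeding a frozen Keras Embedding layer from the GloVe matrix.
# A minimal sketch, not part of the original module: the file name
# "glove.6B.100d.txt" and the toy word2index mapping below are assumptions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from keras.layers import Embedding

    word2index = {"<pad>": 0, "<unk>": 1, "the": 2, "cat": 3, "sat": 4}
    embedder = word_embed(max_len=50, embed_size=100, batch_size=32)
    embedding_matrix = embedder.glove_embedding("glove.6B.100d.txt", 100, word2index)

    # weights=[embedding_matrix] seeds the layer with the pretrained vectors;
    # trainable=False keeps them frozen during training.
    glove_layer = Embedding(input_dim=len(word2index),
                            output_dim=100,
                            weights=[embedding_matrix],
                            input_length=50,
                            trainable=False)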
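

# ---------------------------------------------------------------------------
# Example: wrapping elmo_embedding in a Keras Lambda layer. A minimal sketch,
# assuming the TF Hub ELMo v2 module under TensorFlow 1.x; the hub URL,
# batch/sequence sizes, and the 1024-dim output shape are assumptions, not
# part of the original module. With TF 1.x the hub module's variables and
# tables must be initialised in the active session before training.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import tensorflow_hub as hub
    from keras.layers import Input, Lambda

    batch_size, max_len = 32, 50
    elmo_module = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
    embedder = word_embed(max_len=max_len, embed_size=1024, batch_size=batch_size,
                          elmo=elmo_module)

    # Tokens arrive as strings; the Lambda layer maps each [batch, max_len]
    # batch of tokens to [batch, max_len, 1024] contextual ELMo vectors.
    token_input = Input(shape=(max_len,), dtype=tf.string)
    elmo_vectors = Lambda(embedder.elmo_embedding,
                          output_shape=(max_len, 1024))(token_input)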