/
utility_gpt.py
214 lines (183 loc) · 7.81 KB
/
utility_gpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import numpy as np
import os
import pandas as pd
import scipy.io as sio
import encoder
def load_words():
    """Load the 180 concept words from subject P01's .mat file.

    Returns the ``keyConcept`` array from the MATLAB file (each entry is a
    nested array whose innermost element is the concept string).
    """
    here = os.path.dirname(os.path.abspath(__file__))
    mat_path = here + '/data/subjects/P01/examples_180concepts_sentences.mat'
    mat_file = sio.loadmat(mat_path)
    return mat_file['keyConcept']
def checker(string):
    """Normalize a word for GloVe lookup: strip contraction suffixes,
    punctuation, and whitespace.

    The replacement ORDER is significant and preserved from the original:
    contraction suffixes ("'ve", "'s", ...) must be removed before the bare
    apostrophe, and "." before "--" (so "-.-"" collapses to "").
    Fix vs. original: a second, redundant ``replace("'", '')`` was removed —
    after the first apostrophe pass no apostrophes can remain.

    Returns the cleaned string.
    """
    replacements = (
        ("'ve", ""),
        ("@", ""),
        ("'re", ""),
        ("malfoy'll", "malfoy"),  # corpus-specific contraction of a name
        ("'d", ""),
        ("?", ""),
        ("'s", ""),
        (":", ""),
        ("!", ""),
        ('"', ""),
        (".", ""),
        ("--", ""),
        ("'", ""),
        (",", ""),
        (";", ""),
        ("‘", ""),
        ("(", ""),
        (")", ""),
        (" ", ""),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string
def files_exist():
    """Return True when every lookup artifact needed for generation exists.

    Checks the four files under ``look_ups_gpt-2/``. The first check uses
    ``os.path.exists`` and the rest ``os.path.isfile``, mirroring the
    original behavior exactly.
    """
    base = os.path.dirname(os.path.abspath(__file__)) + '/look_ups_gpt-2/'
    if not os.path.exists(base + 'converter_table.npy'):
        return False
    return all(
        os.path.isfile(base + name)
        for name in ('top_ten_embeddings', 'word_sets_num.npy', 'word_sets.txt')
    )
def generate_look_ups():
    """Create any missing lookup artifacts under ``look_ups_gpt-2/``.

    Builds the converter table and the top-ten-embeddings CSV when absent.
    The numeric word sets can only be derived once the hand-authored
    ``word_sets.txt`` exists, hence the if/elif ordering.
    """
    base = os.path.dirname(os.path.abspath(__file__)) + '/look_ups_gpt-2/'
    if not os.path.exists(base + 'converter_table.npy'):
        converter_table()
    if not os.path.isfile(base + 'top_ten_embeddings'):
        top_ten_embeddings()
    if not os.path.isfile(base + 'word_sets.txt'):
        # The anchor-word file must be written by hand before anything
        # numeric can be derived from it.
        print('First create a text file with top 5 words you want the generation to anchored with.')
    elif not os.path.isfile(base + 'word_sets_num.npy'):
        word_sets()
def word_sets():
    """Convert the hand-written ``word_sets.txt`` (5 comma-separated concept
    words per line) into an array of concept indices, saved as
    ``word_sets_num.npy``.

    Fixes vs. original: the text file was opened twice and never closed
    (resource leak) — now read once under ``with``; an ``if j == 0`` branch
    whose body was identical to its ``else`` was removed as dead duplication.

    Raises ValueError if a word in the file is not one of the 180 concepts.
    """
    words = load_words()
    # Flatten the nested .mat structure into a plain list of concept strings.
    concepts = [words[i][0][0] for i in range(180)]
    base = os.path.dirname(os.path.abspath(__file__))
    with open(base + '/look_ups_gpt-2/word_sets.txt', 'r') as fh:
        lines = fh.readlines()
    holder = np.zeros((len(lines), 5), dtype=int)
    for i, line in enumerate(lines):
        parts = line.split(',')
        for j in range(5):
            # .index raises if the word is misspelled — fail loudly.
            holder[i, j] = concepts.index(parts[j].strip())
    np.save(file=base + '/look_ups_gpt-2/word_sets_num.npy', arr=holder)
def converter_table():
    """Build and save a (50257, 300) lookup table mapping every GPT-2 token
    id to its GloVe-300d embedding.

    Tokens whose normalized text has no GloVe entry get a row of 500s as a
    sentinel. Fixes vs. original: the bare ``except:`` is narrowed to
    ``except Exception`` and a dead ``word = enc.decode([i])`` inside the
    handler was removed.
    """
    base = os.path.dirname(os.path.abspath(__file__))
    path = base + '/look_ups_gpt-2/converter_table'
    embeddings_dict = {}
    # Need glove embeddings dataset!
    with open(base + "/glove.42B.300d.txt", 'r') as f:
        for line in f:
            try:
                values = line.split()
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[values[0]] = vector
            except ValueError:
                # Malformed GloVe line: report the token and skip it.
                print(line.split()[0])
    model_name = '124M'
    models_dir = base + '/models_gpt'
    models_dir = os.path.expanduser(os.path.expandvars(models_dir))
    enc = encoder.get_encoder(model_name, models_dir)
    holder = np.zeros((50257, 300))
    for i in range(50257):
        try:
            word = checker(enc.decode([i]).strip().lower())
            holder[i, :] = embeddings_dict[word]
        except Exception:
            # Usually KeyError (token text not in GloVe); kept broad because
            # decoding some byte-level tokens may also fail.
            holder[i, :] = np.zeros((300)) + 500  # sentinel: "no embedding"
    np.save(file=path, arr=holder)
    print('Converter table was generated')
def find_closest_embeddings_cosine_prox(embedding, embeddings_dict):
    """Return all vocabulary words sorted by ascending cosine distance
    (i.e. descending cosine similarity) to ``embedding``.

    Fix vs. original: ``cosine`` was referenced but never imported anywhere
    in the file, so this function raised NameError at call time.
    """
    from scipy.spatial.distance import cosine  # fixes missing import
    return sorted(embeddings_dict.keys(),
                  key=lambda word: cosine(embeddings_dict[word], embedding))
def top_ten_embeddings():
    """For each of the 180 concept GloVe vectors, find the 11 nearest GloVe
    vocabulary words by cosine distance and write them to a CSV.

    [:11] keeps eleven entries because the concept word itself typically
    ranks first, leaving ten true neighbours.
    Fix vs. original: an unused ``word = load_words()`` call (its result was
    never read) was removed.
    NOTE(review): output name is 'top_ten_embeddings_two' while
    ``related_words`` reads 'top_ten_embeddings' — confirm which is intended.
    """
    base = os.path.dirname(os.path.abspath(__file__))
    path = base + '/look_ups_gpt-2/top_ten_embeddings_two'
    words = load_words_and_glove()
    embeddings_dict = {}
    # Need glove embeddings dataset!
    with open(base + "/glove.42B.300d.txt", 'r') as f:
        for line in f:
            try:
                values = line.split()
                embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
            except ValueError:
                # Malformed GloVe line: report the token and skip it.
                print(line.split()[0])
    top_lists = []
    for i in range(words.shape[0]):
        top_lists.append(
            find_closest_embeddings_cosine_prox(words[i, :], embeddings_dict)[:11])
    df = pd.DataFrame(top_lists)
    df.to_csv(path, index=False)
def isSubset(subarraies, array):
    """Count, summed over all patterns in ``subarraies``, how many start
    positions in ``array`` begin a full contiguous match of that pattern.

    Overlapping matches are each counted; an empty pattern contributes
    nothing (matching the original's behavior).
    """
    total = 0
    n = len(array)
    for pattern in subarraies:
        m = len(pattern)
        if m == 0:
            continue  # original never counted an empty pattern
        for start in range(n - m + 1):
            if all(array[start + k] == pattern[k] for k in range(m)):
                total += 1
    return total
def tokens_from_words(Numbers):
    """Encode each selected concept word (prefixed with a space, as GPT-2's
    BPE expects for mid-sentence words) into token-id lists.

    ``Numbers`` is an iterable of concept indices; returns a list of
    token-id lists, one per index.
    """
    models_dir = os.path.expanduser(os.path.expandvars(
        os.path.dirname(os.path.abspath(__file__)) + '/models_gpt'))
    enc = encoder.get_encoder('124M', models_dir)
    words = load_words()
    return [enc.encode(' ' + words[nr][0][0]) for nr in Numbers]
def load_words_and_glove():
    """Load the GloVe vectors for the 180 concepts from the .mat file.

    Returns the ``data`` array from the MATLAB file.
    """
    mat_path = (os.path.dirname(os.path.abspath(__file__))
                + '/data/glove_data/180_concepts.mat')
    return sio.loadmat(mat_path)['data']
def related_words():
    """GPT-2-encode the 11 nearest-neighbour words of each of the 180
    concepts (read from the top-ten-embeddings CSV).

    Returns a 180-element list; each element is a list of 11 token-id
    lists. Words are encoded with a leading space, as GPT-2's BPE expects
    for mid-sentence words.
    """
    base = os.path.dirname(os.path.abspath(__file__))
    top_ten = pd.read_csv(base + '/look_ups_gpt-2/top_ten_embeddings')
    models_dir = os.path.expanduser(os.path.expandvars(base + '/models_gpt'))
    enc = encoder.get_encoder('124M', models_dir)
    return [
        [enc.encode(' ' + top_ten.iloc[i, j]) for j in range(11)]
        for i in range(180)
    ]
def Harry_sentences_no_capital(counter, Sent_num):
    """Collect ``Sent_num`` sentences from the fMRI Harry Potter word stream
    that contain no capitalized words after the first, and whose first word
    is not a character name, starting at word index ``counter``.

    Sentences are delimited by words containing '.'. Rejected sentences are
    skipped but still advance the scan position.

    Returns (total, counter): the concatenated accepted sentences (each word
    prefixed with a space) and the word index just past the last sentence
    scanned.
    """
    harry_dir = str(os.path.dirname(os.path.abspath(__file__))) + '/look_ups_gpt-2/words_fmri.npy'
    # First-word blacklist; entries with leading spaces / trailing punctuation
    # presumably cover tokenization variants in the stream — TODO confirm.
    namelist = ['Harry', 'Ron', 'Malfoy', 'Neville', 'Dumbledore', 'Hermione', 'Potter', 'Weasley.', ' Potter', 'Potter,', ' Potter,']
    harry = np.load(harry_dir)
    total = ''
    q = 0  # number of accepted sentences so far
    while q < Sent_num:
        booler = False  # True once the current sentence is disqualified
        first = True
        stringe = ''  # the sentence being assembled
        for i in range(harry.size):
            j = counter + i
            # NOTE(review): j can run past the end of `harry` if no '.' is
            # found before the array is exhausted — IndexError risk; confirm.
            if not first:
                # Any uppercase letter in a non-initial word disqualifies.
                booler = any(letter.isupper() for letter in harry[j]) or booler
            else:
                # The first word is capitalized anyway, so only reject it
                # when it is a character name.
                booler = harry[j] in namelist or booler
            stringe = stringe + ' ' + harry[j]
            first = False
            if harry[j].__contains__('.'):
                counter = j+1  # next sentence starts after the period word
                break
        if not booler:
            q+=1
            total = total + ' ' + stringe
    return(total, counter)