-
Notifications
You must be signed in to change notification settings - Fork 1
/
mr_generic_scripts.py
283 lines (227 loc) · 10.8 KB
/
mr_generic_scripts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 8 12:28:11 2020
@author: Venelin
"""
import pandas as pd
import nltk
import numpy as np
import tensorflow as tf
import re
from datetime import datetime
# Mindreading: general helpers and static data shared by all experiments.

# --- Static column groups of the source spreadsheets, separated by type ---

# Per-child metadata columns
bio_cols = [
    'ID',
    'Study',
    'School',
    'Class',
    'Child',
    'SEN (0 = No, 1 = Yes)',
    'EAL (0 = No, 1 = Yes)',
    'DOB',
    'DOT',
    'Gender (0 = Girl, 1 = Boy)',
    'MHVS',
]

# Free-text answer columns, one per question
text_cols = [
    'SFQuestion_1_Text',
    'SFQuestion_2_Text',
    'SFQuestion_3_Text',
    'SFQuestion_4_Text',
    'SFQuestion_5_Text',
    'SFQuestion_6_Text',
    'SS_Brian_Text',
    'SS_Peabody_Text',
    'SS_Prisoner_Text',
    'SS_Simon_Text',
    'SS_Burglar_Text',
]

# Rating columns, listed in the same question order as text_cols
rate_cols = [
    'SFQ1_Rating',
    'SFQ2_Rating',
    'SFQ3_Rating',
    'SFQ4_Rating',
    'SFQ5_Rating',
    'SFQ6_Rating',
    'SS_Brian_Rating',
    'SS_Peabody_Rating',
    'SS_Prisoner_Rating',
    'SS_Simon_Rating',
    'SS_Burglar_Rating',
]

# "Question + answer" columns (text_cols names with _Text swapped for _QT)
tq_cols = [
    'SFQuestion_1_QT',
    'SFQuestion_2_QT',
    'SFQuestion_3_QT',
    'SFQuestion_4_QT',
    'SFQuestion_5_QT',
    'SFQuestion_6_QT',
    'SS_Brian_QT',
    'SS_Peabody_QT',
    'SS_Prisoner_QT',
    'SS_Simon_QT',
    'SS_Burglar_QT',
]

# Question texts, paired positionally with text_cols. Each one ends with a
# space because the answer is appended directly after it.
questions = [
    'Why did the men hide ? ',
    'What does the woman think ? ',
    'Why did the driver lock Harold in the van ? ',
    'What is the deliveryman feeling and why ? ',
    'Why did Harold pick up the cat ? ',
    'Why does Harold fan Mildred ? ',
    'Why does Brian say this ? ',
    'Why did she say that ? ',
    'Why did the prisoner say that ? ',
    'Why will Jim look in the cupboard for the paddle ? ',
    'Why did the burglar do that ? ',
]
def mr_excel_to_pd(fnames, calc_age=False):
    """Read several Excel files and stack them into a single dataframe.

    Parameters
    ----------
    fnames : iterable
        Paths (or anything ``pd.read_excel`` accepts) of the files to read.
    calc_age : bool, default False
        When True, add an ``Age_Y`` column: the number of days between the
        ``DOT`` and ``DOB`` columns divided by 365 and rounded. Both columns
        must support datetime subtraction; a file where this fails is
        skipped like any other unreadable file.

    Returns
    -------
    pandas.DataFrame
        All successfully read files concatenated row-wise with a fresh index.

    Raises
    ------
    ValueError
        If no file could be read (including an empty ``fnames``).
    """
    # Known header variants mapped onto the canonical column names
    header_fixes = {
        'DOB DD/MM/YYYY -99 = Missing': 'DOB',
        'EAL 0 = No, 1 = Yes': 'EAL (0 = No, 1 = Yes)',
        'Gender 0 = Girl, 1 = Boy': 'Gender (0 = Girl, 1 = Boy)',
        'SEN 0 = No, 1 = Yes': 'SEN (0 = No, 1 = Yes)',
        'SEN (0 = No, 1= Yes)': 'SEN (0 = No, 1 = Yes)',
        'SFQ1_Rating ': 'SFQ1_Rating',
    }

    frames = []
    for filename in fnames:
        try:
            df = pd.read_excel(filename, index_col=None, header=0)
            df = df.rename(columns=header_fixes)
            if calc_age:
                df['Age_Y'] = ((df['DOT'] - df['DOB']).dt.days / 365).round()
        except Exception as err:
            # Best effort: report the offending file (and why) and carry on.
            # Catching Exception rather than everything lets Ctrl-C and
            # SystemExit still stop the run.
            print(f"{filename}: {type(err).__name__}: {err}")
            continue
        frames.append(df)

    if not frames:
        # pd.concat would raise an opaque ValueError here; say what happened
        raise ValueError("mr_excel_to_pd: none of the input files could be read")

    return pd.concat(frames, axis=0, ignore_index=True)
def mr_create_qa(full_df, t_cols=text_cols, list_qs=questions):
    """Add "question + answer" columns to ``full_df`` in place.

    ``t_cols`` and ``list_qs`` are walked in lockstep. For every answer
    column a sibling column is written whose name has "_Text" replaced by
    "_QT" and whose value is the question text followed by the answer
    (cast to str). Nothing is returned; ``full_df`` itself is modified.
    """
    for answer_col, prompt in zip(t_cols, list_qs):
        # Answers are stringified first so the concatenation never fails
        answers = full_df[answer_col].astype(str)
        full_df[answer_col.replace("_Text", "_QT")] = prompt + answers
def mr_tok_sc(response):
    """Basic preprocessing of a single response (DL experiments only).

    The input string is tokenized with nltk and the tokens are joined back
    with single spaces. A missing response yields an empty string.
    """
    # Missing values break nltk, so they are treated as "no tokens"
    if pd.isna(response):
        return ""
    # For the neural models we only tokenize; the result is still a string
    return " ".join(nltk.word_tokenize(response))
def mr_create_datasets(full_df, X_cols, Y_cols):
    """Build one dataset per question plus a combined "full" dataset.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Source data. Must contain an ID column, ``Age_Y``,
        ``Gender (0 = Girl, 1 = Boy)`` and every column named in ``X_cols``
        and ``Y_cols``. It is not modified.
    X_cols, Y_cols : iterables of str
        Answer columns and their matching score columns, paired by position.

    Returns
    -------
    list
        ``[name, dataframe]`` pairs: one per answer column (``name`` is the
        answer column name) followed by ``["full", <all of them stacked>]``.
        Each dataframe has the columns ``Child_ID``, ``Answer``, ``Score``,
        ``Age``, ``Gender`` and ``Question``.

    Raises
    ------
    KeyError
        If a required column is missing from ``full_df``.
    ValueError
        If ``X_cols``/``Y_cols`` yield no pairs (nothing to concatenate).
    """
    gender_col = 'Gender (0 = Girl, 1 = Boy)'
    # The ID header is looked up with literal double quotes around it
    # (presumably how it appears in the merged sheets - verify against the
    # data). Fall back to the plain 'ID' name when only that one exists.
    columns = full_df.columns
    id_col = 'ID' if ('"ID"' not in columns and 'ID' in columns) else '"ID"'

    q_datasets = []
    for cur_X, cur_Y in zip(X_cols, Y_cols):
        # Keep only: child id, answer, score, age and gender
        cur_df = full_df[[id_col, cur_X, cur_Y, 'Age_Y', gender_col]].copy()

        # Mark empty answers and -99 scores as missing, then drop those rows.
        # Assign the result back: an in-place replace on a column selection
        # does not reliably reach cur_df (no-op under pandas Copy-on-Write).
        cur_df[cur_X] = cur_df[cur_X].replace('', np.nan)
        cur_df[cur_Y] = cur_df[cur_Y].replace(-99, np.nan)
        cur_df.dropna(inplace=True)

        # Remember which question each row belongs to (needed later on)
        cur_df["Question"] = cur_X

        # Uniform column names across all questions
        cur_df.rename(
            columns={
                cur_X: 'Answer',
                cur_Y: 'Score',
                'Age_Y': 'Age',
                id_col: 'Child_ID',
                gender_col: 'Gender',
            },
            inplace=True,
        )
        q_datasets.append([cur_X, cur_df])

    # One big dataset with every question stacked together
    full_dataset = pd.concat([frame for _, frame in q_datasets])
    q_datasets.append(["full", full_dataset])
    return q_datasets
def mr_tf_data(var_X, var_y, BATCH_SIZE=4, SHUFFLE_BUFFER_SIZE=100, NUM_CLASSES=3):
    """Create a shuffled, batched TensorFlow dataset with one-hot labels.

    Parameters
    ----------
    var_X
        Features, in any form ``tf.data.Dataset.from_tensor_slices`` accepts.
    var_y
        Labels exposing ``.to_numpy()`` (e.g. a pandas Series); they are cast
        to int and one-hot encoded for use with a softmax output.
    BATCH_SIZE : int, default 4
        Number of examples per batch.
    SHUFFLE_BUFFER_SIZE : int, default 100
        Buffer size handed to ``Dataset.shuffle``.
    NUM_CLASSES : int, default 3
        Depth of the one-hot encoding. The default matches the previously
        hard-coded value.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(features, one_hot_labels)``.
    """
    # Labels in the format expected by a softmax layer
    y_arr = var_y.to_numpy().astype(int)
    y_sm = tf.one_hot(y_arr, NUM_CLASSES)

    # Build the dataset, then shuffle and batch it
    var_dataset = tf.data.Dataset.from_tensor_slices((var_X, y_sm))
    return var_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
def mr_tf_data_s(var_X, var_y, BATCH_SIZE=4, SHUFFLE_BUFFER_SIZE=100):
    """Create a shuffled, batched TensorFlow dataset with integer labels.

    Same pipeline as ``mr_tf_data`` except that the labels are not one-hot
    encoded: ``var_y`` is converted with ``.to_numpy()`` and cast to int,
    then paired with ``var_X`` as is.
    """
    int_labels = var_y.to_numpy().astype(int)
    pairs = tf.data.Dataset.from_tensor_slices((var_X, int_labels))
    # Shuffle first, then group into batches
    shuffled = pairs.shuffle(SHUFFLE_BUFFER_SIZE)
    return shuffled.batch(BATCH_SIZE)
# =============================================================================
# # Function that, given a test set maps them onto original datasets
# # col_list should either be text_cols or tq_cols
# # label list is always rate_cols
#
# def mr_split_questions(full_df,X_test,col_list,lab_list=rate_cols):
#
# # Get the full data of the test set
# test_df = full_df.iloc[X_test.index.values]
#
# # A very ugly loop to extract data; needs reworking into efficient use of
# # pandas when there is time
#
#
# # Initialize the output
# test_l = []
#
# # Loop over all questions and the corresponding rate columns
# for cur_col,cur_lab in zip(col_list,lab_list):
# # Select the whole row in the original dataframe that has corresponding value
# # on the response or the QA
# cur_df = full_df[full_df[cur_col].isin(X_test)].copy()
#
# # We need the following data:
# # text response, expert score, date of birth and date of test
# cur_df = cur_df[['DOB','DOT',cur_col,cur_lab]]
# # Also add the current question
# cur_df["Question"] = cur_col
# # Rename columns for consistency
# cur_df.rename(columns={cur_col:'Answer'},inplace=True)
# cur_df.rename(columns={cur_lab:'Score'},inplace=True)
#
# # Get the year of birth from the date of birth
# cur_df["YOB"] = cur_df["DOB"].astype(str).apply(mr_get_date)
# # Get the year of text from the date of test
# cur_df["YOT"] = cur_df["DOT"].astype(str).apply(mr_get_date)
# # Calculate age as YOT - YOB (ugly fix due to messy date format in the
# # excels, will have to redo it later)
# cur_df["Age"] = cur_df["YOT"] - cur_df["YOB"]
#
#
# # The result of this is a table in the following format:
# # Question; Answer; Score; DOB; DOT; YOB; YOT; Age
#
# # This format easily allows us to select based on question or age and then evaluate
#
# # Add the current question to the list
# test_l.append(cur_df)
#
# # Merge all questions and return a dataframe
# test_df = pd.concat(test_l, axis=0, ignore_index=True)
#
# return test_df
# =============================================================================
def mr_get_date(inp_date):
    """Extract the first year of the form 20XX from a date string.

    The rest of the string is ignored. Returns the year as an int, or 0
    (kept for consistency) when no such four-digit pattern is present.
    """
    found = re.search("20[0-9]{2}", inp_date)
    return int(found.group(0)) if found else 0
def mr_get_data(folder_name):
    """Load the per-question Excel files found under ``folder_name``.

    For every entry of ``text_cols`` the file ``folder_name + <col> + ".xlsx"``
    is read (plain string concatenation, so ``folder_name`` has to end with
    a path separator). Returns a list of ``[name, dataframe]`` pairs, one
    per question, followed by ``["full", <all frames concatenated>]``.
    """
    per_question = [
        [col, pd.read_excel(folder_name + col + ".xlsx", index_col=None, header=0)]
        for col in text_cols
    ]
    # Stack every question into one combined dataset and add it last
    combined = pd.concat([frame for _, frame in per_question])
    per_question.append(["full", combined])
    return per_question
def mr_get_qa_data(folder_name, t_cols=text_cols, list_qs=questions):
    """Load the per-question Excel files and prefix answers with the question.

    ``t_cols`` and ``list_qs`` are walked in lockstep. For each pair the
    file ``folder_name + <col> + ".xlsx"`` is read and its ``Answer``
    column is replaced by the question text followed by the answer (cast
    to str). Returns a list of ``[name, dataframe]`` pairs, one per
    question, followed by ``["full", <all frames concatenated>]``.
    """
    datasets = []
    for col, prompt in zip(t_cols, list_qs):
        frame = pd.read_excel(folder_name + col + ".xlsx", index_col=None, header=0)
        # Turn the bare answer into "question + answer"
        frame['Answer'] = prompt + frame['Answer'].astype(str)
        datasets.append([col, frame])

    # Combined dataset over all questions goes at the end of the list
    combined = pd.concat([frame for _, frame in datasets])
    datasets.append(["full", combined])
    return datasets