/
teamassignment.py
201 lines (167 loc) · 6.75 KB
/
teamassignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# TO DO:
# Detect & trim outlier distances for each variable
# Weight distances for all variables
# requirements
import pandas as pd
import numpy as np
import scipy.spatial
import os
import subprocess
import sys
import math
import itertools
# get input and output filenames from sys argv
inputfile = sys.argv[1]
outputfile = sys.argv[2]
input_data = pd.DataFrame.from_csv(inputfile)
# default is k = 14 teams per block (following the Dec 8 version of the
# experimental design) unless a value is provided as the third argument
if len(sys.argv) > 3:
k = int(sys.argv[3])
k_default = False
print("User entered number of teams per block: " + str(k))
else:
k = 14
k_default = True
print("Using default number of teams per block: 14")
# default condition is SWARM, unless a value is provided as the 4th argument
if len(sys.argv) > 4:
cond = sys.argv[4]
cond_default = False
print("User entered condition: " + cond)
else:
cond = "swarm"
cond_default = True
print("Using default condition: swarm")
print("Arranging input data")
# remove participants from other performers
df = input_data[input_data["cond"] == cond].copy()
# get list of unique blocks in this condition
blocks = df["block"].unique().tolist()
# save the indices of the cells we'll need to change in the output data
team_col = np.where(input_data.columns.values == "team")[0]
block_rows = {}
for b in blocks:
block_rows[b] = np.where((input_data["cond"] == cond) & \
(input_data["block"] == b))[0]
# block_n = dict of sample sizes per block
block_n = df.groupby("block").size().to_dict()
# if you divide participants into equal sized teams, they differ by at most
# one participant. Natural upper and lower bounds on team size:
lowers = {x: math.floor(block_n[x] / float(k)) for x in list(block_n.keys())}
uppers = {x: math.ceil(block_n[x] / float(k)) for x in list(block_n.keys())}
# variables that contribute to the distance function
relevant_vars = ["block",
"team_lead",
"age",
"gender",
"education",
"english_proficiency",
"enjoy_logic_probs",
"enjoy_num_probs",
"expertise_math",
"expertise_quant_model",
"expertise_stats",
"expertise_prob",
"expertise_bayes_net",
"expertise_programming",
"expertise_exp_design",
"expertise_risk_analysis",
"expertise_forecasting",
"expertise_dec_theory",
"expertise_game_theory",
"expertise_sats",
"expertise_arg_map",
"expertise_inf_logic",
"expertise_sys_think",
"expertise_image_analysis",
"expertise_link_analysis",
"expertise_graphic_design",
"expertise_tech_writing",
"score_matrix", # matrix reasoning test
"score_prob_reas", # probabilistic reasoning test
"crt_seen_before",
"score_crt",
"score_aomt", # actively open-minded thinking test
"score_bfi_openness", # big five personality inventory
"score_bfi_conscientiousness",
"score_bfi_extraversion",
"score_bfi_agreeableness",
"score_bfi_neuroticism",
"score_toa_novelty", # tolerance of ambiguity
"score_toa_complexity",
"score_toa_insolubility",
"score_rme", # Mind in the Eyes
"coh_score_4way",
"coh_score_3way",
"coh_score_2way"]
# subset to relevant variables
df = df[relevant_vars]
# convert gender and age into numeric variables
gender_dict = {'f':1., 'm':0., 'o':0.5}
df.loc[:,["gender"]] = [gender_dict[x] for x in df.loc[:,"gender"]]
age_dict = {'18-24':0, '25-34':1, '35-44':2, '45-54':3, '55-64':4, '65+':5}
df.loc[:,["age"]] = [age_dict[x] for x in df.loc[:,"age"]]
# fix up data in each column (except block)
for i in range(df.shape[1]):
if df.columns[i] != 'block':
# replace null values with column medians
df.iloc[np.where(df.isnull().iloc[:, i])[0], i] = df.iloc[:,i].median()
# and normalize the range of every column
df.iloc[:,i] = df.iloc[:,i] - df.iloc[:,i].min()
if df.iloc[:,i].max() != 0: # avoid dividing by 0
df.iloc[:,i] = df.iloc[:,i] / df.iloc[:,i].max()
# make copy of df with all blocks before subsetting by block
fulldf = df.copy()
# run solver once for each block
for b in blocks:
print("Writing input file for MDGP solver for Block " + str(b))
# subset by block, then drop the "block" column
df = fulldf[fulldf["block"] == b].copy()
df.drop("block", axis=1, inplace=True)
# where to write the file for the solver
instance_filename = 'block_' + str(b) + '_mdgp_solver_input.txt'
# if instance file already exists, remove it
try:
os.remove(instance_filename)
except OSError:
pass
# create instance file for MDGP solver
with open(instance_filename, 'a') as the_file:
# first line has sample size, number of teams, "same size", and
# size limits for each team
lims = " " + str(lowers[b]) + " " + str(uppers[b])
the_file.write(str(block_n[b]) + " " + str(k) + " ss" + \
str(lims) * k + "\n")
# following lines have row numbers identifying a pair of participants,
# and the distance between that pair.
indices = list(itertools.combinations(range(block_n[b]), 2))
distout = pd.DataFrame(indices)
distout["d"] = scipy.spatial.distance.pdist(df, metric='cityblock')
distout.to_csv(the_file, header=False, index=False, sep=" ")
print("Running MDGP solver for Block " + str(b) + ", this step takes 2 min")
# run solver on instance file, write data to solver file
solver_filename = "block_" + str(b) + "_mdgp_solver_output.txt"
bash_command = "java -jar mdgp_jors_2011.jar SO " + instance_filename + \
" 120000 > " + solver_filename
subprocess.call(bash_command, shell=True)
print("Reading solution for Block " + str(b))
# read solution from file
with open(solver_filename) as f:
content = f.readlines()
# remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
# identify line where solution is written
sol_index = np.where([item.startswith('Solution: [') for item in content])
# turn into list of ints
solution = [int(x) for x in content[sol_index[0][0]][11:-1].split(", ")]
# check solution is right length
if len(solution) != df.shape[0]:
sys.exit("Something's wrong. Check the solver file: " + solver_filename)
# turn ints into strings with form "team<n>", with n starting from 1
team_names = ['team' + str(s + 1 + (b - 1) * k) for s in solution]
# add MDGP solution to dataframe of initial data
input_data.iloc[block_rows[b], team_col] = team_names
print("Finished all blocks, writing team assignment to output file")
# write to file
input_data.to_csv(outputfile)