-
Notifications
You must be signed in to change notification settings - Fork 0
/
iteration_probit_model.py
98 lines (77 loc) · 3.55 KB
/
iteration_probit_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created January 2020
by Juliette MILLET
Script to fit a probit model based on an output file with one line = one stimulus-individual pair, this time with
sampling of answers to obtain a balanced probit model. We perform multiple subsamplings.
"""
import pandas as pd
from statsmodels.formula.api import probit
import numpy as np
import random as rd
def get_dico_corres_file(data_file):
    """Map each stimulus filename to the indices of the data rows where it appears.

    :param data_file: path to a comma-separated file whose header row
        contains a 'filename' column
    :return: dict mapping filename -> list of 0-based data-row indices
        (row 0 is the first row after the header)
    """
    dico = {}
    # 'with' guarantees the handle is closed even if a row is malformed.
    with open(data_file, 'r') as f:
        header = f.readline().replace('\n', '').split(',')
        # Resolve the column position once instead of on every row.
        filename_col = header.index('filename')
        for count, line in enumerate(f):
            newline = line.replace('\n', '').split(',')
            # Group row indices by stimulus filename.
            dico.setdefault(newline[filename_col], []).append(count)
    return dico
def sample_lines(dico_line_files, n_samples=3):
    """Randomly sample answer-line indices for each English stimulus.

    :param dico_line_files: dict mapping filename -> list of line indices,
        as built by get_dico_corres_file
    :param n_samples: number of indices drawn (with replacement) per matching
        filename; defaults to 3 to keep the original behaviour
    :return: flat list of sampled line indices
    """
    list_lines = []
    for filename, lines in dico_line_files.items():
        # Only English stimuli ('EN' in the filename) are sampled.
        if 'EN' in filename:
            # rd.choice replaces the triple hand-written
            # lines[rd.randrange(0, stop=len(lines))] expression;
            # sampling is still with replacement.
            list_lines.extend(rd.choice(lines) for _ in range(n_samples))
    return list_lines
def model_probit_binarized(data_file, model, lines_sampled):  # for the model, you have to add the +
    """Fit a probit model on the sampled rows and return its log-likelihood.

    :param data_file: csv file with one row per stimulus-individual answer
    :param model: extra formula term(s), written with a leading '+'
    :param lines_sampled: 0-based row indices selecting the subsample to fit
    :return: log-likelihood of the fitted probit model at its estimated params
    """
    frame = pd.read_csv(data_file, sep=',', encoding='utf-8')
    frame = frame.iloc[lines_sampled]
    # Answers come in as -1/1; the binary probit needs 0/1.
    frame['binarized_answer'] = (frame['binarized_answer'] + 1.) / 2
    formula = "binarized_answer ~ TGT_first_code + nb_stimuli + C(individual) " + model
    probit_model = probit(formula, frame)
    fitted = probit_model.fit()
    return probit_model.loglike(fitted.params)
def iteration_model(filename, nb_it, outfile):
    """Repeatedly subsample the answers and fit one probit model per predictor.

    :param filename: csv file with human outputs and models' delta values
    :param nb_it: number of subsampling iterations to perform
    :param outfile: csv output; one line per successful iteration with the
        log-likelihood of each predictor's probit model
    """
    dico_lines = get_dico_corres_file(filename)
    with open(filename, 'r') as f:
        ind = f.readline().replace('\n', '').split(',')
    # Predictor columns are everything after 'language_code'
    # (e.g. 'articulation', 'babelmulti', 'fishermono', 'fishertri', ...).
    list_names = ind[ind.index('language_code') + 1:]
    with open(outfile, 'w') as out:
        out.write('nb,' + ','.join(list_names) + '\n')
        for i in range(nb_it):
            # we sample a fresh subset of answers for this iteration
            list_sampled = sample_lines(dico_lines)
            list_log = []
            try:
                for mod in list_names:
                    print(mod)
                    # BUG FIX: use the 'filename' parameter; the original
                    # referenced the global 'args.file_humans', which does
                    # not exist (the parser defines 'file_humans_models').
                    log = model_probit_binarized(data_file=filename,
                                                 model='+ ' + mod,
                                                 lines_sampled=list_sampled)
                    list_log.append(str(log))
            except Exception as e:
                # A fit can fail to converge on some subsamples; skip the
                # iteration but say why instead of silently swallowing all.
                print('iteration %d skipped: %s' % (i, e))
                continue
            # BUG FIX: write the whole row at once, with the comma the
            # original omitted between the index and the first value, and
            # only after every model fitted — so the file never contains
            # a truncated or fused line.
            out.write(str(i) + ',' + ','.join(list_log) + '\n')
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='script to evaluate the predictions of output from humans by model\'s delta values (with resampling of humans results)')
    # Distinct metavars per argument (the original reused 'f_do' for all three).
    parser.add_argument('file_humans_models', metavar='f_do', type=str,
                        help='file with human outputs and models\' delta values')
    parser.add_argument('outfile', metavar='f_out', type=str,
                        help='output file with log likelihood answers (one line = one sampling)')
    parser.add_argument('nb_it', metavar='nb_it', type=int,
                        help='nb of sampling you want to perform')
    args = parser.parse_args()
    # BUG FIX: the parser defines 'file_humans_models', not 'file_humans';
    # the original 'args.file_humans' raised AttributeError on every run.
    iteration_model(filename=args.file_humans_models, nb_it=args.nb_it, outfile=args.outfile)