-
Notifications
You must be signed in to change notification settings - Fork 2
/
utils.py
116 lines (83 loc) · 3.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from sklearn import metrics
import numpy
import scipy
import pickle
import pandas
def regression_target_to_ic50(y, max_ic50=50000.0):
return max_ic50 ** (1.0 - y)
def ic50_to_regression_target(ic50, max_ic50=50000.0):
log_ic50 = numpy.log(ic50) / numpy.log(max_ic50)
regression_target = 1.0 - log_ic50
regression_target = numpy.maximum(regression_target, 0.0)
regression_target = numpy.minimum(regression_target, 1.0)
return regression_target
def make_scores(ic50_y, ic50_y_pred, sample_weight=None, threshold_nm=500, max_ic50=50000):
y_pred = ic50_to_regression_target(ic50_y_pred, max_ic50)
auc = metrics.roc_auc_score(ic50_y <= threshold_nm, y_pred, sample_weight=sample_weight)
f1 = metrics.f1_score(ic50_y <= threshold_nm, ic50_y_pred <= threshold_nm, sample_weight=sample_weight)
tau = scipy.stats.kendalltau(ic50_y_pred, ic50_y)[0]
return dict(auc=auc, f1=f1, tau=tau)
def scoring(preds, idxTe, yTe):
preds = numpy.array([regression_target_to_ic50(i[0]) for i in preds])
targs = numpy.array([regression_target_to_ic50(i[0]) for i in yTe])
as_df = pandas.DataFrame([preds, targs], index=['preds', 'targs']).T
as_df['idx'] = idxTe
avg_pred = []
avg_lab = []
for i in set(idxTe):
sliced = as_df[as_df['idx'] == i]
num_vals = len(sliced)
avg_pred.append(sum(sliced['preds'].values) / num_vals)
avg_lab.append(sum(sliced['targs'].values) / num_vals)
scoring_df = pandas.DataFrame([list(set(idxTe)), avg_pred, avg_lab], index=['Original_Index', 'Avg_Pred', 'Original_Label']).T
scores = make_scores(scoring_df['Original_Label'].values, scoring_df['Avg_Pred'].values)
return scores, scoring_df
def get_batch3d(X, X_, size):
X = X.astype('float32')
X_ = X_.astype('float32')
idx = numpy.random.choice(len(X), size, replace=False)
return X[idx, :, :], X_[idx, :]
def get_batch2d(X, X_, size):
idx = numpy.random.choice(len(X), size, replace=False)
return X[idx, :], X_[idx, :]
def get_all_human_alleles_formatting_one(df):
"""
Ugly hack #1 to get the alleles in the same naming convention
:param df: df from file (train, test, or full)
:return: mapping from the shortened name to the original name of the allele
"""
df = df[df.species == 'human']
alleles_from_file = list(df.mhc.unique())
letter = [i.split('-')[1][0] for i in alleles_from_file]
loc = [i.split('-')[1].replace('*', '')[1:].split(':') for i in alleles_from_file]
simplified_name = []
for i, val in enumerate(loc):
if len(val) > 1:
simplified_name.append(letter[i] + val[0] + val[1])
else:
simplified_name.append(letter[i] + val[0])
return dict(zip(simplified_name, alleles_from_file))
def get_all_human_alleles_formatting_two(df):
"""
Ugly hack #2 to get the alleles in the same naming convention
:param df: df from file (train, test, or full)
:return: mapping from the shortened name to the original name of the allele
"""
df = df[df.species == 'human']
alleles_from_file = list(df.mhc.unique())
loc = [i.split('-')[1:] for i in alleles_from_file]
loc_ = [i[1] if len(i) > 1 else i[0][1:] for i in loc]
letter = [i[0] if len(i) > 1 else i[0][0] for i in loc]
simplified_name = []
for i, val in enumerate(loc_):
if len(val) > 1:
simplified_name.append(letter[i] + val)
else:
simplified_name.append(letter[i] + val)
return dict(zip(simplified_name, alleles_from_file))
def save_obj(obj, name):
with open(name + '.pkl', 'wb') as f:
pickle.dump(obj, f, protocol=2)
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)