-
Notifications
You must be signed in to change notification settings - Fork 0
/
cross_utils.py
260 lines (175 loc) · 8.75 KB
/
cross_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# -*- coding: utf-8 -*-
"""
This file contains the functions needed to identify cross-cancer patients.
@author: Duygu Ay
"""
import numpy as np
import pandas as pd
import scipy.optimize
from sklearn.metrics import silhouette_samples
import seaborn as sns
def bestMap(L1, L2):
    """Map cluster prediction labels onto the ground-truth labels.

    Counts, for every (ground-truth class, predicted cluster) pair, how many
    samples fall in both, then solves the linear sum assignment problem on
    that table so the total overlap of the matched pairs is maximal.

    L1: 1-D array-like of ground-truth labels
    L2: 1-D array-like of cluster predictions, same length as L1
    .....
    Output: Integer array of the cluster predictions relabelled with the
        matched ground-truth label values. When there are more clusters than
        ground-truth classes, a cluster that ends up without a class keeps the
        index of the assignment column it was matched to as its label.
    Raises: ValueError if L1 and L2 have different lengths.
    """
    L1 = np.asarray(L1)
    L2 = np.asarray(L2)
    if len(L1) != len(L2):
        raise ValueError("Error! L1 size and L2 size not equal")

    Label1 = np.unique(L1)
    nClass1 = len(Label1)
    Label2 = np.unique(L2)
    nClass2 = len(Label2)
    # The assignment solver is fed a square table; unused rows/columns stay 0.
    nClass = max(nClass1, nClass2)

    # G[i, j] = number of samples of class Label1[i] placed in cluster Label2[j].
    G = np.zeros((nClass, nClass))
    for i, true_label in enumerate(Label1):
        in_class = L1 == true_label
        for j, cluster_label in enumerate(Label2):
            G[i, j] = np.count_nonzero(in_class & (L2 == cluster_label))

    # Rows of -G.T are clusters, columns are classes; negating turns the
    # cost minimisation into an overlap maximisation.
    _, t = scipy.optimize.linear_sum_assignment(-G.T)

    newL2 = np.zeros(len(L2))
    for j, cluster_label in enumerate(Label2):
        members = L2 == cluster_label
        if t[j] >= nClass1:
            # Matched to a padding column: there is no ground-truth label for
            # it, so fall back to the column index itself.
            newL2[members] = t[j]
        else:
            newL2[members] = Label1[t[j]]
    return np.asarray(newL2, int)
def create_patient_matrix(Y_tr, Y_te, all_case, FLAGS):
    """Create the patient by patient matrix of pairwise similarity scores.

    The score of two patients is the fraction of clusterings (one per entry
    of FLAGS.num_groups) in which both patients received the same label.

    Y_tr: Cancer type labels of train data (pandas object indexed by patient)
    Y_te: Cancer type labels of test data (pandas object indexed by patient)
    all_case: Mapping of cluster predictions for all numbers of clusters;
        one 1-D array per entry of FLAGS.num_groups, ordered like Y_tr
        followed by Y_te.
    FLAGS: configuration object; only FLAGS.num_groups is read here.
    .....
    Output: DataFrame of patient by patient matrix. Rows are sorted by cancer
        type, columns follow the same patient order, and the last column
        'label' holds the cancer type. Duplicate patient ids are dropped
        (first occurrence kept). The frame is also written to
        'patients_matrix.csv' in the current working directory.
    """
    cancer_types = pd.concat([Y_tr, Y_te])
    sample_class = pd.DataFrame(
        np.column_stack(all_case),
        index=cancer_types.index,
        columns=FLAGS.num_groups,
    )

    # patient by patient matrix
    assignments = sample_class.values
    n_patients = assignments.shape[0]
    co_clustered = np.zeros((n_patients, n_patients))
    # Iterate over the cluster ids that actually occur instead of
    # range(max + 1): this also covers negative ids and skips empty ones.
    for cluster_id in np.unique(assignments):
        members = (assignments == cluster_id).astype(int)
        # members @ members.T counts, per patient pair, the clusterings in
        # which both carry cluster_id.
        co_clustered += np.dot(members, members.T)
    co_clustered /= len(FLAGS.num_groups)

    patients_matrix = pd.DataFrame(
        co_clustered, index=sample_class.index, columns=sample_class.index
    )
    patients_matrix['label'] = cancer_types
    # A stable sort keeps the incoming patient order inside each cancer type,
    # which makes the output reproducible.
    patients_matrix = patients_matrix.sort_values(
        by=patients_matrix.columns[-1], kind='mergesort'
    )

    unique_cols = patients_matrix.loc[:, ~patients_matrix.columns.duplicated()]
    deduped = unique_cols.loc[~unique_cols.index.duplicated(keep='first')]

    # Reorder the score columns to follow the (sorted) row order; copy so the
    # label column is added to an independent frame, not to a slice.
    scores = deduped.iloc[:, :-1]
    patient_matrix = scores[scores.index.tolist()].copy()
    patient_matrix['label'] = deduped['label']
    patient_matrix.to_csv('patients_matrix.csv', index=True)
    return patient_matrix
def similarity_map_fig(patient_matrix, FLAGS):
    """Draw the patient similarity map and save it as a PNG.

    patient_matrix: DataFrame of pairwise similarity matrix of all patients;
        the last column consists of the labels.
    FLAGS: configuration object; FLAGS.plot_dir is prepended (plain string
        concatenation) to the output file name.
    .....
    Output: None. The figure is saved to FLAGS.plot_dir + "patients_map.png".
    """
    # Pull the labels out through a two-level index (patient id, label).
    label_column = patient_matrix.columns[-1]
    indexed = patient_matrix.set_index([patient_matrix.index, label_column])
    indexed.index.names = ['index', ""]
    labels = indexed.index.get_level_values("")

    # One palette colour per distinct label, keyed by the label as an int.
    palette = sns.cubehelix_palette(
        labels.unique().size,
        light=.9, dark=.1, reverse=True,
        start=1, rot=-2,
    )
    label_keys = map(int, np.unique(labels).tolist())
    colour_of = dict(zip(label_keys, palette))
    side_colours = pd.Series(labels, index=patient_matrix.columns[:-1]).map(colour_of)

    scores = patient_matrix.iloc[:, :-1]
    grid = sns.clustermap(
        scores,
        # Turn off the clustering
        row_cluster=False, col_cluster=False,
        # Add colored class labels
        row_colors=side_colours, col_colors=side_colours,
        # Make the plot look better when many rows/cols
        linewidths=0, xticklabels=False, yticklabels=False, cmap='Reds',
    )

    # Decoration: zero-sized bars are drawn only so that each label gets a
    # legend entry in the (otherwise unused) column-dendrogram axes.
    legend_axes = grid.ax_col_dendrogram
    for label in labels.unique():
        legend_axes.bar(0, 0, color=colour_of[label], label=label, linewidth=0)
    legend_axes.legend(loc="center", ncol=5)

    grid.cax.set_position([.97, .2, .03, .45])
    grid.fig.suptitle('Patients Similarity Map', x=0.6, y=0.88)

    heatmap_axes = grid.ax_heatmap
    heatmap_axes.set_xlabel('Patients')
    heatmap_axes.set_ylabel('Patients')

    grid.savefig(FLAGS.plot_dir + "patients_map.png", dpi=300)
def extract_sim_samples_across_cancers(patient_matrix, data_all):
    """Extract patient pairs with similarity score 1 from different cancer types.

    patient_matrix: DataFrame of patient by patient matrix that consists of
        pairwise similarity scores; the last column holds the cancer type.
    data_all: all data (DataFrame indexed by patient id)
    .....
    Output:
        DataFrame of similar patients across cancers with their cancer types
            (also written to 'similar_patients_across_cancers.csv' in the
            current working directory).
        Index of these patients (list, in order of first appearance).
        Index numbers of these patients (row positions within data_all).
    """
    scores = patient_matrix.iloc[:, :-1].values
    cancer = patient_matrix.iloc[:, -1].values
    patients = patient_matrix.index.values

    # Only the strict upper triangle is scanned so each unordered pair is
    # reported once; a pair qualifies when its score is exactly 1 and the two
    # cancer types differ. np.nonzero returns the hits in row-major order.
    perfect = np.triu(scores == 1, k=1)
    different_cancer = cancer[:, None] != cancer[None, :]
    rows, cols = np.nonzero(perfect & different_cancer)

    cross_matrix = pd.DataFrame(
        {
            'First Patient': patients[rows],
            'Second Patient': patients[cols],
            'First Patient Cancer': cancer[rows],
            'Second Patient Cancer': cancer[cols],
        },
        columns=['First Patient', 'Second Patient',
                 'First Patient Cancer', 'Second Patient Cancer'],
    )
    cross_matrix.to_csv('similar_patients_across_cancers.csv', index=True)

    # dict.fromkeys de-duplicates while keeping a reproducible order.
    involved = (cross_matrix['First Patient'].tolist()
                + cross_matrix['Second Patient'].tolist())
    cross_index = list(dict.fromkeys(involved))

    cross_index_num = np.flatnonzero(data_all.index.isin(cross_index)).tolist()
    return cross_matrix, cross_index, cross_index_num
def compute_silhouette_samp(kmeans, labels, X):
    """Return the per-sample silhouette coefficients of X under `labels`.

    kmeans: estimator exposing fit(); it is fitted on X as a side effect, but
        the coefficients below are computed from `labels`, not from its output.
    labels: Ground-truth labels, one per sample of X
    X: Input samples
    .....
    Output: Silhouette coefficient of every sample (euclidean metric).
    """
    kmeans.fit(X)
    return silhouette_samples(X, labels, metric='euclidean')
def extract_cross_cancer_samp(s_k, cross_matrix, cross_index, data_all, FLAGS):
    """Label as cross-cancer the similar patients whose silhouette is negative.

    A patient listed in cross_index is reported as a cross-cancer patient when
    its silhouette coefficient is negative for every one of the first
    FLAGS.num_models models.

    s_k: Silhouette coefficients of similar samples across cancers for all
        models; one array per model, ordered like the rows of data_all that
        are in cross_index.
    cross_matrix: DataFrame of similar patients across cancers with their
        cancer types (columns 'First Patient', 'Second Patient',
        'First Patient Cancer', 'Second Patient Cancer').
    cross_index: Ids of the patients that appear in cross_matrix.
    data_all: all data; column number FLAGS.dimension is read as the label.
    FLAGS: configuration object; num_groups, dimension and num_models are read.
    .....
    Output: Dataframe of cross-cancer patients and similar patients to them
        and cancer types. Also written to 'cross_cancer_patients.csv' in the
        current working directory.
    """
    # [s_k] holds a single (n_models, n_samples) array, so the concatenate is
    # a no-op; the transpose gives one row per sample.
    scores = np.concatenate([s_k], axis=1).T
    column_names = [str(g) for g in FLAGS.num_groups]
    column_names.append('label')

    cross_df = data_all[data_all.index.isin(cross_index)]
    cross_silh = pd.DataFrame(
        np.column_stack((scores, cross_df.iloc[:, FLAGS.dimension].values)),
        index=cross_df.index,
        columns=column_names,
    )

    # Patients whose score is negative under every model.
    model_scores = cross_silh.iloc[:, 0:FLAGS.num_models]
    silh_neg_index = cross_silh.index[(model_scores < 0).all(axis=1)]
    # Remaining similar patients, i.e. those not flagged as cross-cancer.
    pairs = set(cross_index) - set(silh_neg_index)
    # Copy: new columns are added below and must not target a slice.
    cr = cross_silh[cross_silh.index.isin(silh_neg_index)].copy()

    sim = []
    n = []
    cancer = []
    for patient in silh_neg_index:
        in_pair = ((cross_matrix['First Patient'] == patient)
                   | (cross_matrix['Second Patient'] == patient))
        sub = cross_matrix[in_pair]
        # Keep the partners in order of appearance so that the choice of
        # partners[0] below (which decides the reported cancer type) is
        # reproducible from run to run.
        listed = sub['First Patient'].tolist() + sub['Second Patient'].tolist()
        partners = [pid for pid in dict.fromkeys(listed) if pid in pairs]
        if partners:
            num = len(partners)
            first = partners[0]
            subsub = sub[(sub['First Patient'] == first)
                         | (sub['Second Patient'] == first)]
            if (subsub['First Patient'] == first).any():
                can = subsub['First Patient Cancer'].values
            else:
                can = subsub['Second Patient Cancer'].values
        else:
            # Every partner is itself flagged as cross-cancer.
            # NOTE(review): the count is fixed to 1 and the partner list is
            # taken from one column only, as in the original code - confirm
            # this is the intended reporting for this case.
            num = 1
            if (sub['First Patient'] == patient).any():
                can = sub['Second Patient Cancer'].values
                partners = sub['Second Patient'].tolist()
            else:
                can = sub['First Patient Cancer'].values
                partners = sub['First Patient'].tolist()
        sim.append(partners)
        n.append(num)
        cancer.append(can[0])

    cr['Patients Similar to'] = sim
    cr['Cancer Type of Patients Similar to'] = cancer
    cr['Number of Patients Similar to'] = n
    # Drop the per-model silhouette columns from the report.
    cro = cr.drop(cr.columns[0:FLAGS.num_models], axis=1)
    cro.rename(columns={'label': 'Cross-cancer Type'}, inplace=True)
    cro.to_csv('cross_cancer_patients.csv', index=True)
    return cro