-
Notifications
You must be signed in to change notification settings - Fork 64
/
preparing_AHBA_data.py
141 lines (111 loc) · 5.46 KB
/
preparing_AHBA_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'''
Requirements from old dockerfile:
RUN apt-get update && apt-get install -y \
libhdf5-dev \
libhdf5-8 && \
apt-get clean autoclean
RUN pip install numpy==1.11.0
RUN pip install nibabel \
tables==3.3.0 \
h5py==2.6.0 \
pandas==0.20.3 \
pybraincompare==0.1.18
'''
import urllib.request, urllib.parse, urllib.error
import zipfile
import os
import numpy as np
import nibabel as nb
import numpy.linalg as npl
import pandas as pd
from pybraincompare.mr.datasets import get_standard_mask
# Downloading microarray data
urls = ["http://human.brain-map.org/api/v2/well_known_file_download/178238387",
"http://human.brain-map.org/api/v2/well_known_file_download/178238373",
"http://human.brain-map.org/api/v2/well_known_file_download/178238359",
"http://human.brain-map.org/api/v2/well_known_file_download/178238316",
"http://human.brain-map.org/api/v2/well_known_file_download/178238266",
"http://human.brain-map.org/api/v2/well_known_file_download/178236545"]
donor_ids = [""]
download_dir = "./ahba_data"
os.makedirs(download_dir)
for i, url in enumerate(urls):
print("Downloading %s" % url)
urllib.request.urlretrieve(url, os.path.join(download_dir, "donor%d.zip" % (i + 1)))
zipfile.ZipFile(os.path.join(download_dir, "donor%d.zip" % (i + 1)))
# Dowloading MNI coordinates
urllib.request.urlretrieve(
"https://raw.githubusercontent.com/chrisfilo/alleninf/master/alleninf/data/corrected_mni_coordinates.csv",
os.path.join(download_dir, "corrected_mni_coordinates.csv"))
samples = pd.read_csv(os.path.join(download_dir, "corrected_mni_coordinates.csv"), index_col=0)
mni = get_standard_mask(voxdim=4)
reduced_coord = []
for coord_mni in samples[['corrected_mni_x', 'corrected_mni_y', 'corrected_mni_z']].values:
sample_counts = np.zeros(mni.shape, dtype=int)
coord_data = [int(round(i)) for i in nb.affines.apply_affine(npl.inv(mni.affine), coord_mni)]
sample_counts[coord_data[0],
coord_data[1],
coord_data[2]] = 1
out_vector = sample_counts[np.asanyarray(mni.dataobj)!=0]
idx = out_vector.argmax()
if idx == (out_vector == 1.0).sum() == 0:
idx = np.nan
reduced_coord.append(idx)
samples["reduced_coordinate"] = reduced_coord
#Downloading gene selections list
# old URL now 404s:
# http://science.sciencemag.org/highwire/filestream/631209/field_highwire_adjunct_files/2/Richiardi_Data_File_S2.xlsx
if not os.path.exists(os.path.join(download_dir, "Richiardi_Data_File_S2.xlsx")):
urllib.request.urlretrieve(
"https://github.com/amadeuskanaan/GluIRON/raw/master/AHBA/Richiardi_Data_File_S2.xlsx",
os.path.join(download_dir, "Richiardi_Data_File_S2.xlsx"))
donors = ["H0351.2001", "H0351.2002", "H0351.1009", "H0351.1012", "H0351.1015", "H0351.1016"]
with zipfile.ZipFile(os.path.join(download_dir, "donor1.zip")) as z:
with z.open("Probes.csv") as f:
probe_info_df = pd.read_csv(f, index_col=0)
# concatenating data from all donors
dfs = []
for i, donor_id in enumerate(donors):
with zipfile.ZipFile(os.path.join(download_dir, "donor%d.zip" % (i + 1))) as z:
with z.open("MicroarrayExpression.csv") as f:
df = pd.read_csv(f, header=None, index_col=0)
# Dropping probes without entrez_id
df.drop(probe_info_df.index[probe_info_df['entrez_id'].isnull()], inplace=True)
with z.open("SampleAnnot.csv") as f:
sample_annot_df = pd.read_csv(f, index_col="well_id").join(samples, how="inner")
df.columns = list(sample_annot_df["reduced_coordinate"])
# removing out side of the brain coordinates
df.drop(sample_annot_df.index[sample_annot_df.reduced_coordinate.isnull()], axis=1, inplace=True, errors='ignore')
# averaging measurements from similar locations
df = df.groupby(level=0, axis=1).mean()
df.columns = pd.MultiIndex.from_tuples([(donor_id, c,) for c in list(df.columns)],
names=['donor_id', 'reduced_coordinate'])
dfs.append(df)
expression_data = pd.concat(dfs, copy=False, axis=1)
del dfs
probe_info_df.drop(probe_info_df.index[probe_info_df['entrez_id'].isnull()], inplace=True)
multiindex = []
for index in expression_data.index:
entrez_id = int(probe_info_df.loc[index]["entrez_id"])
multiindex.append((entrez_id, index,))
expression_data.index = pd.MultiIndex.from_tuples(multiindex, names=['entrez_id', 'probe_id'])
means = expression_data.mean(axis=1)
idx_maxs = means.groupby(level=0).idxmax()
expression_data = expression_data.reindex(idx_maxs)
expression_data.index = expression_data.index.droplevel(1)
multiindex = []
for index in probe_info_df.index:
entrez_id = int(probe_info_df.loc[index]["entrez_id"])
multiindex.append((entrez_id, index,))
probe_info_df.index = pd.MultiIndex.from_tuples(multiindex, names=['entrez_id', 'probe_id'])
probe_info_df = probe_info_df.reindex(idx_maxs)
probe_info_df.index = probe_info_df.index.droplevel(1)
probe_info_df.to_csv(os.path.join(download_dir, 'probe_info_max1.csv'))
assert (expression_data.index[0] in list(probe_info_df.index))
assert (expression_data.shape == (20787, 3072))
with pd.HDFStore(os.path.join(download_dir, 'store_max1_reduced.h5'), 'w') as store:
for donor_id in donors:
store.append(donor_id.replace(".", "_"), expression_data[donor_id])
# Removing downloaded files
for i, url in enumerate(urls):
os.remove(os.path.join(download_dir, "donor%d.zip" % (i + 1)))