/
preprocess.py
49 lines (39 loc) · 1.56 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from nilearn import signal
from project.helper_functions import set_random_seed
def impute_random(arr_in):
    """Return a copy of *arr_in* with NaN entries replaced by random draws.

    Replacement values are sampled (with replacement) from the non-NaN
    entries of the same array via ``np.random.choice``.  The input itself is
    never modified.

    ``set_random_seed()`` is called first on every invocation.
    NOTE(review): presumably this reseeds NumPy's global RNG so the fill is
    reproducible -- confirm in ``project.helper_functions``.

    Parameters
    ----------
    arr_in : array-like supporting ``.copy()``, ``np.isnan`` and boolean-mask
        indexing (e.g. ``np.ndarray`` or ``pd.Series``).  Arrays of any
        dimensionality are accepted.

    Returns
    -------
    Same type as ``arr_in``; no NaN left unless there was nothing to impute.

    Raises
    ------
    ValueError
        If there are NaN entries but no observed value to draw from.
    """
    set_random_seed()
    arr = arr_in.copy()  # work on a copy so the caller's data stays untouched
    nan_mask = np.isnan(arr)
    # Count over the whole mask; the builtin sum() would only give a scalar
    # for 1-D input and a per-column array otherwise.
    n_nan = int(np.count_nonzero(nan_mask))
    observed = arr[~nan_mask]
    if n_nan and len(observed) == 0:
        raise ValueError("cannot impute: input contains only NaN values")
    arr[nan_mask] = np.random.choice(observed, size=n_nan)
    return arr
def balanced_sample(column):
    """Flag a class-balanced random subset of the rows of *column*.

    For every distinct value in ``column`` exactly ``k`` rows are flagged,
    where ``k`` is the count of the rarest value.  Which rows of each class
    are picked is decided by ``np.random.shuffle`` (NumPy's global RNG).

    Parameters
    ----------
    column : pd.Series
        Class labels.  NaN labels are ignored by ``value_counts`` and are
        therefore never selected.

    Returns
    -------
    pd.Series
        Float series of 0/1 flags, same length and same index as ``column``
        (1 = row selected).  An empty or all-NaN column yields all zeros.
    """
    # Build the flags positionally in NumPy so the result does not depend on
    # how the index of ``column`` aligns with a fresh RangeIndex.
    flags = np.zeros(len(column))
    counts = column.value_counts()
    if counts.empty:
        return pd.Series(flags, index=column.index)

    n_smallest = counts.min()
    for value, n_value in counts.items():
        picks = np.zeros(n_value)
        picks[:n_smallest] = 1
        np.random.shuffle(picks)  # random choice of rows within this class
        flags[(column == value).to_numpy()] = picks
    return pd.Series(flags, index=column.index)
def balance_x_y(x, y):
    """Subsample *x* and *y* so that every class in *y* is equally frequent.

    ``set_random_seed()`` is called first, then ``balanced_sample`` decides
    which rows to keep.

    Parameters
    ----------
    x : pandas object with the same number of rows as ``y``
    y : pd.Series of class labels

    Returns
    -------
    tuple ``(x, y, y_ind)``
        ``x`` and ``y`` restricted to the kept rows; both carry the
        positional (reset) index, not the original one.  ``y_ind`` is a
        NumPy array with the original index labels of ``y`` for those rows.
    """
    set_random_seed()
    original_labels = y.index.values
    # temp: drop the original indices so x, y and the mask line up by position
    x_positional = x.reset_index(drop=True)
    y_positional = y.reset_index(drop=True)
    keep = balanced_sample(y_positional) == 1
    y_balanced = y_positional[keep]
    x_balanced = x_positional.loc[keep]
    return x_balanced, y_balanced, original_labels[keep]
def clean_volumes(volumes,y, by_col):
    """Pass *volumes* through ``signal.clean`` with columns of *y* as confounds.

    Each column named in ``by_col`` is taken from ``y``, its NaNs replaced
    via ``np.nan_to_num``, and standardized with ``StandardScaler``; the
    resulting columns are stacked side by side into the confound matrix.
    ``signal.clean`` is called with detrending and standardization off.
    NOTE(review): per nilearn's documentation this regresses the confounds
    out of the signals -- confirm against the installed nilearn version.

    Parameters
    ----------
    volumes : pd.DataFrame
    y : pd.DataFrame (or mapping of column name -> pd.Series) holding the
        confound columns
    by_col : iterable of column names in ``y``

    Returns
    -------
    tuple ``(volumes_deconf, y)``
        ``volumes_deconf`` is a new DataFrame with the index of ``volumes``
        (column labels are not carried over); ``y`` is returned unchanged.
    """
    def _scaled_confound(col_name):
        # One (n_rows, 1) column: NaN -> 0, then zero mean / unit variance.
        raw = y[col_name].values[:, None]
        scaled = StandardScaler().fit_transform(np.nan_to_num(raw))
        return np.atleast_2d(scaled)

    confound_matrix = np.hstack([_scaled_confound(name) for name in by_col])
    cleaned = signal.clean(
        volumes.values,
        confounds=confound_matrix,
        detrend=False,
        standardize=False,
    )
    cleaned_frame = pd.DataFrame(cleaned, index=volumes.index)
    return cleaned_frame, y