-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_utils.py
88 lines (75 loc) · 3.1 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from numpy import isnan
from numpy import nan
from numpy import log2
from statistics import median
from scipy.stats import ttest_ind
from statistics import mean
import pandas as pd
def normalize(data_raw):
    """Log2-transform a table and centre every column on its median.

    Zeros are replaced with NaN before the logarithm is taken, so they are
    treated as missing values instead of producing ``-inf``.

    Args:
        data_raw: pandas DataFrame of numeric values; one median is computed
            per column.

    Returns:
        DataFrame of the same shape holding ``log2(value) - column median``,
        with NaN wherever the input was zero or missing.
    """
    data_log2 = log2(data_raw.replace(0, nan))
    # Series.median() skips NaN on its own and yields NaN for a column with no
    # usable values, so such a column comes back all-NaN instead of raising.
    return data_log2.apply(lambda column: column - column.median())
def get_of_type(cell_type, names, option='contains'):
    """Select the names that belong to a given cell type.

    Args:
        cell_type: Substring (or prefix) identifying the cell type.
        names: Iterable of name strings to filter.
        option: ``'contains'`` keeps names that include ``cell_type``
            anywhere; ``'start'`` keeps names that begin with it.

    Returns:
        List of matching names in their original order. For any other
        ``option`` the string ``'INVALID OPTION'`` is printed and returned
        instead of a list.
    """
    if option == 'contains':
        return [name for name in names if cell_type in name]
    if option == 'start':
        return [name for name in names if name.startswith(cell_type)]
    # Unknown matching mode: report it and hand back the sentinel string.
    print('INVALID OPTION')
    return 'INVALID OPTION'
def check_present(row):
    """Count the values in ``row`` that are not NaN.

    Args:
        row: Iterable of numeric values.

    Returns:
        Number of entries for which ``isnan`` is false.
    """
    return sum(not isnan(value) for value in row)
def check_three_of_each_type(row, cell_types=["1_B_", "1_T_"], option='contains', min_reps=3):
    """Tell whether every cell type has enough non-missing values in ``row``.

    Args:
        row: Labelled row (indexed with ``.loc``) whose index labels are
            matched against each cell type.
        cell_types: Identifiers of the cell types to check.
        option: Matching mode passed on to ``get_of_type``.
        min_reps: Minimum number of non-NaN values required per cell type.

    Returns:
        True if each cell type has at least ``min_reps`` non-NaN values,
        otherwise False. An empty ``cell_types`` gives True.
    """
    return all(
        check_present(row.loc[get_of_type(cell_type, row.index, option=option)]) >= min_reps
        for cell_type in cell_types
    )
def check_presence_absence(row, cell_types=["1_B_", "1_T_"], min_reps=3, option='contains'):
    """Find the cell type a row is present in when another type lacks it.

    Args:
        row: Labelled row (indexed with ``.loc``) whose index labels are
            matched against each cell type.
        cell_types: Identifiers of the cell types to compare. Only the first
            two entries are considered as candidates for the return value.
        min_reps: Minimum number of non-NaN values for a type to count as
            present.
        option: Matching mode passed on to ``get_of_type``.

    Returns:
        The first of the two leading cell types that has at least
        ``min_reps`` non-NaN values, provided at least one cell type has no
        non-NaN values at all; otherwise None.
    """
    counts = {
        cell_type: check_present(row.loc[get_of_type(cell_type, row.index, option=option)])
        for cell_type in cell_types
    }
    if 0 not in counts.values():
        # Observed in every cell type, so this is not a presence/absence case.
        return None
    # Slicing instead of indexing keeps a single-entry cell_types from raising.
    for candidate in cell_types[:2]:
        if counts[candidate] >= min_reps:
            return candidate
    return None
def ttest_wrapper(row, cell_types = ["1_B_", "1_T_"], option='contains'):
    """Run an independent two-sample t-test between two cell types of a row.

    Args:
        row: Labelled row (indexed with ``.loc``) whose index labels are
            matched against each cell type.
        cell_types: Identifiers of the cell types; the first two are compared.
        option: Matching mode passed on to ``get_of_type``.

    Returns:
        pandas Series with entries ``statistic`` and ``pvalue`` taken from
        ``scipy.stats.ttest_ind``.
    """
    groups = [
        row.loc[get_of_type(cell_type, row.index, option=option)]
        for cell_type in cell_types
    ]
    # NOTE(review): ttest_ind is called with its defaults, so NaN handling
    # follows scipy's default nan_policy - confirm inputs are NaN-free.
    result = ttest_ind(groups[0], groups[1])
    return pd.Series({'statistic': result[0], 'pvalue': result[1]})
def get_fold_changes(row, cell_types=["1_B_", "1_T_"], option='contains'):
    """Return the difference between the mean values of two cell types.

    Args:
        row: Labelled row (indexed with ``.loc``) whose index labels are
            matched against each cell type.
        cell_types: Identifiers of the cell types; the result is the mean of
            the first minus the mean of the second.
        option: Matching mode passed on to ``get_of_type``.

    Returns:
        ``mean(first type) - mean(second type)``.
        NOTE(review): presumably the values are already log2-scaled, which
        would make this a log2 fold change - confirm against the caller.
    """
    group_means = {
        cell_type: mean(row.loc[get_of_type(cell_type, row.index, option=option)])
        for cell_type in cell_types
    }
    return group_means[cell_types[0]] - group_means[cell_types[1]]
# Classify proteins as higher in B cells or higher in T cells.
def is_altered(tscore, pvalue=.01, change_factor=2, cell_types=["B cells", "T cells"]):
    """Label a row as significantly higher in one of two cell types.

    Args:
        tscore: Mapping or row providing ``'pvalue'`` and
            ``'log2(B)-log2(T)'`` entries.
        pvalue: Significance cutoff; the row's p-value must be strictly
            below it.
        change_factor: Minimum fold change; its log2 is the threshold applied
            to the log2 difference. Non-positive values give a threshold of 0.
            NOTE(review): a factor between 0 and 1 yields a negative
            threshold - presumably callers pass values >= 1; confirm.
        cell_types: Labels returned for the first and second cell type.

    Returns:
        ``cell_types[0]`` if the difference exceeds the threshold,
        ``cell_types[1]`` if it is below the negated threshold, and None when
        the row is not significant or the change is too small.
    """
    threshold = log2(change_factor) if change_factor > 0 else 0
    # Written as "not <" so that a NaN p-value is treated as not significant.
    if not tscore['pvalue'] < pvalue:
        return None
    difference = tscore['log2(B)-log2(T)']
    if difference > threshold:
        # First type is significantly higher.
        return cell_types[0]
    if difference < -threshold:
        # Second type is significantly higher.
        return cell_types[1]
    return None