/
make_master_csv.py
87 lines (70 loc) · 2.97 KB
/
make_master_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# make master csv (concatenate all summaries)
import pandas as pd
from pathlib import Path
def vertical_stack():
outdir = Path("/mnt/md0/kfaizi/search/output/")
NCBI_datestamp = "_jul25"
final = Path("/mnt/md0/kfaizi/search/master.csv")
names = ['A',
'AAA',
'AAB',
'AAC',
'AAD',
'AAE',
'AAF',
'B',
'C',
'CAA',
'CAB',
'F',
'J',
'L',
'M',
'N',
'O',
'Q',
'R',
'V',
'P',
'S',
'U',
]
summary = None
for name in names:
name = name + NCBI_datestamp
sumdir = Path(outdir, name)
if summary is not None:
summary = list(set(sumdir.glob("*summary_csv.csv")) - set(sumdir.glob(".*")))[0] # exclude weird hidden binary excel files (from sshfs?)
df_new = pd.read_csv(summary, dtype=object, index_col=False)
df = pd.concat([df, df_new], axis=0)
else:
summary = list(set(sumdir.glob("*summary_csv.csv")) - set(sumdir.glob(".*")))[0]
df = pd.read_csv(summary, dtype=object, index_col=False)
with open(final, 'w') as f:
df.to_csv(f, mode='w', index=False, header=True)
def winnow(path, out_csv, out_fasta):
df = pd.read_csv(path, dtype=object, index_col=False)
df = df[df.DR_found == 'yes'] # discard hits w/o crisprs
df = df[df.protein_length.astype(int) > 500] # discard proteins < 500 aa
df = df[df.distance_protein_and_array.astype(float) < 20000] # discard proteins outside 20kb flanking sequence
df = df[df.pident.astype(float) < 90] # discard most similar hits
###### NEED TO FIX FILTERS > NOT STRINGENT ENOUGH > STILL GETTING 1000s OF HITS
# group rows by source id, sorted by % identity and crispr 'confidence', keeping only best
df = df.sort_values(['extracted_header', 'pident', 'crispr_type'], ascending=[True, False, True]).groupby('extracted_header', as_index=False).head(1)
# write filtered dataframe to new csv
with open(out_csv, 'w') as f:
df.to_csv(f, mode='w', index=False, header=True)
# write filtered proteins to a fasta file
with open(out_fasta, 'w') as f:
for row in df.itertuples():
f.write(">" + row.extracted_header + "\n")
f.write(row.protein_sequence + "\n")
# vertical_stack()
# winnow(Path("/mnt/md0/kfaizi/search/master.csv"), Path('/mnt/md0/kfaizi/search/filter_master.csv'), Path('/mnt/md0/kfaizi/search/filter_proteins.fasta'))
def fix(path, out_fasta):
df = pd.read_csv(path, dtype=object, index_col=False)
with open(out_fasta, 'w') as f:
for row in df.itertuples():
f.write(">" + row.extracted_header + "\n")
f.write(row.protein_sequence + "\n")
fix(Path("/Users/kianfaizi/Desktop/best_test.csv"), Path("/Users/kianfaizi/Desktop/best_prots.fasta"))