/
test1.py
63 lines (57 loc) · 2.6 KB
/
test1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import os
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
l=[]
id_range=np.arange(0.1,1,0.1)
df=pd.DataFrame(np.zeros(shape=(len(id_range),7)),columns=['id','clust_no','max_size','min_size','avg_size','no_clust_all_genome','no_prot_in_multi'])
for i in range(len(id_range)):
id = id_range[i]
df['id'][i] = id
os.system('vsearch --cluster_size combine.fna --id '+str(id)+' --clusters STRING --log t.txt')
f = open('t.txt', 'r').readlines()
l.append(f[-7])
x=f[-7].split(' ')
df['clust_no'][i]=x[1]
df['max_size'][i]=int(x[6][:-1])
df['min_size'][i]=int(x[4][:-1])
df['avg_size'][i]=float(x[-1][:-1])
# remove all cluster files extracted
os.system("grep '>' STRING* > protein_name.txt")
############################## Dealing with data ################################
data = pd.read_csv("protein_name.txt", delimiter=':>', engine='python',
header=None, names=['cluster', 'protein'])
# dropping null value columns to avoid errors
data.dropna(inplace=True)
# new data frame with split value columns
new = data["protein"].str.split("_", n=1, expand=True)
# making seperate first name column from new data frame
data["accession_no"] = new[0]
# making seperate last name column from new data frame
data["prot"] = new[1]
# Dropping old Name columns
data.drop(columns=["protein"], inplace=True)
# create new column for checking prot from genome in cluster
data['unique'] = data['accession_no'] + '_' + data['prot']
unique = pd.crosstab(data['unique'], data['cluster'], margins=False)
# How many clusters have exactly one protein from every genome
clust = pd.crosstab(data['accession_no'], data['cluster'], margins=False)
df['no_clust_all_genome'][i] = int(sum(clust.sum(axis=0)>=clust.shape[0]))
# How many protein are assigned in more than 1 cluster
unique = pd.crosstab(data['unique'], data['cluster'], margins=False)
# unique.sum(axis=1)
x = pd.DataFrame(unique)
x['test'] = unique.sum(axis=1)
df['no_prot_in_multi'][i]=int(sum(x['test']>1))
# remove file protein_name.txt for next loop
os.system('rm STRING*')
# os.system('rm protein_name.txt')
output_file("test.html")
# create a new plot with default tools, using figure
p = figure(plot_width=400, plot_height=400)
# add a circle renderer with x and y coordinates, size, color, and alpha
p.circle(df['id'], df['clust_no'], size=15, line_color="navy", fill_color="orange", fill_alpha=0.5)
show(p)