-
Notifications
You must be signed in to change notification settings - Fork 11
/
make_protein_stat.py
32 lines (24 loc) · 1.26 KB
/
make_protein_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import FeatureUnion,Pipeline,make_union,make_pipeline

from common import timer,read_csv,ItemSelector,TextStats
# Build per-protein features from the "Sequence" column of the combined
# train and test protein tables and write them, keyed by Protein_ID, to a
# CSV file.

with timer("Load data"):
    df_protein_train = read_csv("df_protein_train.csv")
    df_protein_test = read_csv("df_protein_test.csv")
    # Train and test rows are stacked so the vectorisers below are fitted on
    # both sets in a single pass.
    df_protein = pd.concat([df_protein_train, df_protein_test])
    # Vectorised upper-casing of every sequence.
    # NOTE(review): the two text vectorisers are left at their default
    # `lowercase` setting, so this normalisation probably only matters for
    # TextStats -- confirm the intent.
    df_protein["Sequence"] = df_protein["Sequence"].str.upper()

# Three feature groups, each computed from the "Sequence" column and
# concatenated column-wise by the union:
#   1. raw single-character counts,
#   2. single-character term frequencies with IDF weighting disabled,
#   3. the output of TextStats, turned into columns by DictVectorizer
#      (presumably one dict of statistics per sequence -- see `common`).
feature_union = make_union(
    make_pipeline(
        ItemSelector(key="Sequence"),
        CountVectorizer(analyzer="char", ngram_range=(1, 1)),
    ),
    make_pipeline(
        ItemSelector(key="Sequence"),
        TfidfVectorizer(analyzer="char", ngram_range=(1, 1), use_idf=False),
    ),
    make_pipeline(
        ItemSelector(key="Sequence"),
        TextStats(),
        DictVectorizer(),
    ),
)

with timer("Fit feature_union"):
    feat = feature_union.fit_transform(df_protein)

# Feature columns are named positionally: protein_stat_0, protein_stat_1, ...
out_col = [f"protein_stat_{i}" for i in range(feat.shape[1])]
output_file = "./input/temp/df_protein_stat.csv"
with timer(f"Save file to {output_file}"):
    # Create the target directory up front so the write cannot fail on a
    # missing folder after the (potentially slow) feature fit.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # toarray() returns a plain ndarray; todense() would return the
    # discouraged numpy.matrix type.
    df_out = pd.DataFrame(feat.toarray(), columns=out_col)
    # .values bypasses index alignment: the concatenated frame keeps the
    # original (possibly repeated) row labels of the train and test tables.
    df_out["Protein_ID"] = df_protein["Protein_ID"].values
    # Write Protein_ID first, followed by the feature columns.
    df_out[["Protein_ID"] + out_col].to_csv(output_file, index=False)