Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…into main
  • Loading branch information
HWaymentSteele committed Mar 22, 2022
2 parents df58b98 + 826c3c4 commit de25298
Show file tree
Hide file tree
Showing 25 changed files with 1,541 additions and 936 deletions.
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -76,6 +76,8 @@ Install via `pip install -r requirements.txt` or `conda install --file requirements
export KOV_PATH='/path/to/KaggleOpenVaccine'
```

(Est. setup time: 10 min.)

## Usage

To run the nullrecurrent winning solution on one construct, given in `example.txt`:
Expand Down Expand Up @@ -113,11 +115,13 @@ This writes a text file of output predictions to `predict.txt`:
```
2.1289976365, 2.650808962, 2.1869660805000004
```
(Runtime: 1 minute on a 1.4 GHz Intel Core i5 processor).

(DegScore-XGBoost output)
```
0.2697107, 0.37091506, 0.48528114
```
(Runtime: 5 sec on a 1.4 GHz Intel Core i5 processor).

### A note on energy model versions

Expand Down
Binary file not shown.
Binary file added data/Manuscript_Supplementary_Information.xlsx
Binary file not shown.
Binary file modified data/mRNA_233x_data/.DS_Store
Binary file not shown.

Large diffs are not rendered by default.

85 changes: 69 additions & 16 deletions data/mRNA_233x_data/RegenerateFigure5.ipynb

Large diffs are not rendered by default.

31 changes: 15 additions & 16 deletions data/mRNA_233x_data/collate_predictions.py
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd

df = pd.read_csv('degradation_rates_081120_233x_stability_WITH_ERRORS.csv')
df = pd.read_csv('233x_sequences_degdata_081120.csv')

print(df.keys())

Expand All @@ -12,28 +12,28 @@

df['length'] = [len(x) for x in df['RNA_sequence']]

df['SUP DegScore2.1 FULL'] = np.nansum(np.loadtxt('Degscore_2.1_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP DegScore2.1 PCR'] = np.nansum(np.loadtxt('Degscore_2.1_flat_PCR_233x.csv',delimiter=','),axis=1)
df['SUP DegScore2.1 FULL'] = np.nansum(np.loadtxt('formatted_predictions/Degscore_2.1_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP DegScore2.1 PCR'] = np.nansum(np.loadtxt('formatted_predictions/Degscore_2.1_flat_PCR_233x.csv',delimiter=','),axis=1)

df['SUP Vienna FULL'] = np.nansum(np.loadtxt('P_UNP_vienna_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP Vienna PCR'] = np.nansum(np.loadtxt('P_UNP_vienna_flat_PCR_233x.csv',delimiter=','),axis=1)
df['SUP Vienna FULL'] = np.nansum(np.loadtxt('formatted_predictions/P_UNP_vienna_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP Vienna PCR'] = np.nansum(np.loadtxt('formatted_predictions/P_UNP_vienna_flat_PCR_233x.csv',delimiter=','),axis=1)

df['SUP EternaFold FULL'] = np.nansum(np.loadtxt('P_UNP_eternafold_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP EternaFold PCR'] = np.nansum(np.loadtxt('P_UNP_eternafold_flat_PCR_233x.csv',delimiter=','),axis=1)
df['SUP EternaFold FULL'] = np.nansum(np.loadtxt('formatted_predictions/P_UNP_eternafold_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP EternaFold PCR'] = np.nansum(np.loadtxt('formatted_predictions/P_UNP_eternafold_flat_PCR_233x.csv',delimiter=','),axis=1)

df['SUP nullrecurrent FULL'] = np.nansum(np.loadtxt('nullrecurrent_FULL_233x.csv',delimiter=','),axis=1)
df['SUP nullrecurrent PCR'] = np.nansum(np.loadtxt('nullrecurrent_PCR_233x.csv',delimiter=','),axis=1)
df['SUP nullrecurrent FULL'] = np.nansum(np.loadtxt('formatted_predictions/nullrecurrent_posthoc_hkws_FULL_233x.csv',delimiter=','),axis=1)
df['SUP nullrecurrent PCR'] = np.nansum(np.loadtxt('formatted_predictions/nullrecurrent_posthoc_hkws_PCR_233x.csv',delimiter=','),axis=1)

df['SUP kazuki2 FULL'] = np.nansum(np.loadtxt('kazuki2_deg_Mg_pH10_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP kazuki2 PCR'] = np.nansum( np.loadtxt('kazuki2_deg_Mg_pH10_flat_PCR_233x.csv',delimiter=','),axis=1)
df['SUP kazuki2 FULL'] = np.nansum(np.loadtxt('formatted_predictions/kazuki2_deg_Mg_pH10_flat_FULL_233x.csv',delimiter=','),axis=1)
df['SUP kazuki2 PCR'] = np.nansum( np.loadtxt('formatted_predictions/kazuki2_deg_Mg_pH10_flat_PCR_233x.csv',delimiter=','),axis=1)

df['SUP Degscore-XGB FULL'] = np.nansum(np.loadtxt('Degscore-XGB_FULL_233x.csv',delimiter=','),axis=1)
df['SUP Degscore-XGB PCR'] = np.nansum(np.loadtxt('Degscore-XGB_PCR_233x.csv',delimiter=','),axis=1)
df['SUP Degscore-XGB FULL'] = np.nansum(np.loadtxt('formatted_predictions/Degscore-XGB_FULL_233x.csv',delimiter=','),axis=1)
df['SUP Degscore-XGB PCR'] = np.nansum(np.loadtxt('formatted_predictions/Degscore-XGB_PCR_233x.csv',delimiter=','),axis=1)

df['SUP nr_k2_ensembled FULL'] = df.apply(lambda row: 0.5*(row['SUP nullrecurrent FULL']+row['SUP kazuki2 FULL']), axis=1)
df['SUP nr_k2_ensembled PCR'] = df.apply(lambda row: 0.5*(row['SUP nullrecurrent PCR']+row['SUP kazuki2 PCR']), axis=1)

predictor_list = ['Vienna', 'EternaFold', 'DegScore2.1', 'Tunguz','nullrecurrent','kazuki2', 'nr_k2_ensembled']
predictor_list = ['Vienna', 'EternaFold', 'DegScore2.1', 'Degscore-XGB','nullrecurrent','kazuki2', 'nr_k2_ensembled']

for pred in predictor_list:
df['AUP %s PCR'% pred] = df['SUP %s PCR'%pred]/df['RT_PCR_length']
Expand All @@ -50,5 +50,4 @@
print(df[lst].corr())
#print(df[lst+['Expt type']].groupby('Expt type').corr())

df.to_csv('all_collated_predictions_233x.csv',index=False)

df.to_csv('collated_predictions_all_models_233x.csv',index=False)

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file modified data/mRNA_233x_data/predictions/.DS_Store
Binary file not shown.
233 changes: 0 additions & 233 deletions data/mRNA_233x_data/predictions/233_output_13Oct2021.csv

This file was deleted.

233 changes: 0 additions & 233 deletions data/mRNA_233x_data/predictions/nullrecurrent_output_bt_Sep2021.txt

This file was deleted.

This file was deleted.

14 changes: 7 additions & 7 deletions data/mRNA_233x_data/reformat_nullrecurrent.py
Expand Up @@ -2,20 +2,20 @@
import pandas as pd
import gzip

df = pd.read_csv('../233x_sequences_degdata_081120.csv.csv')
fil='flat_output_file_NR.csv'
df = pd.read_csv('233x_sequences_degdata_081120.csv')
fil='predictions/nullrecurrent_233x_output_14Oct2021.csv'

output_array = np.ones([233,1588])*np.nan

for i,x in enumerate(open(fil,'r').readlines()):
dat = [float(k) for k in x.strip().split(',')]
output_array[i,:len(dat)] = dat

np.savetxt('formatted_predictions/nullrecurrent_FULL_233x.csv',output_array, delimiter=',')
np.savetxt('formatted_predictions/nullrecurrent_posthoc_hkws_FULL_233x.csv',output_array, delimiter=',')

for i, row in df.iterrows():
if not np.isnan(row['startpos']):
output_array[i, :int(row['startpos'])] = np.NaN
output_array[i, int(row['endpos']):] = np.NaN
if not np.isnan(row['RT_PCR_start_pos']):
output_array[i, :int(row['RT_PCR_start_pos'])] = np.NaN
output_array[i, int(row['RT_PCR_end_pos']):] = np.NaN

np.savetxt('formatted_predictions/nullrecurrent_PCR_233x.csv',output_array, delimiter=',')
np.savetxt('formatted_predictions/nullrecurrent_posthoc_hkws_PCR_233x.csv',output_array, delimiter=',')
Binary file not shown.
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 0 additions & 5 deletions scripts/nullrecurrent_inference.py
Expand Up @@ -21,11 +21,6 @@
from sklearn.model_selection import train_test_split,KFold, GroupKFold,StratifiedKFold

from tensorflow.keras import losses


from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from sklearn.metrics import mean_squared_error

import tensorflow.keras as keras
Expand Down

0 comments on commit de25298

Please sign in to comment.