/
recovery_rates_bulgarelli.sh
108 lines (77 loc) · 2.79 KB
/
recovery_rates_bulgarelli.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash
# scripts for 16S data analysis
#
# originally by Ruben Garrido-Oter
# garridoo@mpipz.mpg.de
# exits whenever a function returns 1
set -e
# get path to scripts
scripts_dir=$(dirname $0)
# load config file
source $scripts_dir/config.sh
# load functions
source $scripts_dir/16s.functions.sh
# activate QIIME, etc.
source $scripts_dir/activate.sh
# parameters
min_length=315
min_id=97
# representative sequences
bulgarelli_rep_seqs=$working_dir/bulgarelli/rep_seqs.fasta
IRL_rep_seqs=$working_dir/IRLs/rep_seqs.fasta
RCP_rep_seqs=$working_dir/root_colony_picking/rep_seqs.fasta
sanger_rep_seqs=$data_dir/Root_isolates_Sanger.fasta
WGS_rep_seqs=$data_dir/Root_isolates_WGS.fasta
# prepare results directory
results_dir=$working_dir/recovery_rates_bulgarelli
mkdir -p $results_dir
rm -f -r $results_dir/*
output=$results_dir/"output.txt"
logfile=$results_dir/"log.txt"
# add label to seq IDs
log "preparing representative OTU sequences..."
cat $bulgarelli_rep_seqs | sed 's/>/>bulgarelli_/g' >> $results_dir/bulgarelli_rep_seqs.fasta
cat $IRL_rep_seqs | sed 's/>/>IRL_/g' >> $results_dir/IRL_rep_seqs.fasta
cat $RCP_rep_seqs | sed 's/>/>RCP_/g' >> $results_dir/RCP_rep_seqs.fasta
cat $sanger_rep_seqs | sed 's/>/>sanger_/g' >> $results_dir/sanger_rep_seqs.fasta
cat $WGS_rep_seqs | sed 's/>/>WGS_/g' >> $results_dir/WGS_rep_seqs.fasta
# concatenate recovered representative sequences
# from the IRL and colony picking methods
cat $results_dir/IRL_rep_seqs.fasta \
$results_dir/RCP_rep_seqs.fasta \
$results_dir/sanger_rep_seqs.fasta \
$results_dir/WGS_rep_seqs.fasta \
>> $results_dir/rec_rep_seqs.fasta
# build database using the representative sequence of the
# root-associated OTUs from the culture independent studies
target=$results_dir/rec_rep_seqs.fasta
query=$results_dir/bulgarelli_rep_seqs.fasta
log "building blast database..."
formatdb -n $results_dir/blastDB_root \
-i $target \
-p false \
&>> $output
# blast the 16S sequences found in the genomes to
# the reference DB of representative sequences
log "blasting sequences..."
blastall -p blastn \
-e 1e-10 \
-m 8 \
-a $n_cores \
-v 100000 \
-b 100000 \
-i $query \
-d $results_dir/blastDB_root \
-o $results_dir/blast_results_bulgarelli.txt \
&>> $output
# cleanup
rm -f blastDB_root* formatdb.log error.log
# parse the blast results taking matches > 97% identity
# over at least 99% of the reference sequences length (315 bp)
log "parsing results..."
$scripts_dir/recovery_rates_bulgarelli.R $working_dir \
$results_dir \
$min_length \
$min_id \
&>> $output
log "DONE!"