/
LjAt_IRL.sh
executable file
·197 lines (159 loc) · 6.05 KB
/
LjAt_IRL.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/bin/bash
# scripts for 16S data filter and OTU clustering for 454 data,
# truncate reads by the alignment of primer sequences
# Author guan@mpipz.mpg.de, modified from garridoo@mpipz.mpg.de

# exit whenever a command returns a non-zero status
set -e

# get path to the directory containing this script and its helpers
# (quoted so paths containing spaces do not break)
scripts_dir=$(dirname "$0")

# activate the environment and load the config file.
# config.sh is expected to define: working_dir, data_dir, qual, bc_err,
# min_size, gold_db, n_cores, gg_core_aligned_db, min_id_aln, usearch_dir,
# id_threshold, silva_core_db, silva_taxonomy, tax_method -- TODO confirm
source "$scripts_dir/activate.sh"
source "$scripts_dir/config.sh"
# log(): append a UTC-timestamped message to $logfile.
# Uses "$*" so multi-word messages are logged in full, and printf so the
# message is never word-split or glob-expanded (the original unquoted
# 'echo ... $1' could expand wildcards inside the message).
log(){
    printf '%s: %s\n' "$(date -u)" "$*" >> "$logfile"
}
# prepare results directory for step 1
prefix=$(date +%d%m%Y)
working_dir=$working_dir/${prefix}_s1
output=$working_dir/"output.txt"
logfile=$working_dir/"log.txt"
mkdir -p "$working_dir"
# ${working_dir:?} aborts instead of expanding to 'rm -rf /*' if the
# variable is somehow empty
rm -f -r "${working_dir:?}"/*

# Get the list of 454 <run> identifiers used in step 1 and step 2:
# one entry per $data_dir/s1/*.fasta file, with the directory part and the
# .fasta extension stripped.  A shell glob is used instead of the original
# 'ls -l | awk | sed' pipeline, which breaks on unusual file names.
l_list=$(for f in "$data_dir"/s1/*.fasta; do
    f=${f##*/}
    printf '%s ' "${f%.fasta}"
done)
# Get reads with both forward/reverse primer sequences and truncate reads
# according to reverse primer position
for l in $l_list
do

    # initialize per-run results directory
    log "[$l] initializing the workdir for 454 step 1..."
    rm -f -r "$working_dir/$l"
    mkdir "$working_dir/$l"

    # truncate reads to equal length using the primer alignments
    # (all paths quoted; argument order is positional and must match the
    # perl helper's expectations)
    log "[$l] truncating reads..."
    perl "$scripts_dir/truncate_by_length.pl" -nr "$scripts_dir/forward_primer.fasta" \
        "$scripts_dir/reverse_primer.fasta" \
        "$data_dir/s1/$l.fasta" \
        "$data_dir/s1/$l.qual" \
        "$working_dir/$l/filtered.fasta" \
        "$working_dir/$l/filtered.qual" \
        &>> "$output"

done

log "Step1 454 data truncate DONE!"
## Step 2, demultiplexing reads according to barcodes
# FIX: $working_dir was already reassigned to .../${prefix}_s1 during step 1,
# so the original 'working_dir_s1=$working_dir/$prefix\_s1' appended the
# suffix a second time and pointed at a directory step 1 never wrote to.
# Step 1's output lives directly in $working_dir; the s2 directory is
# created as its sibling.
working_dir_s1=$working_dir
working_dir_s2=$(dirname "$working_dir")/${prefix}_s2
mkdir -p "$working_dir_s2"
rm -f -r "${working_dir_s2:?}"/*
output=$working_dir_s2/"output.txt"
logfile=$working_dir_s2/"log.txt"

# Get the list of 454 <run> identifiers (same glob-based derivation as
# step 1; avoids parsing 'ls -l' output)
l_list=$(for f in "$data_dir"/s1/*.fasta; do
    f=${f##*/}
    printf '%s ' "${f%.fasta}"
done)
# Demultiplexing the data according to barcodes
for l in $l_list
do

    # initialize per-run results directory
    log "[$l] initializing the 454 workdir for step 2..."
    rm -f -r "$working_dir_s2/$l"
    mkdir "$working_dir_s2/$l"

    # barcode length = length of the 2nd column of the last mapping-file
    # row (awk measures the field directly; the original piped the pager
    # 'less' into 'wc -c' and subtracted the trailing newline with 'let')
    bc_length=$(tail -n1 "$data_dir/s1/${l}_mapping.txt" | awk '{print length($2)}')

    # quality filtering and demultiplexing (QIIME)
    log "[$l] demultiplexing..."
    split_libraries.py -f "$working_dir_s1/$l/filtered.fasta" \
                       -q "$working_dir_s1/$l/filtered.qual" \
                       -m "$data_dir/s1/${l}_mapping.txt" \
                       -s "$qual" \
                       -e "$bc_err" \
                       -b "$bc_length" \
                       -d \
                       -o "$working_dir_s2/$l/demux" \
                       &>> "$output"
    mv "$working_dir_s2/$l/demux"/* "$working_dir_s2/$l"
    rm -f -r "$working_dir_s2/$l/demux"

    # edit barcode label identifier for usearch compatibility
    # NOTE(review): the first sed expression 's/.*/;/g' replaces each whole
    # line with ';' as written; a leading space may have been lost in
    # transit (e.g. 's/ .*/;/g') -- verify against the original script.
    # The regex is kept byte-identical here.
    sed 's/.*/;/g;s/>.*/&&/g;s/;>/;barcodelabel=/g;s/_[0-9]*;$/;/g' \
        "$working_dir_s2/$l/seqs.fna" \
        >> "$working_dir_s2/seqs.fasta"
    log "[$l] finish demultiplexing this run"

done

log "Step2 454 data demultiplex DONE!"
# Step 3 OTU clustering
# NOTE(review): step 3 reads seqs.fasta from a fixed relative directory,
# not from $working_dir_s2 where step 2 appended it -- presumably the file
# is copied there between runs; confirm before changing this path.
working_dir_s3="../03.OTU_cluster/"
output=$working_dir_s3/"output.txt"
logfile=$working_dir_s3/"log.txt"

# dereplication: collapse identical reads, annotating each unique sequence
# with its abundance (-sizeout)
log "dereplicating..."
usearch -derep_fulllength "$working_dir_s3/seqs.fasta" \
        -fastaout "$working_dir_s3/seqs_unique.fasta" \
        -sizeout \
        &>> "$output"
# abundance sort and discard low-abundance clusters (size < $min_size,
# i.e. singletons with the usual config)
log "sorting by abundance and discarding singletons..."
usearch -sortbysize "$working_dir_s3/seqs_unique.fasta" \
        -fastaout "$working_dir_s3/seqs_unique_sorted.fasta" \
        -minsize "$min_size" \
        &>> "$output"

# OTU clustering with the UPARSE algorithm
log "OTU clustering using UPARSE..."
usearch -cluster_otus "$working_dir_s3/seqs_unique_sorted.fasta" \
        -otus "$working_dir_s3/otus.fasta" \
        &>> "$output"

# reference-based chimera detection against the gold database
log "removing chimeras..."
usearch -uchime_ref "$working_dir_s3/otus.fasta" \
        -db "$gold_db" \
        -strand plus \
        -nonchimeras "$working_dir_s3/otus_nc.fasta" \
        -threads "$n_cores" \
        &>> "$output"
# align sequences to database using PyNAST and remove remaining outliers
log "aligning OTU representative sequences to database..."
align_seqs.py -i "$working_dir_s3/otus_nc.fasta" \
              -t "$gg_core_aligned_db" \
              -p "$min_id_aln" \
              -o "$working_dir_s3" \
              &>> "$output"

# rename OTUs sequentially and remove alignment gaps.
# The in-place sed strips every '-' (the alignment gap character) from the
# file; headers are then replaced wholesale by awk, so hyphens removed
# from header lines do not matter.
log "renaming OTUs..."
sed -i 's/-//g' "$working_dir_s3/otus_nc_aligned.fasta" &>> "$output"
awk 'BEGIN {n=1}; />/ {print ">OTU_" n; n++} !/>/ {print}' \
    "$working_dir_s3/otus_nc_aligned.fasta" \
    >> "$working_dir_s3/rep_seqs.fasta"
# generate OTU table: map every read back to the representative sequences
# at the configured identity threshold
log "generating OTU table..."
usearch -usearch_global "$working_dir_s3/seqs.fasta" \
        -db "$working_dir_s3/rep_seqs.fasta" \
        -strand plus \
        -id "$id_threshold" \
        -uc "$working_dir_s3/read_mapping.uc" \
        &>> "$output"

# convert the .uc mapping into a plain-text OTU table
# (uc2otutab.py writes the table to stdout; diagnostics go to the log)
log "converting uc OTU table file into text format..."
python "$usearch_dir/uc2otutab.py" "$working_dir_s3/read_mapping.uc" \
    1> "$working_dir_s3/otu_table.txt" \
    2>> "$output"
# taxonomy assignment of the OTU representative sequences (QIIME)
log "taxonomy assignment..."
assign_taxonomy.py -i "$working_dir_s3/rep_seqs.fasta" \
                   -r "$silva_core_db" \
                   -t "$silva_taxonomy" \
                   -m "$tax_method" \
                   -o "$working_dir_s3/tax" \
                   &>> "$output"

# cleanup: keep only the flat taxonomy table and tab-separate the ranks
mv "$working_dir_s3/tax/rep_seqs_tax_assignments.txt" "$working_dir_s3/taxonomy.txt"
rm -f -r "$working_dir_s3/tax"
sed -i 's/; /\t/g' "$working_dir_s3/taxonomy.txt"

# convert OTU table to a QIIME-compatible (JSON) biom file
log "converting OTU table to QIIME compatible biom format..."
biom convert -i "$working_dir_s3/otu_table.txt" \
             -o "$working_dir_s3/otu_table.biom" \
             --table-type="OTU table" \
             --to-json \
             &>> "$output"

log "Step3 OTU cluster DONE!"