/
wgs_pipe.sh~
504 lines (428 loc) · 18.6 KB
/
wgs_pipe.sh~
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
#!/bin/sh
# exit on error
set -o errexit
# force variables to be set
#set -o nounset #CANNOT USE THIS YET BECAUSE FOR ALN THE INPUT CAN BE UNSET AND WILL BE SET LATER...
# force wildcards to throw error
shopt -s failglob
#TODO INCLUDE VARIABLES FOR VERSIONS....
#TODO CHECK PATHS AND TOOLS
#check PATHS
### Tool Paths ##
#define constant variables that wont be changed throuhout the script
#readonly java="/cluster/software/VERSIONS/java/jdk1.8.0_112/bin/java"
#readonly trimmomatic="/cluster/shared/bioinformatics/src/Trimmomatic-0.33/trimmomatic-0.33.jar"
#readonly picard="/cluster/shared/bioinformatics/src/picard-tools-1.129/picard.jar"
#readonly gatk="/cluster/shared/bioinformatics/src/GenomeAnalysisTK-3.4-46/GenomeAnalysisTK.jar"
### FILE PATHS ###
#readonly adapters="/cluster/shared/bioinformatics/src/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa"
#readonly reference="/cluster/shared/bioinformatics/reference-data/b37/GATK2.8/human_g1k_v37_decoy.fasta"
#readonly known_snps="/cluster/shared/bioinformatics/reference-data/b37/GATK2.8/dbsnp_138.b37.vcf"
#readonly known_indels="/cluster/shared/bioinformatics/reference-data/b37/GATK2.8/Mills_and_1000G_gold_standard.indels.b37.vcf"
#Help message printed if called with > wgs_pipe.sh -h
usage() {
echo "Preprocessing pipeline for Whole Genome Sequencing Data (PE)"
echo ""
echo "Usage: $0 -o <OUTPUTDIR> -i <input_1> <input_2> -c <#threads> -sID <sampleID> -t <\"tr\"/\"aln\"/\"brec\"/\"varcall\"/\"varcall_compare\" > "
echo ""
echo "!!!!!!!!!!!!!!!Please keep the order in the right way, as defined above !!!!!!!!!!!!!!!!!!!!!!"
echo ""
echo "Mandatory arguments:"
echo " -o, --output <path> Specifies output Path."
echo " -i, --input <input_1> <Input_2> "
echo " For trimming: specifiy forward and reverse reads in 2 input files."
echo " For alignment , no files need to be provided if trimming has been done before alignment and trimmed reads are stored in -o <OUTPUTDIR> (If trimmed reads for alignment are not provided, ResultDirs will be searched for trimmed reads with the file extension \"*_1P.fq.gz\" and \"*_2P.fq.gz\")"
echo " For base recalibration only one Input file needed (trimmed, aligned, sorted, indexed and deduped)"
echo " For variant calling only one Input file needed (recalibrated reads)"
echo " For variant calling comparing alleles, two Input files are required: Inputfile1 should be the bam file and Inputfile 2 should be a vcf file for comparing alleles."
echo "-t, --tool <\"tr\"/\"aln\"/\"brec\"/\"varcall\"/\"varcall_compare\" > Specifies tool."
echo " Possible arguments: -tr for trimming , -aln for alignment , -brec for base recalibration "
echo " -varcall for variant calling in discovery mode"
echo " -varcall_compare for variant calling only those alleles provided in extra inputfile"
echo " "
echo "OPTIONS"
echo " -sID , --sampleID Id info of sample will be incorperated to ReadGroup info druing alignment"
echo " "pseudo_id" will be used if no id provided."
echo " -c , --cpus Number of threads used for each command. Default is 4"
#echo " * If neither \"-tr\" option, nor \"-aln\" option -> both will be executed consecutively"
}
############# PARSE CONFIG FILE #################################
###############################################################
set_dependencies() {
echo " The following tools/software are required: "
echo " JAVA, trimmomatics, picard, gatk"
tools=($(echo 'cat //tool/*/@name' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2; }'))
versions=($(echo 'cat //tool/*/@version' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2; }'))
locations=($(echo 'cat //tool/*/@location' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2; }'))
echo "You have provided the following paths in your config file (config_wgs.xml)"
for ((i=0;i<${#tools[@]};++i)); do
software=${tools[i]}
location=${locations[i]}
version=${versions[i]}
path=${location}${version}
eval $software=\$path
echo " ${software}=${path} "
done
files=($(echo 'cat //file/*/@name' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2; }'))
locations=($(echo 'cat //file/*/@location' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2; }'))
echo " You have specified the following file paths in you config file: "
for ((i=0;i<${#files[@]};++i)); do
file_name=${files[i]}
location=${locations[i]}
eval $file_name=\$location
echo " ${file_name}=${location} "
done
}
#Test commandline parsing
parsing() {
if [ -z $1 ] || [ -z $2 ]; then
echo "One or both inputfile paths are undefined"
exit 1
elif [ ! -f "$1" ]; then
echo "Inputfile $1 doesn't exist"
exit 1
elif [ ! -f "$2" ]; then
echo "Inputfile $2 doesn*t exist"
exit 1
fi
echo "You entered the following input files:${1} and ${2}"
}
load_modules() {
### LOAD MODULES ###
module purge
arr=("R" "samtools" "bwa")
R_version="/$(echo 'cat //R/*/@version' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2 }')"
samtools_version="/$(echo 'cat //samtools/*/@version' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2 }')"
bwa_version="/$(echo 'cat //bwa/*/@version' | xmllint --shell config_wgs.xml | awk -F\" 'NR % 2 == 0 {print $2 }')"
arr_v=("$R_version" "$samtools_version" "$bwa_version")
for (( i=0; i < ${#arr_v[@]}; i++)); do
if [ -z ${arr_v[$i]} ]; then
echo "you did not specify a version for one of the modules in your config_wgs.xml!--> loading default"
fi
done
for (( i=0; i < ${#arr[@]}; i++)); do
echo "... loading module ${arr[$i]} ... "
echo "module load ${arr[$i]}${arr_v[$i]}"
if [ $? -ne 0 ]; then
echo "loading module ${arr[$i]} failed"
exit 1
fi
echo "from this path:"
#get path from "module show <tool>" output by redirecting output to stdout (before stderr)
module show ${arr[$i]} 2>&1 | grep -m1 PATH | awk -F '\t' '{print $2}'
samtools -h
done
}
set_vars () {
#DEFAULT VARIABLES
sample_name="pseudo_samplename"
lib_id="pseudo_libname"
platform="ILLUMINA"
#check if sample id was provided by user and has been parsed
# if not, use "pseudo_id"
if [ -z sID ]; then
echo "You did not provide a sample ID"
sID="pseudo_id" #could be set to different value by user with commandline arg -sID
readgroups="@RG\tID:"$sID"\tSM:"$sample_name"\tLB:"$lib_id"\tPL:"$platform"\tPU:"$sID""
else
readgroups="@RG\tID:"$sID"\tSM:"$sample_name"\tLB:"$lib_id"\tPL:"$platform"\tPU:"$sID""
fi
#If # of cpus was not specified by user, set it to 4 by default
if [ -z cpus ]; then
cpus="4" #could be set to different value by user with commandline arg -c
fi
}
#function to check if variables have been set to prevent usage of unset variables in functions later on
check_vars () {
varlist="$1"
command_name="$2"
for i in "${varlist[@]}"
do
if [ -z "$i" ]; then
echo "one of the variables: "$i" used in the "$command_name" command are not set correctly. Please check the script (function \" "$command_name" () \" ) "
exit 1
fi
done
}
#################################################
### TRIMMING ###
#################################################
trimming() {
echo "... Trimming reads"
#Set input variables which have been parsed from commandline and provided to function during function call
fastqInput_1=$1
fastqInput_2=$2
#CHECK IF ALL VARIABLES HAVE BEEN SET CORRECTLY
declare -a vars=("$java" "$trimmomatic" "$resultDir" "$fastqInput_1" "$fastqInput_2" "$adapters" "$resultDir")
check_vars $vars "trimming" #function will abort script if one of those variables are unset
#start trimming
if $java -jar $trimmomatic PE -phred33 -baseout "$resultDir"/"$sID".fq.gz "$fastqInput_1" "$fastqInput_2" ILLUMINACLIP:$adapters:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36; then
echo "trimming was successfull"
else
echo "Non-zero exit status. Trimming failed"
exit 1
fi
}
##################################################
###ALIGNMENT###
##################################################
alignment() {
#Set input variables which have been parsed from commandline or were found in the specified output directory
#TODO cannot yet handle case when multiple files found in dir
alnInput_1=$1
alnInput_2=$2
#Set output variable
sorted_bam=""$resultDir"/sorted_"$sID".bam"
#CHECK IF ALL VARIABLES HAVE BEEN SET CORRECTLY
declare -a vars=("$cpus" "$reference" "$alnInput_1" "$alnInput_2" "$sorted_bam" "$resultDir" "$picard" "$java" "$readgroups")
check_vars $vars "alignment" #function will abort script if one of those variables are unset
#start alignment process
echo "... aligning reads (PE) with BWA mem, converting to bam and sorting with samtools ...."
echo "The sampleID you entered will be incorperated into the readgroup info like so:"
echo "$readgroups"
#set pipefail so that even if last command in pipe exited with 0, pipe with exit with 1 if error occured inbetween
#set -euxo pipefail DOES NOT WORK HOW IT SHOULD:::THROWS ERROR OF UNSET VAR BEFORE VAR GETS SET IN SEQUENCE OF PIPE?!??!?!?!??!?
#TODO test if pipefail setting works correctly?
load_modules
#check if referece file exits
if [ -f "$reference" ]; then # "-R" in the bwa mem command for adding readgroups!
bwa mem -t "$cpus" -R "$readgroups" "$reference" "$alnInput_1" "$alnInput_2" | samtools view -b -@ "$cpus" - | samtools sort -O BAM -o "$sorted_bam" -@ "$cpus" -
if [ "$?" -ne 0 ]; then echo "bwa command failed"; exit ; fi
else
echo "reference not found"
exit 1
fi
#output file written to same location as "$sorted_bam"
echo "... indexing aligned bam file..."
if [ -f "$sorted_bam" ]; then
samtools index $sorted_bam
if [ "$?" -ne 0 ]; then echo "Failure during indexing of the sorted bam file"; exit 1; fi
else echo "no sorted bam file found to be indexed."
fi
### REMOVING DUPLICATES ###
echo "... removing duplicates with Picard..."
if [ -f "$sorted_bam" ]; then
$java -jar $picard MarkDuplicates I="$sorted_bam" O=""$resultDir"/dedup_"$sID.bam"" CREATE_INDEX=true M=""$resultDir"/output.metrics"
if [ "$?" -ne 0 ]; then echo "Removing/Marking Duplicates failed"; exit 1; fi
else "no bam file found to remove/mark duplicates"
fi
}
##########################################################
### BASE RECALIBRATION ###
##########################################################
base_recal () {
#Base recalibration with gatk
#Set Variables
deduped_bam="$1" #provided input file specified by user and submitted through funciton call during commandline parsing
br_out_1=""$resultDir"/recal_data_"$sID".table"
br_out_2=""$resultDir"/post_recal_data.table" #NOT USED BECAUSE GGPLOT NOT WORKING ON TSD
plot=""$resultDir"/recal_plot.pdf" #NOT USED BECAUSE GGPLOT NOT WORKING ON TSD YET
recal_out=""$resultDir"/recal_reads_"$sID.bam""
#CHECK IF ALL VARIABLES HAVE BEEN SET CORRECTLY
declare -a vars=("$java" "$known_snps" "$known_indels" "$reference" "$deduped_bam" "$br_out_1" "$br_out_2" "$gatk", "$plot", "$resultDir")
check_vars $vars "base_recal" #function will abort script if one of those variables are unset
#start base recalibration process
echo " Step one of base recalibration. Calculating covariation data. Saving $br_out_1"
if [ -f $known_snps ] && [ -f $known_indels ] && [ -f $reference ]; then #make sure files exist and the path provided is correct
$java -jar $gatk -T BaseRecalibrator -R "$reference" -I "$deduped_bam" -knownSites "$known_snps" -knownSites "$known_indels" -o "$br_out_1"
if [ "$?" -ne 0 ]; then
echo "first step of base recalibration (covariant calculations) failed"
exit 1
else #NOT EXECUTED BECAUSE GGPLOT LIBRARY FOR R IS NOT AVAILABLE ON TSD
echo " Second step and third of Base Recalibration: calc covariant after calibration for plotting (to compare later on) "
#$java -jar $gatk -T BaseRecalibrator -R $reference -I $sortdidxd_bam -knownSites $known_snps -knownSites $known_indels -BQSR $br_out_1 -o $br_out_2
# $java -jar $gatk -T AnalyzeCovariates -R $reference -l DEBUG -before $br_out_1 -after $br_out_2 -plots $plot
if [ "$?" -ne 0 ]; then
echo "Second step of Base recalibration failed"
fi
fi
echo " Last step of Base Recalibration: applying calculations to input file"
$java -jar $gatk -T PrintReads -R $reference -I $deduped_bam -BQSR $br_out_1 -o $recal_out
if [ "$?" -ne 0 ]; then
echo " last step of base recalibration failed"
exit 1
fi
echo "Base Realibration completed sucessfully"
exit
else
echo "fetching $known_snps and $known_indels failed. Can't find files."
exit 1
fi
}
############################################################
### VARIANT CALLING ####
############################################################
#TODO
varcall_discovery() {
#Calling varicants with gatk haplotype caller and default parameters
processed_bam="$1" #bam file provided by user
varcall_out=""$resultDir"/"$sID".vcf"
#check that no variables that will be used are unset
declare -a vars=("$java" "$gatk" "$processed_bam" "$reference" "$varcall_out" "$resultDir")
check_vars $vars "varcall" #function will abort script if one of those variables are unset
echo "Calling varinats with gatk's Haplotype Caller"
#Make sure reference file and input file exist
if [ -f "$reference" ] && [ -f "$processed_bam" ]; then
if "$java" -jar "$gatk" -T HaplotypeCaller -R "$reference" -I "$processed_bam" --genotyping_mode DISCOVERY -stand_emit_conf 10 -stand_call_conf 30 -o "$varcall_out"; then
echo "successfully called variants. Output can be found in "$varcall_out""
else
echo "variant calling failed"
exit 1
fi
else
echo "the reference file: "$reference" or the input file: "$processed_bam" doesnt exit or couldnt be found"
exit 1
fi
}
varcall_given_alleles() {
#variant calling comparing to given vcf
processed_bam="$1" #bam file for which variants should be called
alleles="$2" # only variants that were provided by this file are being called
varcall_out=""$ResultDir"/"$sampleID".vcf"
#make sure nor variables are unset
declare -a vars=("$java" "$gatk" "$processed_bam" "$reference" "$varcall_out" "$alleles")
check_vars $vars "varcall" #function will abort script if one of those variables are unset
echo "Calling varinats with gatk's Haplotype Caller (only looking for variants provided in vcf file (genotyping_mode=GENOTYPE_GIVEN_ALLELE)"
if [ -f "$reference" ] && [ -f "$processed_bam" ] && [ -f "$alleles" ]; then
if [ "$java" -jar "$gatk" -T HaplotypeCaller -R "$reference" -I "$processed_bam" -alleles "$alleles" --GENOTYPE_GIVEN_ALLELES -stand_emit_conf 10 -stand_ecall_conf 30 -o "$varcall_out" ]; then
echo "successfully called variants. Output can be found in "$varcall_out""
else
echo "variant calling failed"
exit 1
fi
else
echo "the reference file: "$reference", input file: "$processed_bam" or the file with given alleles "$alleles" doesnt exit or couldnt be found"
exit 1
fi
}
###########################################################
### COMMANDLINE PARSING ###
###########################################################
#looping through commandline args that are given by user
POSITIONAL=()
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-h|--help)
usage
exit
;;
-o|--output)
resultDir="$2"
if [ ! -d "$resultDir" ]; then
echo "Specified output directory doen*t exist \n "
exit 1
else echo "output dir: "$resultDir""
fi
shift #shift arg
shift #shift value
;;
-i|--input)
Input_1="$2"
echo "Your first Input file is: "$Input_1" "
# somehow solution for case of only providing one inputfile ( e.g. for base_recal ())
# in this case the next argument would starting with "-"
if [[ $3 == -* ]] || [[ $3 == --* ]] ; then
shift # shift argument
shift # shift value
else # if it doesnt start with "-" or "--", let*s assume it is another inputfile.
Input_2="$3"
echo " Your second Input file is: "$Input_2" "
shift #shift arg
shift #shift first value
shift #shift second value
fi
;;
-sID|--sampleID)
sID="$2" #this is optional and if not provided by user, will be set with "set_vars()" later on
shift
shift
;;
--c|--cpus)
cpus="$2" #this is optional and if not provided by user, will be set with "set_vars()" later on
shift
shift
;;
-t|--tool)
tool="$2"
set_vars # check if optional parameters have been provided, otherwise setting variables to default values
set_dependencies #parse config_wgs.xml and set all variables for dependencies
load_modules #loading tools with "module load <tool>"
#TODO function checking if all tools are working and no problems occur with file paths or verisons
if [ $tool == "tr" ]; then
if parsing $Input_1 $Input_2; then #use parsing() to make sure vars are set and files exist
if [ -z $resultDir ]; then
echo "you did not specify an output dir, a Result dir will be created where script was executed, trimmed reads will be saved in \"./Results\" "
mkdir -p "$(pwd)/Results"
resultDir="$(pwd)/Results"
fi
trimming $Input_1 $Input_2
else
echo "No files for trimming prodvided with -i (see -help)"
exit 1
fi
exit
elif [ $tool == "aln" ]; then
if [ -z $Input_1 ] || [ -z $Input_2 ]; then
printf "You did not specify Input files for alginment. \n Checking if trimmed reads are available in Result directory from previous trimming run:"
Input_1=($resultDir/*_1P.fq*)
Input_2=($resultDir/*_2P.fq*)
if parsing "$Input_1" "$Input_2"; then
echo "found trimmed reads in outpur dir \"Results\" "
else
echo "You did not specify any reads to align and no trimmed reads were found in Result dir, from previous trimming run"
exit 1
fi
else
if ! parsing $Input_1 $Input_2; then
echo "Provided Input files for alignment can*t be found or don*t exist!"
exit 1
fi
fi
alignment $Input_1 $Input_2
exit
elif [ "$2" = "brec" ]; then
if [ -z $Input_1 ] || [ ! -f $Input_1 ]; then
echo "No input file provided or cant be found! \n this is the path you provided: "$Input_1""
exit 1
else
base_recal $Input_1
exit
fi
elif [ "$2" = "varcall" ]; then
if [ -z $Input_1 ] || [ ! -f $Input_1 ]; then
echo "No input file provided or cant be found! \n this is the path you provided: "$Input_1""
exit 1
else
varcall_discovery $Input_1
exit
fi
elif [ "$2" = "varcall_compare" ]; then
if parsing "$Input_1" "$Input_2"; then
varcall_given_alleles "$Input_1" "$Input_2"
exit
else
echo "parsing input files failed"
exit 1
fi
else echo "Tool: "$tool" unknown. check your command \n " exit 1
fi
shift
shift
;;
--testing)
set_dependencies
load_modules
exit
shift
;;
*) #unknown option
echo "unknown option: "$1"\nRun ${0##*/} -h for help.">&2
exit 1
# POSITIONAL+=("$1") # save in array
# shift #past argument
# ;;
esac
done
set -- "${POSITIONAL[@]}" #restore positional parameters