/
gust
executable file
·99 lines (81 loc) · 3.25 KB
/
gust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#! /usr/bin/env bash
if [[ -z "$1" ]]; then
mkdir -p genomes
cat <<EOF
gust an easy breezy snp-based whole genome phylogenetic pipeline
repository: https://github.com/pdimens/gust
numthreads are mandatory (>1)
configFile is optional and defaults to config.yml
[usage]: gust numThreads configFile
[example]: gust 25 config.yml
EOF
exit 1
fi
if [ "$1" = "1" ]; then
echo "Error: Use >1 threads"
echo "gust does not have a default value for threads, please set at least 2 threads"
echo "more threads = faster runtime with minimal memory increase"
exit 1
fi
if [[ -z "$2" ]]; then
if [ ! -f config.yml ]; then
echo "Error: No config.yml found"
echo "The file 'config.yml' is required to run gust but wasn't found. It"
echo "was created for you, please edit it to point to the reference and"
echo "and outgroup genomes, then run gust again. You can also modify"
echo "parameters before running."
cat <<EOF >config.yml
#
#=======================================================#
# gust config file #
# most of this file is comments, which you can delete #
#=======================================================#
fragment_size: 150
# size of the sliding window genome fragments
# values of 50-300+ should be ok (use your judgement)
# note: bwa isn't good at mapping really long sequences
reference_genome: "genome.fasta"
# name of reference genome in the genomes/ dir
outgroup: "outgroupgenome.fasta"
# name of the outgroup genome in the genomes/ dir
bwa_parameters: "-a -k 19"
# -a outputs all alignments
# -k 19 is a kmer size of 19
freebayes_parameters: "-C 1 --min-coverage 5 --standard-filter --ploidy 1"
# -C 2 means at least 2 samples need to have alt alleles for a site to be kept
# --min-coverage 5 means at least 5 aligments to score a site
# --standard-filter is to add more stringency
# --ploidy 1 because these are typically haploid assemblies
window_size: 20000
# window size in base pairs for LD pruning snp data
# bigger window = fewer snps
sites_per_window: 10
# number of sites to keep per window
mafft_parameters: "--auto --maxiterate 1000 --nuc"
# --auto chooses the appropriate model for the data size
# --maxiterate is the max number of iterations for tree building
# --nuc is to specify the input is nucleotides
raxml_model: "GTR+G"
# the phylogenetic model you would like to use, see for options:
# https://isu-molphyl.github.io/EEOB563/computer_labs/lab4/models.html
raxml_parameters: '--bs-trees 100 --tree pars\{25\},rand\{25\} -brlen scaled'
# --bs-trees 100 is 100 bootstrapped trees
# --tree pars{25},rand{25} is to use 25 parsimonious trees and 25 random trees
# the curly braces need to be escaped with \ for snakemake and note the single quotes
# -brlen scaled is the model for branch linkage
EOF
exit 1
fi
CONF=config.yml
else
CONF=$2
fi
if [ ! -d "genomes" ]; then
mkdir -p genomes
echo "Error: 'genomes' folder not found"
echo "The 'genomes' folder is required for gust and must contain the genome"
echo "assemblies you want to include in the analyses. One has been created"
echo "in the project directory, please add your data to it."
exit 1
fi
snakemake --core $1 --snakefile .other/gust.smk --configfile $CONF --directory .