gust

#! /usr/bin/env bash

if [[ -z "$1" ]]; then
mkdir -p genomes
cat <<EOF

gust an easy breezy snp-based whole genome phylogenetic pipeline
repository: https://github.com/pdimens/gust

numthreads are mandatory (>1)
configFile is optional and defaults to config.yml

[usage]:   gust numThreads configFile
[example]: gust 25 config.yml

EOF
  exit 1
fi

if [ "$1" = "1" ]; then
    echo "Error: Use >1 threads"
    echo "gust does not have a default value for threads, please set at least 2 threads"
    echo "more threads = faster runtime with minimal memory increase"
    exit 1
fi

if [[ -z "$2" ]]; then
    if [ ! -f config.yml ]; then
        echo "Error: No config.yml found"
        echo "The file 'config.yml' is required to run gust but wasn't found. It"
        echo "was created for you, please edit it to point to the reference and"
        echo "and outgroup genomes, then run gust again. You can also modify"
        echo "parameters before running."
        cat <<EOF >config.yml
#
#=======================================================#
#                     gust config file                  #
#  most of this file is comments, which you can delete  #
#=======================================================#

fragment_size: 150
# size of the sliding window genome fragments
# values of 50-300+ should be ok (use your judgement)
# note: bwa isn't good at mapping really long sequences

reference_genome: "genome.fasta"
# name of reference genome in the genomes/ dir

outgroup: "outgroupgenome.fasta"
# name of the outgroup genome in the genomes/ dir

bwa_parameters: "-a -k 19"
# -a outputs all alignments
# -k 19 is a kmer size of 19

freebayes_parameters: "-C 1 --min-coverage 5 --standard-filter --ploidy 1"
# -C 2 means at least 2 samples need to have alt alleles for a site to be kept
# --min-coverage 5 means at least 5 aligments to score a site
# --standard-filter is to add more stringency
# --ploidy 1 because these are typically haploid assemblies

window_size: 20000
# window size in base pairs for LD pruning snp data
# bigger window = fewer snps
sites_per_window: 10
# number of sites to keep per window

mafft_parameters: "--auto --maxiterate 1000 --nuc"
# --auto chooses the appropriate model for the data size
# --maxiterate is the max number of iterations for tree building
# --nuc is to specify the input is nucleotides

raxml_model: "GTR+G"
# the phylogenetic model you would like to use, see for options:
# https://isu-molphyl.github.io/EEOB563/computer_labs/lab4/models.html

raxml_parameters: '--bs-trees 100 --tree pars\{25\},rand\{25\} -brlen scaled'
# --bs-trees 100 is 100 bootstrapped trees
# --tree pars{25},rand{25} is to use 25 parsimonious trees and 25 random trees
# the curly braces need to be escaped with \ for snakemake and note the single quotes
# -brlen scaled is the model for branch linkage
EOF
        exit 1
    fi
    CONF=config.yml
else
    CONF=$2
fi

if [ ! -d "genomes" ]; then
    mkdir -p genomes
    echo "Error: 'genomes' folder not found"
    echo "The 'genomes' folder is required for gust and must contain the genome"
    echo "assemblies you want to include in the analyses. One has been created"
    echo "in the project directory, please add your data to it."
    exit 1
fi

snakemake --core $1 --snakefile .other/gust.smk --configfile $CONF --directory .