From 765897caddbbea6c6ad9f1e55a8f3f2a87b93523 Mon Sep 17 00:00:00 2001 From: Patrik Smeds Date: Wed, 3 May 2023 12:53:01 +0200 Subject: [PATCH] feat: gatk modelsegments wrapper (#1321) ### Description gatk ModelSegments wrapper ### QC * [ ] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. for bioconda, use (conda-forge, bioconda, nodefaults, as conda-forge should have highest priority and defaults channels are usually not needed because most packages are in conda-forge nowadays). --------- Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/gatk/modelsegments/environment.yaml | 7 ++ bio/gatk/modelsegments/meta.yaml | 19 +++++ bio/gatk/modelsegments/test/Snakefile | 17 ++++ bio/gatk/modelsegments/test/a.denoisedCR.tsv | 6 ++ bio/gatk/modelsegments/wrapper.py | 83 ++++++++++++++++++++ test.py | 8 ++ 6 files changed, 140 insertions(+) create mode 100644 bio/gatk/modelsegments/environment.yaml create mode 100644 bio/gatk/modelsegments/meta.yaml create mode 100644 bio/gatk/modelsegments/test/Snakefile create mode 100644 bio/gatk/modelsegments/test/a.denoisedCR.tsv create mode 100644 bio/gatk/modelsegments/wrapper.py diff --git a/bio/gatk/modelsegments/environment.yaml b/bio/gatk/modelsegments/environment.yaml new file mode 100644 index 00000000000..7abb25e264b --- /dev/null +++ b/bio/gatk/modelsegments/environment.yaml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - gatk4 =4.4.0.0 + - snakemake-wrapper-utils =0.5.3 diff --git a/bio/gatk/modelsegments/meta.yaml b/bio/gatk/modelsegments/meta.yaml new file mode 100644 index 00000000000..a1cad31a33f --- /dev/null +++ b/bio/gatk/modelsegments/meta.yaml @@ -0,0 +1,19 @@ +name: gatk ModelSegments +url: https://gatk.broadinstitute.org/hc/en-us/articles/13832747657883-ModelSegments +description: | + Models segmented copy ratios from denoised copy ratios and segmented minor-allele fractions from allelic counts +authors: + - Patrik Smeds +input: + - denoised_copy_ratios: denoised_copy_ratios file (optional) + - allelic_counts: allelic_counts file (optional) + - normal_allelic_counts: matched_normal allelic-counts (optional) + - segments: segments Picard interval-list file containing a multisample segmentation output by a previous run (optional) +output: + - list of files ending with either '.modelFinal.seq', '.cr.seg', '.af.igv.seg', '.cr.igv.seg', '.hets.tsv', '.modelBegin.cr.param', '.modelBegin.af.param', '.modelBegin.seg', '.modelFinal.af.param' or '.modelFinal.cr.param' +params: + - java_opts: additional arguments to be passed to the java compiler, e.g. "-XX:ParallelGCThreads=10" (not for `-XmX` or `-Djava.io.tmpdir`, since they are handled automatically). + - extra: additional program arguments. +note: | + * Expected input is either `denoised_copy_ratios` or `allelic_counts` + * `normal_allelic_counts` must be provided together with `allelic_counts` diff --git a/bio/gatk/modelsegments/test/Snakefile b/bio/gatk/modelsegments/test/Snakefile new file mode 100644 index 00000000000..4745b42a575 --- /dev/null +++ b/bio/gatk/modelsegments/test/Snakefile @@ -0,0 +1,17 @@ +rule modelsegments_denoise_input: + input: + denoised_copy_ratios="a.denoisedCR.tsv", + output: + "a.den.modelFinal.seg", + "a.n.cr.seg", + log: + "logs/gatk/modelsegments_denoise.log", + params: + #prefix="a.den.test", + extra="", # optional + java_opts="", # optional + resources: + mem_mb=1024, + wrapper: + "master/bio/gatk/modelsegments" + diff --git a/bio/gatk/modelsegments/test/a.denoisedCR.tsv b/bio/gatk/modelsegments/test/a.denoisedCR.tsv new file mode 100644 index 00000000000..d6e463eb941 --- /dev/null +++ b/bio/gatk/modelsegments/test/a.denoisedCR.tsv @@ -0,0 +1,6 @@ +@HD VN:1.6 +@SQ SN:ref LN:45 +@SQ SN:ref2 LN:40 +@RG ID:GATKCopyNumber SM:mysample +CONTIG START END LOG2_COPY_RATIO +ref2 1 30 0.000000 diff --git a/bio/gatk/modelsegments/wrapper.py b/bio/gatk/modelsegments/wrapper.py new file mode 100644 index 00000000000..1f156da79d1 --- /dev/null +++ b/bio/gatk/modelsegments/wrapper.py @@ -0,0 +1,83 @@ +__author__ = "Patrik Smeds" +__copyright__ = "Copyright 2023, Patrik Smed" +__email__ = "patrik.smeds@gmail.com" +__license__ = "MIT" + +import os +import tempfile +from snakemake.shell import shell +from snakemake_wrapper_utils.java import get_java_opts + + +denoised_copy_ratios = "" +if snakemake.input.get("denoised_copy_ratios", None): + denoised_copy_ratios = ( + f"--denoised-copy-ratios {snakemake.input.denoised_copy_ratios}" + ) + +allelic_counts = "" +if snakemake.input.get("allelic_counts", None): + allelic_counts = f"--allelic-counts {snakemake.input.allelic_counts}" + +normal_allelic_counts = "" +if snakemake.input.get("normal_allelic_counts", None): + matched_normal_allelic_counts = ( + f"--normal-allelic-counts {snakemake.inut.normal_allelic_counts}" + ) + +segments = "" +if snakemake.input.get("segments", None): + interval_list = f"--segments {snakemake.input.segments}" + +if not allelic_counts and not denoised_copy_ratios: + raise Exception( + "wrapper input requires either 'allelic_counts' or 'denoise_copy_ratios' to be set" + ) + +if normal_allelic_counts and not allelic_counts: + raise Exception( + "'allelica_counts' is required when 'normal-allelic-counts' is an input to the rule!" + ) + +extra = snakemake.params.get("extra", "") +java_opts = get_java_opts(snakemake) + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + + +with tempfile.TemporaryDirectory() as tmpdir: + output_folder = os.path.join(tmpdir, "output_folder") + shell( + "gatk --java-options '{java_opts}' ModelSegments" + " {segments}" + " {denoised_copy_ratios}" + " {allelic_counts}" + " {normal_allelic_counts}" + " --output-prefix temp_name__" + " -O {output_folder}" + " --tmp-dir {tmpdir}" + " {extra}" + " {log}" + ) + + created_files = {} + # Find all created files + for new_file in os.listdir(output_folder): + file_path = os.path.join(output_folder, new_file) + if os.path.isfile(file_path): + file_end = os.path.basename(file_path).split("__")[1] + created_files[file_end] = file_path + + # Match expected output with found files + for output in snakemake.output: + file_found = False + for file_ending in created_files: + if output.endswith(file_ending): + shell(f"cp {created_files[file_ending]} {output}") + file_found = True + break + if not file_found: + created_files_list = [f"{e}" for e in created_files] + raise IOError( + f"Could not create file {output}, possible files ends with {created_files_list}" + ) diff --git a/test.py b/test.py index f33af8487ef..a09bbfa3e89 100644 --- a/test.py +++ b/test.py @@ -4131,6 +4131,14 @@ def test_gatk_haplotypecaller_gvcf(): ) +@skip_if_not_modified +def test_gatk_modelsegments(): + run( + "bio/gatk/modelsegments", + ["snakemake", "--cores", "1", "a.den.modelFinal.seg", "--use-conda", "-F"], + ) + + @skip_if_not_modified def test_gatk_variantrecalibrator(): def check_log(log):