From 097049f8562d76b2ebc532d5ccfc3fc0b93943b6 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 5 Apr 2024 13:32:54 +0200 Subject: [PATCH 01/26] Minimal support emu abundance --- bio/emu/abundance/environment.yaml | 6 ++ bio/emu/abundance/meta.yaml | 14 ++++ bio/emu/abundance/test/Snakefile | 29 ++++++++ .../test/database/species_taxid.fasta | 15 ++++ bio/emu/abundance/test/database/taxonomy.tsv | 2 + bio/emu/abundance/test/sample.fa | 70 +++++++++++++++++++ bio/emu/abundance/test/short_read_R1.fq | 16 +++++ bio/emu/abundance/test/short_read_R2.fq | 16 +++++ bio/emu/abundance/wrapper.py | 44 ++++++++++++ test.py | 30 ++++++++ 10 files changed, 242 insertions(+) create mode 100644 bio/emu/abundance/environment.yaml create mode 100644 bio/emu/abundance/meta.yaml create mode 100644 bio/emu/abundance/test/Snakefile create mode 100644 bio/emu/abundance/test/database/species_taxid.fasta create mode 100644 bio/emu/abundance/test/database/taxonomy.tsv create mode 100644 bio/emu/abundance/test/sample.fa create mode 100644 bio/emu/abundance/test/short_read_R1.fq create mode 100644 bio/emu/abundance/test/short_read_R2.fq create mode 100644 bio/emu/abundance/wrapper.py diff --git a/bio/emu/abundance/environment.yaml b/bio/emu/abundance/environment.yaml new file mode 100644 index 0000000000..a117676039 --- /dev/null +++ b/bio/emu/abundance/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - emu=3.4.5 \ No newline at end of file diff --git a/bio/emu/abundance/meta.yaml b/bio/emu/abundance/meta.yaml new file mode 100644 index 0000000000..5ddec3ad0c --- /dev/null +++ b/bio/emu/abundance/meta.yaml @@ -0,0 +1,14 @@ +name: emu abundance +description: Generate relative abundance estimates from ONT, Pac-Bio or short 16S reads using emu. +url: https://github.com/treangenlab/emu +authors: + - Curro Campuzano +input: + - Nucleotide sequence file(s) (either a single ONT or Pac-Bio fasta file, a single fastq file or paired fastq files) + - Optional. A emu database (i.e. a directory that contains at least the files "taxonomy.tsv" and "species_taxid.fasta", check documentation for pre-built databases and how to build them). +output: + - A TSV with relative (and optionally, absolute abundances). + - An optional SAM file with the alignments. + - An optional FASTA file with unclassified sequences. +params: + extra: Any optimal parameter such as --type (sequencer) or --min-abundance. Optional flags involving output are handled automatically (e.g. --output-dir, --output-basename ...) diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile new file mode 100644 index 0000000000..f35eb8b00f --- /dev/null +++ b/bio/emu/abundance/test/Snakefile @@ -0,0 +1,29 @@ +rule abundance: + input: + reads = "{sample}.fa", + database_dir = "database" + output: + abundances = "{sample}_rel-abundance.tsv", + alignments = "{sample}_emu_alignments.sam", + unclassified = "{sample}_unclassified.fa" + log: + "logs/emu/{sample}_abundance.log" + params: + extra="--type map-ont --keep-counts" + threads: 3 # optional, defaults to 1 + wrapper: + "master/bio/emu/abundance" + +rule abundance_paired: + input: + reads =[ "{sample}_R1.fq", "{sample}_R2.fq" ], + database_dir = "database" + output: + abundances = "{sample}_rel-abundance_paired.tsv", + log: + "logs/emu/{sample}_abundance_paired.log" + params: + extra="--type sr --keep-counts" + threads: 3 # optional, defaults to 1 + wrapper: + "master/bio/emu/abundance" diff --git a/bio/emu/abundance/test/database/species_taxid.fasta b/bio/emu/abundance/test/database/species_taxid.fasta new file mode 100644 index 0000000000..4e0e691a19 --- /dev/null +++ b/bio/emu/abundance/test/database/species_taxid.fasta @@ -0,0 +1,15 @@ +>1:emu-silva:1 ['dada2-silva_1 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;amygdali;'] +AACTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAG +TCGAGCGGCAGCACGGGTACTTGTACCTGGTGGCGAGCGGCGGACGGGTGAGTAATGCCT +AGGAATCTGCCTGGTAGTGGGGGATAACGCTCGGAAACGGACGCTAATACCGCATACGTC +CTACGGGAGAAAGCAGGGGACCTTCGGGCCTTGCGCTATCAGATGAGCCTAGGTCGGATT +AGCTAGTTGGTGAGGTAATGGCTCACCAAGGCGACGATCCGTAACTGGTCTGAGAGGATG +ATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAAT +ATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGTCTTCGGA +TTGTAAAGCACTTTAAGTTGGGAGGAAGGGCAGTTACCTAATACGTATCTGTTTTGACGT +TACCGACAGAATAAGCACCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACAGAGGGTGC +GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAA +GCGCACNNNNGCGGTCTGTCAAGTCGGANGNNAAATCCCCGGGCTNNNNNNNGGAACTGC +ATTCGAAACTGNCAGGCTTGAGTCTTGTAGAGGGNNNTNGNATTCNNNGTGTAGCGNNNN +NNTGCGTAGAGATCTGGANGAACACCAGTGGCGAAGGCGGCTCTCTNGTCTGTAACTGAC +GCTGAGGCTCGAAAGCNTGGGGAGCAAACAGGATTAGATANCCTGGTAGTCCACG \ No newline at end of file diff --git a/bio/emu/abundance/test/database/taxonomy.tsv b/bio/emu/abundance/test/database/taxonomy.tsv new file mode 100644 index 0000000000..c6a3209d65 --- /dev/null +++ b/bio/emu/abundance/test/database/taxonomy.tsv @@ -0,0 +1,2 @@ +tax_id superkingdom phylum class order family genus species +1 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; \ No newline at end of file diff --git a/bio/emu/abundance/test/sample.fa b/bio/emu/abundance/test/sample.fa new file mode 100644 index 0000000000..985df2df9c --- /dev/null +++ b/bio/emu/abundance/test/sample.fa @@ -0,0 +1,70 @@ +>Sphingobacterium_puteal_r1 +ACGGGTGCGTAACGCGTGAGCAACCTACCTCTATCAGGGGGATAGCCTCTCGAAAGAGAGATTAACACCGCATAACA +TCAACAGTTCGCATGTTCGGTTGATTAAATATTTATAGGATAGAGATGGGCTCGCGTGACATTAGCTAGTTGGTAGGGTA +ACGGCCTACCAAGGCGACGATGTCTAGGGGCTCTGAGAGGAGAATCCCCCACACTGGTACTGAGACACGGACCAGACTCC +TACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGCAGGAAGACTGCCCTA +TGGGTTGTAAACTGCTTTTGTCCAGGAATAAACCTCTTTACGTGTAGAGAGCTGAATGTACTGGAAGAATAAGGATCGGC +TAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCG +GCCTGTTAAGTCAGGGGTGAAATACGGTGGCTCAACCATCGCAGTGCCTTTGATACTGACGGGCTTGAATCCATTTGAAG +TGGGCGGAATAAGACAAGTAGCGGTGAAATGCATAGATATGTCTTAGAACTCCGATTGCGAAGGCAGCTCACTAAGCTGG +TATTGACGCTGATGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGATAACT +CGATGTTGGCGATAGACCGCCAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGGAGTACGCCCGCAAGGGTGAAAC +TCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAGCATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGC +TTGAAAGTTAGTGAAGGATGCGGAGACGCATCCGTCCTTCGGGACACGAAACTAGGTGCTGCATGGCTGTCGTCAGCTCG +TGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATGTTTAGTTGCCAGCAAGTAATGTTGGGGACTCTA +AACAGACTGCCTGCGCAAGCAGAGAGGAAGGTGGGGACGACGTCAAGTCATCATGGCCCTTACGTCCGGGGCTACACACG +TGCTACAATGGATGGTACAGCGGGCAGCTACATAGCAATATGGTGCTAATCTCTAAAAGCCATTCACAGTTCGGATTGGG +GTCTGCAACTCGACCCCATGAAGTTGGATTCGCTAGTAATCGCGTATCAGC +>Sphingobacterium_puteal_r2 +GGCCTAATACATGCAAGTCGGACGGGATTTAAGTTAAAGCTTGCTTTAAGTTAATGAGAGTGG +CGCACGGGTGCGTAACGCGTGAGCAACCTACCTCTATCAGGGGGATAGCCTCTCGAAAGAGAGATTAACACCGCATAACA +TCAACAGTTCGCATGTTCGGTTGATTAAATATTTATAGGATAGAGATGGGCTCGCGTGACATTAGCTAGTTGGTAGGGTA +ACGGCCTACCAAGGCGACGATGTCTAGGGGCTCTGAGAGGAGAATCCCCCACACTGGTACTGAGACACGGACCAGACTCC +TACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGCAGGAAGACTGCCCTA +TGGGTTGTAAACTGCTTTTGTCCAGGAATAAACCTCTTTACGTGTAGAGAGCTGAATGTACTGGAAGAATAAGGATCGGC +TAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCG +GCCTGTTAAGTCAGGGGTGAAATACGGTGGCTCAACCATCGCAGTGCCTTTGATACTGACGGGCTTGAATCCATTTGAAG +TGGGCGGAATAAGACAAGTAGCGGTGAAATGCATAGATATGTCTTAGAACTCCGATTGCGAAGGCAGCTCACTAAGCTGG +TATTGACGCTGATGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGATAACT +CGATGTTGGCGATAGACCGCCAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGGAGTACGCCCGCAAGGGTGAAAC +TCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAGCATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGC +TTGAAAGTTAGTGAAGGATGCGGAGACGCATCCGTCCTTCGGGACACGAAACTAGGTGCTGCATGGCTGTCGTCAGCTCG +TGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATGTTTAGTTGCCAGCAAGTAATGTTGGGGACTCTA +AACAGACTGCCTGCGCAAGCAGAGAGGAAGGTGGGGACGACGTCAAGTCATCATGGCCCTTACGTCCGGGGCTACACACG +TGCTACAATGGATGGTACAGCGGGCAGCTACATAGCAATATGGTGCTAATCTCTAAAAGCCATTCACAGTTCGGATTGGG +GTCTGCAACTCGACCCCATGAAGTTGGATTCGCTAGTAATCGCGTATCAGCAATGACGCGGTGAATACGTTCCCGGGCCT +TGTACACA +>Mycobacterium_saskatchewanense_r1 +AGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGTCTCTTCGGAGATAC +TCGAGTGGCGAACGGGTGAGTAACACGTGGGCAATCTGCCCTGCACTTCGGGATAAGCCTGGGAAACTGGGTCTAATACC +GGATAGGACCTTTAGGCGCATGCCTTTTGGTGGAAAGCTTTTGCGGTGTGGGATGGGCCCGCGGCCTATCAGCTTGTTGG +TGGGGTGATGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGTCCGGCCACACTGGGACTGAGATACGGCCC +AGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGGGGATGAC +GGCCTTCGGGTTGTAAACCTCTTTCAGCAGGGACGAAGCGCAAGTGACGGTACCTGCAGAAGAAGCACCGGCCAACTACG +TGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCG +CGTTGTTCGTGAAATCTCACGGCTTAACTGTGAGCGTGCGGGCGATACGGGCAGACTAGAGTACTGCAGGGGAGACTGGA +ATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACG +CTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGTGGGTACTAGGTGTGG +GTTTCCTTCCTTGGGATCCGTGCCGTAGCTAACGCATTAAGTACCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCA +AAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGTGGATTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTG +ACATGCACAGGACGCCGGCAGAGATGTCGGTTCCCTTGTGGCCTGTGTGCAGGTGGTGCATGGCTGTCGTCAGCTCGTGT +CGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCTCATGTTGCCAGCGGGTAATGCCGGGGACTCGTGAG +AGACTGCCGGGGTCAACTCGGAGGAAGGTGGGGATGACGTCAAGTCATCATG +>Streptococcus_sobrinus_r1 +AGTGTTACTAATGAGTCGCGAACGGGTGAGTAACGCGTAGGTAACCTGCCTGATAGCGGGGGATAACTATTGGAAACGAT +AGCTAATACCGCATAAGAGGAGTTAACTCATGTTAACTGTTTAAAAGAAGCCATTGCTTCACTATCAGAGGACCTGCGT +TGTATTAGCTAGTAGGTAGGGTAACGGCCTACCTAGGCAACGATACATAGCCGACCTGAGAGGGTGAACGGCCACACTGG +GACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTAGGGAATCTTCGGCAATGGACGCAAGTCTGACCGAGCAACG +CCGCGTGAGTGAAGACGGTTTTCGGATCGTAAAGCTCTGTTGTAGGGGAAGAACGTGTGTAAGAGTGGAAAGCTTACACA +GTGACGGTACCCTACCAGAAAGGGACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGG +ATTTATTGGGCGTAAAGGGAGCGCAGGCGGTTTAGTAAGTCTGAAGTTAAAGGCATTGGCTCAACCAATGTATGCTTTGG +AAACTGTTAGACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACAC +CGGTGGCGAAAGCGGCTCTCTGGTCTGTCACTGACGCTGAGGCTCGAAAGCGTGGGTAGCGAACAGGATTAGATACCCTG +GTAGTCCACGCCGTAAACGCTGAGTGCTAGGTGTTAGGTCCTTTCCGGGACTTAGTGCCGACGCTAACGCATTAAGCACT +CCGCCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTA +ATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCCGATGCCCGCTCTAGAGATAGAGTTTTTCTTCGGAACAT +CGGAGACAGGTGGTGCATGGTTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTAT +TGTTAGTTGCCATCATTAAGTTGGGCACTCTAGCGAGACTGCCGGTAATAAACCGGAGGAAGGTGGGGATGACGTCAAAT +CATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGTTGGTACAACGAGTCGCAAGCCGGTGACGGCAAGCTA +ATCTCTGAAAGCCAATCTCAGTTCGGATTGTAGGCTGCAACTCGCCTACATGAAGTCGGAATCGCTAGTAATCGCGGATC +AGCACGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCAAAGTCGGT \ No newline at end of file diff --git a/bio/emu/abundance/test/short_read_R1.fq b/bio/emu/abundance/test/short_read_R1.fq new file mode 100644 index 0000000000..06c5874275 --- /dev/null +++ b/bio/emu/abundance/test/short_read_R1.fq @@ -0,0 +1,16 @@ +@SRR10391187.1 1 length=293 +GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGNTNNNNNNCGGTTCCTTAAGNNTGANNNNNNANCCCCCGGCTNNNNNNNGGAGNGTCNNNGGAANCTNNGGAACTTGAGTGCAGAAGAGGANNNNNNNNTNCNNNGTGTAGCNNNNNNNTGCGTAGAGATGTGNNNNNNCACCAGTGNNNANNNNGACTCTNNNNNNNGTAANTGNNNNTGNGNANCNAANNNNNNGGGAGCGNNNNNNNTTAGATANNNNNNNAGTACA ++SRR10391187.1 1 length=293 +CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG#:######::CDFEGGGGGG##:CC######:#::?FGGGGF#######::BF#:BC###::BF#:B##::+BFFFGGFGGGGGFFGGGGF########6#6###86>FFGD#######66=CCEGCG?CGFF######*43BFGGC###3####*3/1;+#######*2;C#22####11#1#*#0#22######1131FE@#######(.04:A<#######(--(06 +@SRR10391187.2 2 length=293 +GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGNGNNNNNNCGGTTTTTTAAGTNTGANNNNANAGCCCACGGCTNNNNNNNGGAGNGTCNNNGGAANCTNNAAAACTTGAGTGCAGAAGAGGANNNNNNNNTNCNNNGTGTAGCNNNNNNNTGCGCAGAGATATGGNNNNACACCAGTGNNGANNNNGACTTTNNNNNNNGTAANTGNNNNTGNTNTNCNAANNNNNNGGGATCANNNNGNNTTAGATANNNNNNNAGTCCA ++SRR10391187.2 2 length=293 +CCCCCGGGGGGGGGGGGGGGGGGFGGGFGGGGGGGGGGGGGGGGGGGGGGGDGFGGGGGGGGG#:######::DFDGGGGGGGG#:CD####:#:BFDGGGGGGG#######::DF#:BB###:8>D#::##8:=FEGGGGGGGGGGGGGACFG########6#6###86@FFGC#######*6>FGGGGGGGFGGG####31=CFGGGF##33####13=C>F#######/2;C#**####*1#*#/#.#22######1186*8>####0##(.08?F<#######(-4:FF +@SRR10391187.3 3 length=295 +GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGNANNNNNGCGGTCTGTCAAGTNGGANNNNAAATCCCCGGGCTNNNNNNNGGAACTGCNTTCGAAACTNNCAGGCTTGAGTCTTGTAGAGGGNNNGNGNATNCNNNGTGTAGCNNNNNNNAGCGTAGAGATCGGGANNNATACCGGTGGNGANNGCGGCCCCNNNNANNAAGAATGANGCTCAGNTGCGAANNNNNNGGGAGCANNNNGGATTAGATANNNNNNTAGCCCACG ++SRR10391187.3 3 length=295 +CCCCCGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGG#:#####::CDEGGGGGGGDG#:9C####::BDFGGGGGGGF#######:8BFGGFG#:BCFFGGG9##::>CGGGGGGGFGGGGGGGGGG###:#6#65#5###861>FF,#######**6@BCB>@CE9**43###341CF=DGC@#13##22**;CGG####*##22;**0+0#*19CF4#*/.:/C######/138C>@####00-4;4<:9:######)((-2FGGGGGFGGCGGGGGFFGG###:#6#6@FF###86DFGCFG######*6=CCCCEGGD,,2?+#/*3BEF>FGGGG58EGGGFGGGGGGG?#2:7F793556384,)-+#217CFFA<8A*6>< \ No newline at end of file diff --git a/bio/emu/abundance/test/short_read_R2.fq b/bio/emu/abundance/test/short_read_R2.fq new file mode 100644 index 0000000000..ffa6936601 --- /dev/null +++ b/bio/emu/abundance/test/short_read_R2.fq @@ -0,0 +1,16 @@ +@SRR10391187.1 1 length=301 +NANCNCGAGCTGACGANAANNNNNNNCCACCTGTCACTCTGCCCCCGANNNNGACGTNCTNTCTCTAGGATTGTNAGNGGANGTCAAGANNNNNNNAGGTTCTNNNNNNNGCTTCGAATTAAACCACANNNTNCACCGCTTGTGCGNGNCCCCGTCAATNNCTNNNAGTTTCAGTCNTGNNACCGTACTCCCCAGNCGGAGTGCTTAATGCGTTAGCTGCAGCACTAAGGGGCGGAANNNCCCTAACACTNAGCNNNCNTNNTTTNNGGCNGGGAGNANCCNNNGATCTAATCCTGTTTNN ++SRR10391187.1 1 length=301 +#8#8#=CFGGGGGGGG#=D#######::CFGGGGGGGGGGGGGGGGGG####::CDC#:C#:CBFGGGGFCEFG#:C#:CC#:BFFFGG#######::CFGGG#######9:AFGGCGGGDGGCFFGG###+#88A@FEGGGGGCB#8#66?EFFEEGE##86###88@FFFGFA;#5*##45@D:CCFGGGGF@#/1**;C8CFGFFFCFGGGGEFGC?FFFFF8;FFA1;=C:24###2197@GFBF=#-1*###-#(##--0##---#((1-(#)#-(###(-,8:A))4FF#+3##35@FGGFGFGGEGGGGGE9AFFFFFFDF6CEGGCFGG8*8DFFAEFGGFB9GG31>1#2*9>>:FFF<@#-**###(#-##-,)##--(#--/4,#)#(-###(--)4>>7AFDF=EGGECE+6EGGGF##38>EGGG7FGFGF9D#5@E:DCEGGGGDGGGF>EDCECCFGCGFFGG=EECBEFGF).:>7ACDD;0*;)..:CGGGDFFGF08)4)0C*##-#,##--)##-,(#(,/,(#0#(,###(-,)4-)442.-:<:,# +@SRR10391187.4 4 length=300 +NAACACGAGCTGACGACAANNNNNNNCCACCTGTCACTCTGCCCCCGANNNNGACGTCCTATCTCTAGGATTGTCAGAGGACGTCAAGACCNGGNAAGGTTCTNCNNNNNGCTTCGAATTAAACCACATGCTCCACCGCCTGTGCGGGCCCCCGTCAATTCCTNTGAGTTTCAACCTTGCGGTCGTACCCCCCAGGCGGAGTGCTTAATGCGTTTGCTGCAGCACTGAAGGGCGGAAACCCTCCAACACTTAGCCCNCATNGTTTNCGGCNTGGACNCNCCNNNGTTCACATCCTGTTTG ++SRR10391187.4 4 length=300 +#8ACCGGEGGGGGGD7FFG#######::CFFGGEFCFGGGGG9CCECF####::CFFEGEDFGGG7BGGGGGG@F9FGG;#66DCFFGFGECDGF9CCC*:@EE8*6BEGCC5DEC8CCCC4>FGFF+>EEGG+096?44)7347**20CC@9((98C*107D4(2*-4*2)0#--(#(-,8#-(,(#(-1-(#)#(-###(((0().42(47CDFF \ No newline at end of file diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py new file mode 100644 index 0000000000..f3f8e7ee33 --- /dev/null +++ b/bio/emu/abundance/wrapper.py @@ -0,0 +1,44 @@ +__author__ = "Curro Campuzano Jimenez" +__copyright__ = "Copyright 2024, Curro Campuzano Jimenez" +__email__ = "campuzanocurro@gmail.com" +__license__ = "MIT" + + +from snakemake.shell import shell +import tempfile +import os + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) +extra = snakemake.params.get("extra", "") +threads = snakemake.threads or 1 + +# Check input (mandatory) +msg_error = "Please provide either one file of single-end 16S reads or two files of short paired-end 16S" +if not snakemake.input.get("reads"): + raise ValueError(msg_error) +reads = snakemake.input.get("reads") +if isinstance(reads, list) and len(reads) > 2: + raise ValueError(msg_error) + +# Check database (optional) +database_cmd = "" +if database := snakemake.input.get("database_dir"): + if not os.path.isdir(database): + raise ValueError("Please provide a valid Emu database directory") + database_cmd = f"--db {database}" + +with tempfile.TemporaryDirectory() as tmpdir: + shell( + "emu abundance {reads} {database_cmd}" + " --keep-files --output-dir {tmpdir}" + " --output-basename output --output-unclassified" + " --threads {threads}" + " {extra}" + " {log}" + ) + if out_tsv := snakemake.output.get("abundances"): + shell("mv {tmpdir}/output_rel-abundance.tsv {out_tsv}") + if out_sam := snakemake.output.get("alignments"): + shell("mv {tmpdir}/output_emu_alignments.sam {out_sam}") + if out_fa := snakemake.output.get("unclassified"): + shell("mv {tmpdir}/output_unclassified.fa {out_fa}") diff --git a/test.py b/test.py index 844963697f..5d4d748525 100644 --- a/test.py +++ b/test.py @@ -5678,3 +5678,33 @@ def test_barrnap(): "-F", ], ) + +@skip_if_not_modified +def test_emu_abundance(): + run( + "bio/emu/abundance", + [ + "snakemake", + "--cores", + "1", + "sample_rel-abundance.tsv", + "sample_emu_alignments.sam", + "sample_unclassified.fa", + "--use-conda", + "-F", + ], + ) + +@skip_if_not_modified +def test_emu_abundance_paired(): + run( + "bio/emu/abundance", + [ + "snakemake", + "--cores", + "1", + "short_read_rel-abundance_paired.tsv", + "--use-conda", + "-F", + ], + ) From 7949c5e2672a4401939b7ac595d664aeb4ca24c7 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 5 Apr 2024 14:58:35 +0200 Subject: [PATCH 02/26] Add collapse taxonomy --- bio/emu/collapse-taxonomy/environment.yaml | 6 +++ bio/emu/collapse-taxonomy/meta.yaml | 12 ++++++ bio/emu/collapse-taxonomy/test/Snakefile | 11 ++++++ .../test/full_length_rel-abundance.tsv | 3 ++ bio/emu/collapse-taxonomy/wrapper.py | 37 +++++++++++++++++++ test.py | 14 +++++++ 6 files changed, 83 insertions(+) create mode 100644 bio/emu/collapse-taxonomy/environment.yaml create mode 100644 bio/emu/collapse-taxonomy/meta.yaml create mode 100644 bio/emu/collapse-taxonomy/test/Snakefile create mode 100644 bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv create mode 100644 bio/emu/collapse-taxonomy/wrapper.py diff --git a/bio/emu/collapse-taxonomy/environment.yaml b/bio/emu/collapse-taxonomy/environment.yaml new file mode 100644 index 0000000000..a117676039 --- /dev/null +++ b/bio/emu/collapse-taxonomy/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - emu=3.4.5 \ No newline at end of file diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml new file mode 100644 index 0000000000..ae21e22eef --- /dev/null +++ b/bio/emu/collapse-taxonomy/meta.yaml @@ -0,0 +1,12 @@ +name: emu collapse-taxonomy +description: Collapse a TSV output file generated with emu at the desired taxonomic rank. +url: https://github.com/treangenlab/emu +authors: + - Curro Campuzano +input: + - A TSV output file generated with emu. +output: + - Another TSV output file collapsed at the desired taxonomic rank. +params: + rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom' + diff --git a/bio/emu/collapse-taxonomy/test/Snakefile b/bio/emu/collapse-taxonomy/test/Snakefile new file mode 100644 index 0000000000..1ff624cfc2 --- /dev/null +++ b/bio/emu/collapse-taxonomy/test/Snakefile @@ -0,0 +1,11 @@ +rule collapse_taxonomy: + input: + "full_length_rel-abundance.tsv" + output: + "full_length_rel-abundance_collapsed.tsv" + log: + "logs/emu/full_length_collapsed.log" + params: + rank="genus" + wrapper: + "master/bio/emu/collapse-taxonomy" \ No newline at end of file diff --git a/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv b/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv new file mode 100644 index 0000000000..fd065e577e --- /dev/null +++ b/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv @@ -0,0 +1,3 @@ +tax_id abundance superkingdom phylum class order family genus species estimated counts +1 1.0 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; 2.0 +unassigned 0.0 2.0 diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py new file mode 100644 index 0000000000..5271802867 --- /dev/null +++ b/bio/emu/collapse-taxonomy/wrapper.py @@ -0,0 +1,37 @@ +__author__ = "Curro Campuzano Jimenez" +__copyright__ = "Copyright 2024, Curro Campuzano Jimenez" +__email__ = "campuzanocurro@gmail.com" +__license__ = "MIT" + + +from snakemake.shell import shell +import tempfile +import os + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) + +input_file = snakemake.input[0] +if not isinstance(input_file, str) and len(snakemake.input) != 1: + raise ValueError( + "Input should be one TSV file generated with emu: " + str(input_file) + "!" + ) + +output_file = snakemake.output[0] +if not isinstance(output_file, str) and len(snakemake.output) != 1: + raise ValueError("Output should be one file: " + str(output_file) + "!") + +if not snakemake.params.get("rank"): + raise ValueError("Please provide a rank parameter") +rank = snakemake.params.get("rank") + +with tempfile.TemporaryDirectory() as tmpdir: + # Resolve the symbolic link and get the actual path of the input file + input_file_path = os.path.realpath(input_file) + # Create a symlink of the input file in the temporary directory + symlink_path = os.path.join(tmpdir, os.path.basename(input_file_path)) + os.symlink(input_file_path, symlink_path) + shell("emu collapse-taxonomy {symlink_path} {rank} {log}") + # Get the input file name without extension + name = os.path.splitext(os.path.basename(input_file_path))[0] + temp_out = f"{tmpdir}/{name}-{rank}.tsv" # it is always a tsv + shell("mv {temp_out} {output_file}") diff --git a/test.py b/test.py index 5d4d748525..92f386b89a 100644 --- a/test.py +++ b/test.py @@ -5708,3 +5708,17 @@ def test_emu_abundance_paired(): "-F", ], ) + +@skip_if_not_modified +def test_emu_collapse_taxonomy(): + run( + "bio/emu/collapse-taxonomy", + [ + "snakemake", + "--cores", + "1", + "full_length_rel-abundance_collapsed.tsv", + "--use-conda", + "-F", + ], + ) From d1d01fa88ef17d67f9d79c93fe944b98acafde38 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 5 Apr 2024 16:16:45 +0200 Subject: [PATCH 03/26] Add minimap combine output --- bio/emu/combine-outputs/environment.yaml | 6 ++ bio/emu/combine-outputs/meta.yaml | 14 +++++ bio/emu/combine-outputs/test/Snakefile | 32 +++++++++++ .../test/sample1_rel-abundance.tsv | 3 + .../test/sample2_rel-abundance.tsv | 3 + .../test/sample_rel-abundance.tsv | 3 + bio/emu/combine-outputs/wrapper.py | 55 +++++++++++++++++++ test.py | 29 ++++++++++ 8 files changed, 145 insertions(+) create mode 100644 bio/emu/combine-outputs/environment.yaml create mode 100644 bio/emu/combine-outputs/meta.yaml create mode 100644 bio/emu/combine-outputs/test/Snakefile create mode 100644 bio/emu/combine-outputs/test/sample1_rel-abundance.tsv create mode 100644 bio/emu/combine-outputs/test/sample2_rel-abundance.tsv create mode 100644 bio/emu/combine-outputs/test/sample_rel-abundance.tsv create mode 100644 bio/emu/combine-outputs/wrapper.py diff --git a/bio/emu/combine-outputs/environment.yaml b/bio/emu/combine-outputs/environment.yaml new file mode 100644 index 0000000000..a117676039 --- /dev/null +++ b/bio/emu/combine-outputs/environment.yaml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - emu=3.4.5 \ No newline at end of file diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml new file mode 100644 index 0000000000..0a477d5a30 --- /dev/null +++ b/bio/emu/combine-outputs/meta.yaml @@ -0,0 +1,14 @@ +name: emu combine-outputs +description: Collapse individual abundance tables TSV into a single TSV at the desired taxonomic rank. +url: https://github.com/treangenlab/emu +authors: + - Curro Campuzano +input: + - A list of TSV files obtained with emu abundance. They should contain 'rel-abundance' in the filename. +output: + - A TSV output file collapsed at the desired taxonomic rank. +params: + rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom' + counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'. +note: The sample columns in the final table will be the finale without the extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). + diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile new file mode 100644 index 0000000000..129de59324 --- /dev/null +++ b/bio/emu/combine-outputs/test/Snakefile @@ -0,0 +1,32 @@ +rule combine_outputs: + input: + expand( + "{sample}_rel-abundance.tsv", + sample = ["sample1", "sample2"] + ) + output: + "combined_abundances.tsv" + log: + "logs/emu/combined_abundances.log" + params: + rank="tax_id", + counts=False + wrapper: + "master/bio/emu/combine-outputs" + +rule combine_outputs_split: + input: + expand( + "{sample}_rel-abundance.tsv", + sample = ["sample1", "sample2"] + ) + output: + abundance = "counts.tsv", + taxonomy = "taxonomy.tsv", + log: + "logs/emu/combined_split.log" + params: + rank="genus", + counts=True + wrapper: + "master/bio/emu/combine-outputs" \ No newline at end of file diff --git a/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv new file mode 100644 index 0000000000..fd065e577e --- /dev/null +++ b/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv @@ -0,0 +1,3 @@ +tax_id abundance superkingdom phylum class order family genus species estimated counts +1 1.0 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; 2.0 +unassigned 0.0 2.0 diff --git a/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv new file mode 100644 index 0000000000..61af5e4f0a --- /dev/null +++ b/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv @@ -0,0 +1,3 @@ +tax_id abundance superkingdom phylum class order family genus species +1 1.0 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; +unassigned 0.0 diff --git a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv new file mode 100644 index 0000000000..89e3ce5712 --- /dev/null +++ b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv @@ -0,0 +1,3 @@ +tax_id abundance superkingdom phylum class order family genus species estimated counts +1 1.0 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; 2.0 +unassigned 0.0 0.0 diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py new file mode 100644 index 0000000000..c5ee3fb170 --- /dev/null +++ b/bio/emu/combine-outputs/wrapper.py @@ -0,0 +1,55 @@ +__author__ = "Curro Campuzano Jimenez" +__copyright__ = "Copyright 2024, Curro Campuzano Jimenez" +__email__ = "campuzanocurro@gmail.com" +__license__ = "MIT" + + +from snakemake.shell import shell +import tempfile +import os + +log = snakemake.log_fmt_shell(stdout=True, stderr=True) +extra = snakemake.params.get("extra", "") + +input_files = snakemake.input +if not isinstance(input_files, list): + raise ValueError("Input should be a list of files: " + str(input_files) + "!") + +if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"): + split = True + taxonomy = snakemake.output.get("taxonomy") + abundances = snakemake.output.get("abundance") +elif isinstance(snakemake.output[0], str): + split = False + table = snakemake.output[0] +else: + raise ValueError( + "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)" + ) + +if not snakemake.params.get("rank"): + raise ValueError("Please provide a rank parameter") +rank = snakemake.params.get("rank") +counts = snakemake.params.get("counts", False) + +with tempfile.TemporaryDirectory() as tmpdir: + for infile in input_files: + # Files has to end in tsv, and contain rel_abundances + temp = os.path.join(tmpdir, os.path.basename(infile)) + if not temp.endswith("rel_abundances.tsv"): + temp = os.path.splitext(infile)[0] + "-rel_abundances.tsv" + os.symlink(infile, temp) + if split and counts: + shell("emu combine-outputs {tmpdir} {rank} --split-tables --counts {log}") + shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}") + shell("mv {tmpdir}/emu-combined-abundance-{rank}-counts.tsv {abundances}") + elif split and not counts: + shell("emu combine-outputs {tmpdir} {rank} --split-tables {log}") + shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}") + shell("mv {tmpdir}/emu-combined-abundance-{rank}.tsv {abundances}") + elif not split and counts: + shell("emu combine-outputs {tmpdir} {rank} --counts {log}") + shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {table}") + elif not split and not counts: + shell("emu combine-outputs {tmpdir} {rank} {extra} {log}") + shell("mv {tmpdir}/emu-combined-{rank}.tsv {table}") diff --git a/test.py b/test.py index 92f386b89a..b86239659a 100644 --- a/test.py +++ b/test.py @@ -5722,3 +5722,32 @@ def test_emu_collapse_taxonomy(): "-F", ], ) + +@skip_if_not_modified +def test_emu_combine_output(): + run( + "bio/emu/combine-outputs", + [ + "snakemake", + "--cores", + "1", + "combined_abundances.tsv", + "--use-conda", + "-F", + ], + ) + +@skip_if_not_modified +def test_emu_combine_output_split(): + run( + "bio/emu/combine-outputs", + [ + "snakemake", + "--cores", + "1", + "counts.tsv", + "taxonomy.tsv", + "--use-conda", + "-F", + ], + ) From 8003ebbf293929cc8a7a841f27607762b0cadc06 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 5 Apr 2024 16:49:15 +0200 Subject: [PATCH 04/26] Update docs --- bio/emu/collapse-taxonomy/meta.yaml | 2 +- bio/emu/combine-outputs/meta.yaml | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml index ae21e22eef..781fe94605 100644 --- a/bio/emu/collapse-taxonomy/meta.yaml +++ b/bio/emu/collapse-taxonomy/meta.yaml @@ -6,7 +6,7 @@ authors: input: - A TSV output file generated with emu. output: - - Another TSV output file collapsed at the desired taxonomic rank. + - A TSV output file collapsed at the desired taxonomic rank. params: rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom' diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml index 0a477d5a30..d59358df91 100644 --- a/bio/emu/combine-outputs/meta.yaml +++ b/bio/emu/combine-outputs/meta.yaml @@ -4,11 +4,12 @@ url: https://github.com/treangenlab/emu authors: - Curro Campuzano input: - - A list of TSV files obtained with emu abundance. They should contain 'rel-abundance' in the filename. + - A list of TSV files obtained with emu abundance. output: - - A TSV output file collapsed at the desired taxonomic rank. + - A TSV containing either both abundances and taxonomy or only the abundances. + - Optionally, a TSV containing the taxonomy (if splitting the previous file in two). params: - rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom' + rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If no agglomeration is desired, use "tax_id". counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'. -note: The sample columns in the final table will be the finale without the extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). +note: The sample columns in the final table will be each filename without extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). From 6ba094f5d150b16cbe7b6c50b0b1d67259cd4ec7 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 5 Apr 2024 16:54:40 +0200 Subject: [PATCH 05/26] Lint files --- bio/emu/abundance/test/Snakefile | 29 +++++++++++----------- bio/emu/collapse-taxonomy/test/Snakefile | 10 ++++---- bio/emu/combine-outputs/test/Snakefile | 31 ++++++++++-------------- 3 files changed, 33 insertions(+), 37 deletions(-) diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile index f35eb8b00f..7d6a77276d 100644 --- a/bio/emu/abundance/test/Snakefile +++ b/bio/emu/abundance/test/Snakefile @@ -1,29 +1,30 @@ rule abundance: input: - reads = "{sample}.fa", - database_dir = "database" + reads="{sample}.fa", + database_dir="database", output: - abundances = "{sample}_rel-abundance.tsv", - alignments = "{sample}_emu_alignments.sam", - unclassified = "{sample}_unclassified.fa" + abundances="{sample}_rel-abundance.tsv", + alignments="{sample}_emu_alignments.sam", + unclassified="{sample}_unclassified.fa", log: - "logs/emu/{sample}_abundance.log" + "logs/emu/{sample}_abundance.log", params: - extra="--type map-ont --keep-counts" - threads: 3 # optional, defaults to 1 + extra="--type map-ont --keep-counts", + threads: 3 # optional, defaults to 1 wrapper: "master/bio/emu/abundance" + rule abundance_paired: input: - reads =[ "{sample}_R1.fq", "{sample}_R2.fq" ], - database_dir = "database" + reads=["{sample}_R1.fq", "{sample}_R2.fq"], + database_dir="database", output: - abundances = "{sample}_rel-abundance_paired.tsv", + abundances="{sample}_rel-abundance_paired.tsv", log: - "logs/emu/{sample}_abundance_paired.log" + "logs/emu/{sample}_abundance_paired.log", params: - extra="--type sr --keep-counts" - threads: 3 # optional, defaults to 1 + extra="--type sr --keep-counts", + threads: 3 # optional, defaults to 1 wrapper: "master/bio/emu/abundance" diff --git a/bio/emu/collapse-taxonomy/test/Snakefile b/bio/emu/collapse-taxonomy/test/Snakefile index 1ff624cfc2..91ec4a9d20 100644 --- a/bio/emu/collapse-taxonomy/test/Snakefile +++ b/bio/emu/collapse-taxonomy/test/Snakefile @@ -1,11 +1,11 @@ rule collapse_taxonomy: input: - "full_length_rel-abundance.tsv" + "full_length_rel-abundance.tsv", output: - "full_length_rel-abundance_collapsed.tsv" + "full_length_rel-abundance_collapsed.tsv", log: - "logs/emu/full_length_collapsed.log" + "logs/emu/full_length_collapsed.log", params: - rank="genus" + rank="genus", wrapper: - "master/bio/emu/collapse-taxonomy" \ No newline at end of file + "master/bio/emu/collapse-taxonomy" diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile index 129de59324..e8e28e8626 100644 --- a/bio/emu/combine-outputs/test/Snakefile +++ b/bio/emu/combine-outputs/test/Snakefile @@ -1,32 +1,27 @@ rule combine_outputs: input: - expand( - "{sample}_rel-abundance.tsv", - sample = ["sample1", "sample2"] - ) + expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]), output: - "combined_abundances.tsv" + "combined_abundances.tsv", log: - "logs/emu/combined_abundances.log" + "logs/emu/combined_abundances.log", params: - rank="tax_id", - counts=False + rank="tax_id", + counts=False, wrapper: "master/bio/emu/combine-outputs" + rule combine_outputs_split: input: - expand( - "{sample}_rel-abundance.tsv", - sample = ["sample1", "sample2"] - ) + expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]), output: - abundance = "counts.tsv", - taxonomy = "taxonomy.tsv", + abundance="counts.tsv", + taxonomy="taxonomy.tsv", log: - "logs/emu/combined_split.log" + "logs/emu/combined_split.log", params: - rank="genus", - counts=True + rank="genus", + counts=True, wrapper: - "master/bio/emu/combine-outputs" \ No newline at end of file + "master/bio/emu/combine-outputs" From 6d9228621246b8f9b7159564e4c237cbc416a08e Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:10:26 +0200 Subject: [PATCH 06/26] Update bio/emu/collapse-taxonomy/wrapper.py Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/emu/collapse-taxonomy/wrapper.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py index 5271802867..8539621c39 100644 --- a/bio/emu/collapse-taxonomy/wrapper.py +++ b/bio/emu/collapse-taxonomy/wrapper.py @@ -20,9 +20,7 @@ if not isinstance(output_file, str) and len(snakemake.output) != 1: raise ValueError("Output should be one file: " + str(output_file) + "!") -if not snakemake.params.get("rank"): - raise ValueError("Please provide a rank parameter") -rank = snakemake.params.get("rank") +rank = snakemake.params.get("rank", "species") with tempfile.TemporaryDirectory() as tmpdir: # Resolve the symbolic link and get the actual path of the input file From 8a6e8c133318820d213679006b23d371dd682523 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:12:12 +0200 Subject: [PATCH 07/26] Document default behavior --- bio/emu/collapse-taxonomy/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml index 781fe94605..3f56fe88fe 100644 --- a/bio/emu/collapse-taxonomy/meta.yaml +++ b/bio/emu/collapse-taxonomy/meta.yaml @@ -8,5 +8,5 @@ input: output: - A TSV output file collapsed at the desired taxonomic rank. params: - rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom' + rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, agglomeration will be done at the species level. From 3b4fd628a99f58e94708b42949786d3a05f91376 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:24:49 +0200 Subject: [PATCH 08/26] Remove unnecessary assertions --- bio/emu/collapse-taxonomy/wrapper.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py index 8539621c39..b155bd4b8a 100644 --- a/bio/emu/collapse-taxonomy/wrapper.py +++ b/bio/emu/collapse-taxonomy/wrapper.py @@ -11,15 +11,7 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) input_file = snakemake.input[0] -if not isinstance(input_file, str) and len(snakemake.input) != 1: - raise ValueError( - "Input should be one TSV file generated with emu: " + str(input_file) + "!" - ) - output_file = snakemake.output[0] -if not isinstance(output_file, str) and len(snakemake.output) != 1: - raise ValueError("Output should be one file: " + str(output_file) + "!") - rank = snakemake.params.get("rank", "species") with tempfile.TemporaryDirectory() as tmpdir: From 3ff00909c060041ab424f03822224075a53178f7 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:53:02 +0200 Subject: [PATCH 09/26] Modify params combine-outputs --- bio/emu/combine-outputs/meta.yaml | 4 ++-- bio/emu/combine-outputs/test/Snakefile | 5 +---- bio/emu/combine-outputs/wrapper.py | 16 ++++++++-------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml index d59358df91..228815066b 100644 --- a/bio/emu/combine-outputs/meta.yaml +++ b/bio/emu/combine-outputs/meta.yaml @@ -9,7 +9,7 @@ output: - A TSV containing either both abundances and taxonomy or only the abundances. - Optionally, a TSV containing the taxonomy (if splitting the previous file in two). params: - rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If no agglomeration is desired, use "tax_id". - counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'. + rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). + extra: Extra arguments (such as '--counts'). note: The sample columns in the final table will be each filename without extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile index e8e28e8626..d446c71419 100644 --- a/bio/emu/combine-outputs/test/Snakefile +++ b/bio/emu/combine-outputs/test/Snakefile @@ -5,9 +5,6 @@ rule combine_outputs: "combined_abundances.tsv", log: "logs/emu/combined_abundances.log", - params: - rank="tax_id", - counts=False, wrapper: "master/bio/emu/combine-outputs" @@ -22,6 +19,6 @@ rule combine_outputs_split: "logs/emu/combined_split.log", params: rank="genus", - counts=True, + extra="--counts", wrapper: "master/bio/emu/combine-outputs" diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index c5ee3fb170..2a4e7f75c2 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -17,6 +17,7 @@ if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"): split = True + extra += " --split-tables" taxonomy = snakemake.output.get("taxonomy") abundances = snakemake.output.get("abundance") elif isinstance(snakemake.output[0], str): @@ -27,10 +28,12 @@ "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)" ) -if not snakemake.params.get("rank"): - raise ValueError("Please provide a rank parameter") -rank = snakemake.params.get("rank") -counts = snakemake.params.get("counts", False) +if "--split-tables" in extra and not split: + raise ValueError("You cannot use --split-tables and produce a single output.") + +rank = snakemake.params.get("rank", "tax_id") +counts = "--counts" in extra + with tempfile.TemporaryDirectory() as tmpdir: for infile in input_files: @@ -39,17 +42,14 @@ if not temp.endswith("rel_abundances.tsv"): temp = os.path.splitext(infile)[0] + "-rel_abundances.tsv" os.symlink(infile, temp) + shell("emu combine-outputs {tmpdir} {rank} {extra} {log}") if split and counts: - shell("emu combine-outputs {tmpdir} {rank} --split-tables --counts {log}") shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}") shell("mv {tmpdir}/emu-combined-abundance-{rank}-counts.tsv {abundances}") elif split and not counts: - shell("emu combine-outputs {tmpdir} {rank} --split-tables {log}") shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}") shell("mv {tmpdir}/emu-combined-abundance-{rank}.tsv {abundances}") elif not split and counts: - shell("emu combine-outputs {tmpdir} {rank} --counts {log}") shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {table}") elif not split and not counts: - shell("emu combine-outputs {tmpdir} {rank} {extra} {log}") shell("mv {tmpdir}/emu-combined-{rank}.tsv {table}") From 608fdb62e3c00f096fa41544cb749f82438df6e5 Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:15:36 +0200 Subject: [PATCH 10/26] Add names to input and output --- bio/emu/abundance/meta.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bio/emu/abundance/meta.yaml b/bio/emu/abundance/meta.yaml index 5ddec3ad0c..9106bd2c70 100644 --- a/bio/emu/abundance/meta.yaml +++ b/bio/emu/abundance/meta.yaml @@ -4,11 +4,11 @@ url: https://github.com/treangenlab/emu authors: - Curro Campuzano input: - - Nucleotide sequence file(s) (either a single ONT or Pac-Bio fasta file, a single fastq file or paired fastq files) - - Optional. A emu database (i.e. a directory that contains at least the files "taxonomy.tsv" and "species_taxid.fasta", check documentation for pre-built databases and how to build them). + - reads: single fastq file or paired fastq files + - db: emu database (optional; check documentation for pre-built databases and how to build them). output: - - A TSV with relative (and optionally, absolute abundances). - - An optional SAM file with the alignments. - - An optional FASTA file with unclassified sequences. + - abundances: TSV with relative (and optionally, absolute abundances). + - alignments: SAM file with the alignments (optional). + - unclassified: FASTA file with unclassified sequences (optional). params: extra: Any optimal parameter such as --type (sequencer) or --min-abundance. Optional flags involving output are handled automatically (e.g. --output-dir, --output-basename ...) From b803b8aa44d4902725b1ed6ed5b4a7dd6d9d4129 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:27:02 +0200 Subject: [PATCH 11/26] Improve abundance emu Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/emu/abundance/wrapper.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py index f3f8e7ee33..af1e0c80ce 100644 --- a/bio/emu/abundance/wrapper.py +++ b/bio/emu/abundance/wrapper.py @@ -10,29 +10,16 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -threads = snakemake.threads or 1 - -# Check input (mandatory) -msg_error = "Please provide either one file of single-end 16S reads or two files of short paired-end 16S" -if not snakemake.input.get("reads"): - raise ValueError(msg_error) -reads = snakemake.input.get("reads") -if isinstance(reads, list) and len(reads) > 2: - raise ValueError(msg_error) - # Check database (optional) -database_cmd = "" -if database := snakemake.input.get("database_dir"): - if not os.path.isdir(database): - raise ValueError("Please provide a valid Emu database directory") - database_cmd = f"--db {database}" +if db:= snakemake.input.get("db"): + db = f"--db {db}" with tempfile.TemporaryDirectory() as tmpdir: shell( - "emu abundance {reads} {database_cmd}" + "emu abundance {snakemake.input.reads} {db}" " --keep-files --output-dir {tmpdir}" " --output-basename output --output-unclassified" - " --threads {threads}" + " --threads {snakemake.threads}" " {extra}" " {log}" ) From eb2466d3d5df7c3e91874d57ce27acffa88006c7 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:44:20 +0200 Subject: [PATCH 12/26] Fix typo snakefile --- bio/emu/abundance/test/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile index 7d6a77276d..466de94623 100644 --- a/bio/emu/abundance/test/Snakefile +++ b/bio/emu/abundance/test/Snakefile @@ -1,7 +1,7 @@ rule abundance: input: reads="{sample}.fa", - database_dir="database", + db="database", output: abundances="{sample}_rel-abundance.tsv", alignments="{sample}_emu_alignments.sam", @@ -18,7 +18,7 @@ rule abundance: rule abundance_paired: input: reads=["{sample}_R1.fq", "{sample}_R2.fq"], - database_dir="database", + db="database", output: abundances="{sample}_rel-abundance_paired.tsv", log: From cab7906815cc4b5df55ad365f7750e3571344cb0 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Mon, 8 Apr 2024 18:15:12 +0200 Subject: [PATCH 13/26] Linter --- bio/emu/abundance/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py index af1e0c80ce..998135990d 100644 --- a/bio/emu/abundance/wrapper.py +++ b/bio/emu/abundance/wrapper.py @@ -11,7 +11,7 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") # Check database (optional) -if db:= snakemake.input.get("db"): +if db := snakemake.input.get("db"): db = f"--db {db}" with tempfile.TemporaryDirectory() as tmpdir: From 16d424a63ef74a4ded8e32f10214df1e11a82871 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Mon, 8 Apr 2024 18:16:56 +0200 Subject: [PATCH 14/26] Add pin-conda --- .../abundance/environment.linux-64.pin.txt | 62 +++++++++++++++++++ .../environment.linux-64.pin.txt | 62 +++++++++++++++++++ .../environment.linux-64.pin.txt | 62 +++++++++++++++++++ 3 files changed, 186 insertions(+) create mode 100644 bio/emu/abundance/environment.linux-64.pin.txt create mode 100644 bio/emu/collapse-taxonomy/environment.linux-64.pin.txt create mode 100644 bio/emu/combine-outputs/environment.linux-64.pin.txt diff --git a/bio/emu/abundance/environment.linux-64.pin.txt b/bio/emu/abundance/environment.linux-64.pin.txt new file mode 100644 index 0000000000..a22b63bcaf --- /dev/null +++ b/bio/emu/abundance/environment.linux-64.pin.txt @@ -0,0 +1,62 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a +https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce +https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8 +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a +https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9 +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1 +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589 +https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a +https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5 +https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d +https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad +https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc +https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae +https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838 +https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a +https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 +https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7 +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c +https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b +https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be +https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40 +https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3 diff --git a/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt b/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt new file mode 100644 index 0000000000..a22b63bcaf --- /dev/null +++ b/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt @@ -0,0 +1,62 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a +https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce +https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8 +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a +https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9 +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1 +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589 +https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a +https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5 +https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d +https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad +https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc +https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae +https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838 +https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a +https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 +https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7 +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c +https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b +https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be +https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40 +https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3 diff --git a/bio/emu/combine-outputs/environment.linux-64.pin.txt b/bio/emu/combine-outputs/environment.linux-64.pin.txt new file mode 100644 index 0000000000..a22b63bcaf --- /dev/null +++ b/bio/emu/combine-outputs/environment.linux-64.pin.txt @@ -0,0 +1,62 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: linux-64 +@EXPLICIT +https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3 +https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a +https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce +https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8 +https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689 +https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d +https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b +https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4 +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f +https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8 +https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace +https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b +https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc +https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad +https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645 +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a +https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 +https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 +https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9 +https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1 +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc +https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209 +https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589 +https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a +https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2 +https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844 +https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155 +https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2 +https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8 +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5 +https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d +https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d +https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad +https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc +https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2 +https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae +https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a +https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04 +https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1 +https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838 +https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a +https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67 +https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7 +https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c +https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b +https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be +https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c +https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40 +https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3 From 36be3377a911ec819fcac80b5bf26cd1d6f75e30 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Thu, 11 Apr 2024 14:31:42 +0200 Subject: [PATCH 15/26] Both outputs named --- bio/emu/combine-outputs/meta.yaml | 5 +++-- bio/emu/combine-outputs/test/Snakefile | 4 ++-- bio/emu/combine-outputs/wrapper.py | 7 +++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml index 228815066b..45903b1ca2 100644 --- a/bio/emu/combine-outputs/meta.yaml +++ b/bio/emu/combine-outputs/meta.yaml @@ -6,8 +6,9 @@ authors: input: - A list of TSV files obtained with emu abundance. output: - - A TSV containing either both abundances and taxonomy or only the abundances. - - Optionally, a TSV containing the taxonomy (if splitting the previous file in two). + - Abundances. A TSV containing either the abundance of different taxa. + - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. + both abundances and taxonomy or only the abundances. params: rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). extra: Extra arguments (such as '--counts'). diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile index d446c71419..1b74cb8b0e 100644 --- a/bio/emu/combine-outputs/test/Snakefile +++ b/bio/emu/combine-outputs/test/Snakefile @@ -2,7 +2,7 @@ rule combine_outputs: input: expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]), output: - "combined_abundances.tsv", + abundances="combined_abundances.tsv", log: "logs/emu/combined_abundances.log", wrapper: @@ -13,7 +13,7 @@ rule combine_outputs_split: input: expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]), output: - abundance="counts.tsv", + abundances="counts.tsv", taxonomy="taxonomy.tsv", log: "logs/emu/combined_split.log", diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index 2a4e7f75c2..7d7768ebb6 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -15,14 +15,13 @@ if not isinstance(input_files, list): raise ValueError("Input should be a list of files: " + str(input_files) + "!") -if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"): +if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"): split = True extra += " --split-tables" taxonomy = snakemake.output.get("taxonomy") - abundances = snakemake.output.get("abundance") -elif isinstance(snakemake.output[0], str): + abundances = snakemake.output.get("abundances") +elif table := snakemake.output.get("abundances"): split = False - table = snakemake.output[0] else: raise ValueError( "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)" From f1c7edc0d19c36d1bf3e062348f7ca4d8e238a38 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Thu, 11 Apr 2024 14:40:25 +0200 Subject: [PATCH 16/26] Fix broken yaml --- bio/emu/combine-outputs/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml index 45903b1ca2..7808070eef 100644 --- a/bio/emu/combine-outputs/meta.yaml +++ b/bio/emu/combine-outputs/meta.yaml @@ -8,7 +8,6 @@ input: output: - Abundances. A TSV containing either the abundance of different taxa. - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. - both abundances and taxonomy or only the abundances. params: rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). extra: Extra arguments (such as '--counts'). From 893edf6df086ba5a7319905fe5def4b3c100978c Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Fri, 12 Apr 2024 13:46:12 +0200 Subject: [PATCH 17/26] Update bio/emu/combine-outputs/meta.yaml Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/emu/combine-outputs/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml index 7808070eef..3ab6e95d92 100644 --- a/bio/emu/combine-outputs/meta.yaml +++ b/bio/emu/combine-outputs/meta.yaml @@ -6,8 +6,8 @@ authors: input: - A list of TSV files obtained with emu abundance. output: - - Abundances. A TSV containing either the abundance of different taxa. - - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. + - abundances: TSV file containing the abundance of different taxa. + - taxonomy: TSV file containing the taxonomy (optional; otherwise, taxonomy will be included in the abundance table). params: rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). extra: Extra arguments (such as '--counts'). From 16b79d7980d61206ea65a368898a2a806cd436a7 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 12 Apr 2024 13:52:50 +0200 Subject: [PATCH 18/26] Add test case --- bio/emu/combine-outputs/test/Snakefile | 2 +- bio/emu/combine-outputs/test/sample1_rel-abundance.txt | 1 + bio/emu/combine-outputs/test/sample2_rel-abundance.txt | 1 + bio/emu/combine-outputs/test/sample_rel-abundance.tsv | 3 --- 4 files changed, 3 insertions(+), 4 deletions(-) create mode 120000 bio/emu/combine-outputs/test/sample1_rel-abundance.txt create mode 120000 bio/emu/combine-outputs/test/sample2_rel-abundance.txt delete mode 100644 bio/emu/combine-outputs/test/sample_rel-abundance.tsv diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile index 1b74cb8b0e..65973721a0 100644 --- a/bio/emu/combine-outputs/test/Snakefile +++ b/bio/emu/combine-outputs/test/Snakefile @@ -11,7 +11,7 @@ rule combine_outputs: rule combine_outputs_split: input: - expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]), + expand("{sample}_rel-abundance.txt", sample=["sample1", "sample2"]), output: abundances="counts.tsv", taxonomy="taxonomy.tsv", diff --git a/bio/emu/combine-outputs/test/sample1_rel-abundance.txt b/bio/emu/combine-outputs/test/sample1_rel-abundance.txt new file mode 120000 index 0000000000..6fb3595c9e --- /dev/null +++ b/bio/emu/combine-outputs/test/sample1_rel-abundance.txt @@ -0,0 +1 @@ +sample1_rel-abundance.tsv \ No newline at end of file diff --git a/bio/emu/combine-outputs/test/sample2_rel-abundance.txt b/bio/emu/combine-outputs/test/sample2_rel-abundance.txt new file mode 120000 index 0000000000..c74fd5a73f --- /dev/null +++ b/bio/emu/combine-outputs/test/sample2_rel-abundance.txt @@ -0,0 +1 @@ +sample2_rel-abundance.tsv \ No newline at end of file diff --git a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv deleted file mode 100644 index 89e3ce5712..0000000000 --- a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv +++ /dev/null @@ -1,3 +0,0 @@ -tax_id abundance superkingdom phylum class order family genus species estimated counts -1 1.0 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas amygdali; 2.0 -unassigned 0.0 0.0 From ac4157754abc3932208fa75adbfd0c171906fab5 Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:34:12 +0200 Subject: [PATCH 19/26] Code reformat --- bio/emu/combine-outputs/wrapper.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index 7d7768ebb6..aa3a477bd1 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -11,10 +11,6 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -input_files = snakemake.input -if not isinstance(input_files, list): - raise ValueError("Input should be a list of files: " + str(input_files) + "!") - if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"): split = True extra += " --split-tables" From 73f5a1f4b7ca75aeb01a7a75ee099ab8433085fb Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:34:33 +0200 Subject: [PATCH 20/26] Code update --- bio/emu/combine-outputs/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index aa3a477bd1..fb18ec92db 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -31,7 +31,7 @@ with tempfile.TemporaryDirectory() as tmpdir: - for infile in input_files: + for infile in snakemake.input: # Files has to end in tsv, and contain rel_abundances temp = os.path.join(tmpdir, os.path.basename(infile)) if not temp.endswith("rel_abundances.tsv"): From 34974a9cfb9a1a7a798f5ada399183642c58e4de Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:42:37 +0200 Subject: [PATCH 21/26] Code reformat --- bio/emu/combine-outputs/wrapper.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index fb18ec92db..a053c0e95c 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -11,17 +11,13 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"): +taxonomy = snakemake.output.get("taxonomy", "") +abundances = snakemake.output.get("abundances", "") +if taxonomy and abundances: split = True extra += " --split-tables" - taxonomy = snakemake.output.get("taxonomy") - abundances = snakemake.output.get("abundances") -elif table := snakemake.output.get("abundances"): +elise: split = False -else: - raise ValueError( - "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)" - ) if "--split-tables" in extra and not split: raise ValueError("You cannot use --split-tables and produce a single output.") From 1eeb8419cebe179df5aed78cd258100b216cc5bb Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:42:46 +0200 Subject: [PATCH 22/26] Code reformat --- bio/emu/combine-outputs/wrapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index a053c0e95c..d3d56a2c9b 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -41,6 +41,6 @@ shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}") shell("mv {tmpdir}/emu-combined-abundance-{rank}.tsv {abundances}") elif not split and counts: - shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {table}") + shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {abundances}") elif not split and not counts: - shell("mv {tmpdir}/emu-combined-{rank}.tsv {table}") + shell("mv {tmpdir}/emu-combined-{rank}.tsv {abundances}") From ea0940d830eb25a9b8a71fb2dd393cdda055b5be Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:43:13 +0200 Subject: [PATCH 23/26] Code cleanup --- bio/emu/combine-outputs/wrapper.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index d3d56a2c9b..f9426bae9d 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -19,9 +19,6 @@ elise: split = False -if "--split-tables" in extra and not split: - raise ValueError("You cannot use --split-tables and produce a single output.") - rank = snakemake.params.get("rank", "tax_id") counts = "--counts" in extra From f5acf5146fd7a9ed11c3bf42a3bd3a3b75f5b530 Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:43:44 +0200 Subject: [PATCH 24/26] Code cleanup --- bio/emu/abundance/wrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py index 998135990d..98f7c532b6 100644 --- a/bio/emu/abundance/wrapper.py +++ b/bio/emu/abundance/wrapper.py @@ -10,7 +10,6 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -# Check database (optional) if db := snakemake.input.get("db"): db = f"--db {db}" From 04c9dffb179302a836fc806360299d6a90040904 Mon Sep 17 00:00:00 2001 From: Curro Campuzano <69399781+currocam@users.noreply.github.com> Date: Fri, 12 Apr 2024 15:54:40 +0200 Subject: [PATCH 25/26] Update bio/emu/abundance/wrapper.py Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com> --- bio/emu/abundance/wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py index 98f7c532b6..d84af9247d 100644 --- a/bio/emu/abundance/wrapper.py +++ b/bio/emu/abundance/wrapper.py @@ -10,7 +10,8 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -if db := snakemake.input.get("db"): +db = snakemake.input.get("db", ""): +if db: db = f"--db {db}" with tempfile.TemporaryDirectory() as tmpdir: From d7c5b6c9c2c2bdfb4bce94d9d220dfeb719b43e4 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Fri, 12 Apr 2024 15:58:34 +0200 Subject: [PATCH 26/26] Fix typos --- bio/emu/abundance/wrapper.py | 2 +- bio/emu/combine-outputs/wrapper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py index d84af9247d..28a571f814 100644 --- a/bio/emu/abundance/wrapper.py +++ b/bio/emu/abundance/wrapper.py @@ -10,7 +10,7 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True) extra = snakemake.params.get("extra", "") -db = snakemake.input.get("db", ""): +db = snakemake.input.get("db", "") if db: db = f"--db {db}" diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py index f9426bae9d..e1d62ea787 100644 --- a/bio/emu/combine-outputs/wrapper.py +++ b/bio/emu/combine-outputs/wrapper.py @@ -16,7 +16,7 @@ if taxonomy and abundances: split = True extra += " --split-tables" -elise: +else: split = False rank = snakemake.params.get("rank", "tax_id")