From 097049f8562d76b2ebc532d5ccfc3fc0b93943b6 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 5 Apr 2024 13:32:54 +0200
Subject: [PATCH 01/26] Minimal support emu abundance

---
 bio/emu/abundance/environment.yaml            |  6 ++
 bio/emu/abundance/meta.yaml                   | 14 ++++
 bio/emu/abundance/test/Snakefile              | 29 ++++++++
 .../test/database/species_taxid.fasta         | 15 ++++
 bio/emu/abundance/test/database/taxonomy.tsv  |  2 +
 bio/emu/abundance/test/sample.fa              | 70 +++++++++++++++++++
 bio/emu/abundance/test/short_read_R1.fq       | 16 +++++
 bio/emu/abundance/test/short_read_R2.fq       | 16 +++++
 bio/emu/abundance/wrapper.py                  | 44 ++++++++++++
 test.py                                       | 30 ++++++++
 10 files changed, 242 insertions(+)
 create mode 100644 bio/emu/abundance/environment.yaml
 create mode 100644 bio/emu/abundance/meta.yaml
 create mode 100644 bio/emu/abundance/test/Snakefile
 create mode 100644 bio/emu/abundance/test/database/species_taxid.fasta
 create mode 100644 bio/emu/abundance/test/database/taxonomy.tsv
 create mode 100644 bio/emu/abundance/test/sample.fa
 create mode 100644 bio/emu/abundance/test/short_read_R1.fq
 create mode 100644 bio/emu/abundance/test/short_read_R2.fq
 create mode 100644 bio/emu/abundance/wrapper.py

diff --git a/bio/emu/abundance/environment.yaml b/bio/emu/abundance/environment.yaml
new file mode 100644
index 0000000000..a117676039
--- /dev/null
+++ b/bio/emu/abundance/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - emu=3.4.5
\ No newline at end of file
diff --git a/bio/emu/abundance/meta.yaml b/bio/emu/abundance/meta.yaml
new file mode 100644
index 0000000000..5ddec3ad0c
--- /dev/null
+++ b/bio/emu/abundance/meta.yaml
@@ -0,0 +1,14 @@
+name: emu abundance
+description: Generate relative abundance estimates from ONT, Pac-Bio or short 16S reads using emu.
+url: https://github.com/treangenlab/emu
+authors:
+  - Curro Campuzano
+input:
+  - Nucleotide sequence file(s) (either a single ONT or Pac-Bio fasta file, a single fastq file or paired fastq files)
+  - Optional. An emu database (i.e. a directory that contains at least the files "taxonomy.tsv" and "species_taxid.fasta"; check the documentation for pre-built databases and how to build them).
+output:
+  - A TSV with relative (and, optionally, absolute) abundances.
+  - An optional SAM file with the alignments.
+  - An optional FASTA file with unclassified sequences.
+params:
+  extra: Any optional parameter such as --type (sequencer) or --min-abundance. Optional flags involving output are handled automatically (e.g. --output-dir, --output-basename ...)
diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile
new file mode 100644
index 0000000000..f35eb8b00f
--- /dev/null
+++ b/bio/emu/abundance/test/Snakefile
@@ -0,0 +1,29 @@
+rule abundance:
+    input:
+      reads = "{sample}.fa",
+      database_dir = "database"
+    output:
+      abundances = "{sample}_rel-abundance.tsv",
+      alignments = "{sample}_emu_alignments.sam",
+      unclassified = "{sample}_unclassified.fa"
+    log:
+        "logs/emu/{sample}_abundance.log"
+    params:
+      extra="--type map-ont --keep-counts"
+    threads: 3 # optional, defaults to 1
+    wrapper:
+        "master/bio/emu/abundance"
+
+rule abundance_paired:
+    input:
+      reads =[ "{sample}_R1.fq", "{sample}_R2.fq" ],
+      database_dir = "database"
+    output:
+      abundances = "{sample}_rel-abundance_paired.tsv",
+    log:
+        "logs/emu/{sample}_abundance_paired.log"
+    params:
+      extra="--type sr --keep-counts"
+    threads: 3 # optional, defaults to 1
+    wrapper:
+        "master/bio/emu/abundance"
diff --git a/bio/emu/abundance/test/database/species_taxid.fasta b/bio/emu/abundance/test/database/species_taxid.fasta
new file mode 100644
index 0000000000..4e0e691a19
--- /dev/null
+++ b/bio/emu/abundance/test/database/species_taxid.fasta
@@ -0,0 +1,15 @@
+>1:emu-silva:1 ['dada2-silva_1 Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Pseudomonadaceae;Pseudomonas;amygdali;']
+AACTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAG
+TCGAGCGGCAGCACGGGTACTTGTACCTGGTGGCGAGCGGCGGACGGGTGAGTAATGCCT
+AGGAATCTGCCTGGTAGTGGGGGATAACGCTCGGAAACGGACGCTAATACCGCATACGTC
+CTACGGGAGAAAGCAGGGGACCTTCGGGCCTTGCGCTATCAGATGAGCCTAGGTCGGATT
+AGCTAGTTGGTGAGGTAATGGCTCACCAAGGCGACGATCCGTAACTGGTCTGAGAGGATG
+ATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAAT
+ATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGTCTTCGGA
+TTGTAAAGCACTTTAAGTTGGGAGGAAGGGCAGTTACCTAATACGTATCTGTTTTGACGT
+TACCGACAGAATAAGCACCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACAGAGGGTGC
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAA
+GCGCACNNNNGCGGTCTGTCAAGTCGGANGNNAAATCCCCGGGCTNNNNNNNGGAACTGC
+ATTCGAAACTGNCAGGCTTGAGTCTTGTAGAGGGNNNTNGNATTCNNNGTGTAGCGNNNN
+NNTGCGTAGAGATCTGGANGAACACCAGTGGCGAAGGCGGCTCTCTNGTCTGTAACTGAC
+GCTGAGGCTCGAAAGCNTGGGGAGCAAACAGGATTAGATANCCTGGTAGTCCACG
\ No newline at end of file
diff --git a/bio/emu/abundance/test/database/taxonomy.tsv b/bio/emu/abundance/test/database/taxonomy.tsv
new file mode 100644
index 0000000000..c6a3209d65
--- /dev/null
+++ b/bio/emu/abundance/test/database/taxonomy.tsv
@@ -0,0 +1,2 @@
+tax_id	superkingdom	phylum	class	order	family	genus	species
+1	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;
\ No newline at end of file
diff --git a/bio/emu/abundance/test/sample.fa b/bio/emu/abundance/test/sample.fa
new file mode 100644
index 0000000000..985df2df9c
--- /dev/null
+++ b/bio/emu/abundance/test/sample.fa
@@ -0,0 +1,70 @@
+>Sphingobacterium_puteal_r1
+ACGGGTGCGTAACGCGTGAGCAACCTACCTCTATCAGGGGGATAGCCTCTCGAAAGAGAGATTAACACCGCATAACA
+TCAACAGTTCGCATGTTCGGTTGATTAAATATTTATAGGATAGAGATGGGCTCGCGTGACATTAGCTAGTTGGTAGGGTA
+ACGGCCTACCAAGGCGACGATGTCTAGGGGCTCTGAGAGGAGAATCCCCCACACTGGTACTGAGACACGGACCAGACTCC
+TACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGCAGGAAGACTGCCCTA
+TGGGTTGTAAACTGCTTTTGTCCAGGAATAAACCTCTTTACGTGTAGAGAGCTGAATGTACTGGAAGAATAAGGATCGGC
+TAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCG
+GCCTGTTAAGTCAGGGGTGAAATACGGTGGCTCAACCATCGCAGTGCCTTTGATACTGACGGGCTTGAATCCATTTGAAG
+TGGGCGGAATAAGACAAGTAGCGGTGAAATGCATAGATATGTCTTAGAACTCCGATTGCGAAGGCAGCTCACTAAGCTGG
+TATTGACGCTGATGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGATAACT
+CGATGTTGGCGATAGACCGCCAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGGAGTACGCCCGCAAGGGTGAAAC
+TCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAGCATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGC
+TTGAAAGTTAGTGAAGGATGCGGAGACGCATCCGTCCTTCGGGACACGAAACTAGGTGCTGCATGGCTGTCGTCAGCTCG
+TGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATGTTTAGTTGCCAGCAAGTAATGTTGGGGACTCTA
+AACAGACTGCCTGCGCAAGCAGAGAGGAAGGTGGGGACGACGTCAAGTCATCATGGCCCTTACGTCCGGGGCTACACACG
+TGCTACAATGGATGGTACAGCGGGCAGCTACATAGCAATATGGTGCTAATCTCTAAAAGCCATTCACAGTTCGGATTGGG
+GTCTGCAACTCGACCCCATGAAGTTGGATTCGCTAGTAATCGCGTATCAGC
+>Sphingobacterium_puteal_r2
+GGCCTAATACATGCAAGTCGGACGGGATTTAAGTTAAAGCTTGCTTTAAGTTAATGAGAGTGG
+CGCACGGGTGCGTAACGCGTGAGCAACCTACCTCTATCAGGGGGATAGCCTCTCGAAAGAGAGATTAACACCGCATAACA
+TCAACAGTTCGCATGTTCGGTTGATTAAATATTTATAGGATAGAGATGGGCTCGCGTGACATTAGCTAGTTGGTAGGGTA
+ACGGCCTACCAAGGCGACGATGTCTAGGGGCTCTGAGAGGAGAATCCCCCACACTGGTACTGAGACACGGACCAGACTCC
+TACGGGAGGCAGCAGTAAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCATGCCGCGTGCAGGAAGACTGCCCTA
+TGGGTTGTAAACTGCTTTTGTCCAGGAATAAACCTCTTTACGTGTAGAGAGCTGAATGTACTGGAAGAATAAGGATCGGC
+TAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATCCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCG
+GCCTGTTAAGTCAGGGGTGAAATACGGTGGCTCAACCATCGCAGTGCCTTTGATACTGACGGGCTTGAATCCATTTGAAG
+TGGGCGGAATAAGACAAGTAGCGGTGAAATGCATAGATATGTCTTAGAACTCCGATTGCGAAGGCAGCTCACTAAGCTGG
+TATTGACGCTGATGCACGAAAGCGTGGGGATCGAACAGGATTAGATACCCTGGTAGTCCACGCCCTAAACGATGATAACT
+CGATGTTGGCGATAGACCGCCAGCGTCCAAGCGAAAGCGTTAAGTTATCCACCTGGGGAGTACGCCCGCAAGGGTGAAAC
+TCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAGCATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGC
+TTGAAAGTTAGTGAAGGATGCGGAGACGCATCCGTCCTTCGGGACACGAAACTAGGTGCTGCATGGCTGTCGTCAGCTCG
+TGCCGTGAGGTGTTGGGTTAAGTCCCGCAACGAGCGCAACCCCTATGTTTAGTTGCCAGCAAGTAATGTTGGGGACTCTA
+AACAGACTGCCTGCGCAAGCAGAGAGGAAGGTGGGGACGACGTCAAGTCATCATGGCCCTTACGTCCGGGGCTACACACG
+TGCTACAATGGATGGTACAGCGGGCAGCTACATAGCAATATGGTGCTAATCTCTAAAAGCCATTCACAGTTCGGATTGGG
+GTCTGCAACTCGACCCCATGAAGTTGGATTCGCTAGTAATCGCGTATCAGCAATGACGCGGTGAATACGTTCCCGGGCCT
+TGTACACA
+>Mycobacterium_saskatchewanense_r1
+AGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAGGTCTCTTCGGAGATAC
+TCGAGTGGCGAACGGGTGAGTAACACGTGGGCAATCTGCCCTGCACTTCGGGATAAGCCTGGGAAACTGGGTCTAATACC
+GGATAGGACCTTTAGGCGCATGCCTTTTGGTGGAAAGCTTTTGCGGTGTGGGATGGGCCCGCGGCCTATCAGCTTGTTGG
+TGGGGTGATGGCCTACCAAGGCGACGACGGGTAGCCGGCCTGAGAGGGTGTCCGGCCACACTGGGACTGAGATACGGCCC
+AGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGGGGGATGAC
+GGCCTTCGGGTTGTAAACCTCTTTCAGCAGGGACGAAGCGCAAGTGACGGTACCTGCAGAAGAAGCACCGGCCAACTACG
+TGCCAGCAGCCGCGGTAATACGTAGGGTGCGAGCGTTGTCCGGAATTACTGGGCGTAAAGAGCTCGTAGGTGGTTTGTCG
+CGTTGTTCGTGAAATCTCACGGCTTAACTGTGAGCGTGCGGGCGATACGGGCAGACTAGAGTACTGCAGGGGAGACTGGA
+ATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCAGTAACTGACG
+CTGAGGAGCGAAAGCGTGGGGAGCGAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGGTGGGTACTAGGTGTGG
+GTTTCCTTCCTTGGGATCCGTGCCGTAGCTAACGCATTAAGTACCCCGCCTGGGGAGTACGGCCGCAAGGCTAAAACTCA
+AAGGAATTGACGGGGGCCCGCACAAGCGGCGGAGCATGTGGATTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTG
+ACATGCACAGGACGCCGGCAGAGATGTCGGTTCCCTTGTGGCCTGTGTGCAGGTGGTGCATGGCTGTCGTCAGCTCGTGT
+CGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTCTCATGTTGCCAGCGGGTAATGCCGGGGACTCGTGAG
+AGACTGCCGGGGTCAACTCGGAGGAAGGTGGGGATGACGTCAAGTCATCATG
+>Streptococcus_sobrinus_r1
+AGTGTTACTAATGAGTCGCGAACGGGTGAGTAACGCGTAGGTAACCTGCCTGATAGCGGGGGATAACTATTGGAAACGAT
+AGCTAATACCGCATAAGAGGAGTTAACTCATGTTAACTGTTTAAAAGAAGCCATTGCTTCACTATCAGAGGACCTGCGT
+TGTATTAGCTAGTAGGTAGGGTAACGGCCTACCTAGGCAACGATACATAGCCGACCTGAGAGGGTGAACGGCCACACTGG
+GACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTAGGGAATCTTCGGCAATGGACGCAAGTCTGACCGAGCAACG
+CCGCGTGAGTGAAGACGGTTTTCGGATCGTAAAGCTCTGTTGTAGGGGAAGAACGTGTGTAAGAGTGGAAAGCTTACACA
+GTGACGGTACCCTACCAGAAAGGGACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTCCCGAGCGTTGTCCGG
+ATTTATTGGGCGTAAAGGGAGCGCAGGCGGTTTAGTAAGTCTGAAGTTAAAGGCATTGGCTCAACCAATGTATGCTTTGG
+AAACTGTTAGACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACAC
+CGGTGGCGAAAGCGGCTCTCTGGTCTGTCACTGACGCTGAGGCTCGAAAGCGTGGGTAGCGAACAGGATTAGATACCCTG
+GTAGTCCACGCCGTAAACGCTGAGTGCTAGGTGTTAGGTCCTTTCCGGGACTTAGTGCCGACGCTAACGCATTAAGCACT
+CCGCCTGGGGAGTACGACCGCAAGGTTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTA
+ATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCCGATGCCCGCTCTAGAGATAGAGTTTTTCTTCGGAACAT
+CGGAGACAGGTGGTGCATGGTTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTAT
+TGTTAGTTGCCATCATTAAGTTGGGCACTCTAGCGAGACTGCCGGTAATAAACCGGAGGAAGGTGGGGATGACGTCAAAT
+CATCATGCCCCTTATGACCTGGGCTACACACGTGCTACAATGGTTGGTACAACGAGTCGCAAGCCGGTGACGGCAAGCTA
+ATCTCTGAAAGCCAATCTCAGTTCGGATTGTAGGCTGCAACTCGCCTACATGAAGTCGGAATCGCTAGTAATCGCGGATC
+AGCACGCCGCGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCACGAGAGTTTGTAACACCCAAAGTCGGT
\ No newline at end of file
diff --git a/bio/emu/abundance/test/short_read_R1.fq b/bio/emu/abundance/test/short_read_R1.fq
new file mode 100644
index 0000000000..06c5874275
--- /dev/null
+++ b/bio/emu/abundance/test/short_read_R1.fq
@@ -0,0 +1,16 @@
+@SRR10391187.1 1 length=293
+GTGCCAGCCGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGNTNNNNNNCGGTTCCTTAAGNNTGANNNNNNANCCCCCGGCTNNNNNNNGGAGNGTCNNNGGAANCTNNGGAACTTGAGTGCAGAAGAGGANNNNNNNNTNCNNNGTGTAGCNNNNNNNTGCGTAGAGATGTGNNNNNNCACCAGTGNNNANNNNGACTCTNNNNNNNGTAANTGNNNNTGNGNANCNAANNNNNNGGGAGCGNNNNNNNTTAGATANNNNNNNAGTACA
++SRR10391187.1 1 length=293
+CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG#:######::CDFEGGGGGG##:CC######:#::?FGGGGF#######::BF#:BC###::BF#:B##::+BFFFGGFGGGGGFFGGGGF########6#6###86>FFGD#######66=CCEGCG?CGFF######*43BFGGC###3####*3/1;+#######*2;C#22####11#1#*#0#22######1131FE@#######(.04:A<#######(--(06
+@SRR10391187.2 2 length=293
+GTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGNGNNNNNNCGGTTTTTTAAGTNTGANNNNANAGCCCACGGCTNNNNNNNGGAGNGTCNNNGGAANCTNNAAAACTTGAGTGCAGAAGAGGANNNNNNNNTNCNNNGTGTAGCNNNNNNNTGCGCAGAGATATGGNNNNACACCAGTGNNGANNNNGACTTTNNNNNNNGTAANTGNNNNTGNTNTNCNAANNNNNNGGGATCANNNNGNNTTAGATANNNNNNNAGTCCA
++SRR10391187.2 2 length=293
+CCCCCGGGGGGGGGGGGGGGGGGFGGGFGGGGGGGGGGGGGGGGGGGGGGGDGFGGGGGGGGG#:######::DFDGGGGGGGG#:CD####:#:BFDGGGGGGG#######::DF#:BB###:8>D#::##8:=FEGGGGGGGGGGGGGACFG########6#6###86@FFGC#######*6>FGGGGGGGFGGG####31=CFGGGF##33####13=C>F#######/2;C#**####*1#*#/#.#22######1186*8>####0##(.08?F<#######(-4:FF
+@SRR10391187.3 3 length=295
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGNANNNNNGCGGTCTGTCAAGTNGGANNNNAAATCCCCGGGCTNNNNNNNGGAACTGCNTTCGAAACTNNCAGGCTTGAGTCTTGTAGAGGGNNNGNGNATNCNNNGTGTAGCNNNNNNNAGCGTAGAGATCGGGANNNATACCGGTGGNGANNGCGGCCCCNNNNANNAAGAATGANGCTCAGNTGCGAANNNNNNGGGAGCANNNNGGATTAGATANNNNNNTAGCCCACG
++SRR10391187.3 3 length=295
+CCCCCGGGGGGGGGGGGGGGGGGGGGGGDGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGG#:#####::CDEGGGGGGGDG#:9C####::BDFGGGGGGGF#######:8BFGGFG#:BCFFGGG9##::>CGGGGGGGFGGGGGGGGGG###:#6#65#5###861>FF,#######**6@BCB>@CE9**43###341CF=DGC@#13##22**;CGG####*##22;**0+0#*19CF4#*/.:/C######/138C>@####00-4;4<:9:######)((-2<?<5
+@SRR10391187.4 4 length=295
+GTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACNNNNGCGGTCTGTCAAGTCGGANGNNAAATCCCCGGGCTNNNNNNNGGAACTGCATTCGAAACTGNCAGGCTTGAGTCTTGTAGAGGGNNNTNGNATTCNNNGTGTAGCGNNNNNNTGCGTAGAGATCTGGANGAACACCAGTGGCGAAGGCGGCTCTCTNGTCTGTAACTGACGCTGAGGCTCGAAAGCNTGGGGAGCAAACAGGATTAGATANCCTGGTAGTCCACG
++SRR10391187.4 4 length=295
+CCCCCGGGGGGGGGGGG@FGGGGEGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGGGEEGGGFGGGF####:,:DFGGGGGFGGGFGGC#:##::<DDGGGGGGGG#######::BFEFGGCGGGEFG7FGF#:B>FGGGGGFGGCGGGGGFFGG###:#6#6@FF###86DFGCFG######*6=CCCCEGGD,,2?+#/*3BEF>FGGGG58EGGGFGGGGGGG?#2:7<CCCGFC9CFGD*95C@C=5:=0)))#0/1723DD>F793556384,)-+#217CFFA<8A*6><
\ No newline at end of file
diff --git a/bio/emu/abundance/test/short_read_R2.fq b/bio/emu/abundance/test/short_read_R2.fq
new file mode 100644
index 0000000000..ffa6936601
--- /dev/null
+++ b/bio/emu/abundance/test/short_read_R2.fq
@@ -0,0 +1,16 @@
+@SRR10391187.1 1 length=301
+NANCNCGAGCTGACGANAANNNNNNNCCACCTGTCACTCTGCCCCCGANNNNGACGTNCTNTCTCTAGGATTGTNAGNGGANGTCAAGANNNNNNNAGGTTCTNNNNNNNGCTTCGAATTAAACCACANNNTNCACCGCTTGTGCGNGNCCCCGTCAATNNCTNNNAGTTTCAGTCNTGNNACCGTACTCCCCAGNCGGAGTGCTTAATGCGTTAGCTGCAGCACTAAGGGGCGGAANNNCCCTAACACTNAGCNNNCNTNNTTTNNGGCNGGGAGNANCCNNNGATCTAATCCTGTTTNN
++SRR10391187.1 1 length=301
+#8#8#=CFGGGGGGGG#=D#######::CFGGGGGGGGGGGGGGGGGG####::CDC#:C#:CBFGGGGFCEFG#:C#:CC#:BFFFGG#######::CFGGG#######9:AFGGCGGGDGGCFFGG###+#88A@FEGGGGGCB#8#66?EFFEEGE##86###88@FFFGFA;#5*##45@D:CCFGGGGF@#/1**;C8CFGFFFCFGGGGEFGC?FFFFF8;FFA1;=C:24###2197@GFBF=#-1*###-#(##--0##---#((1-(#)#-(###(-,8:A))4<AAGA)##
+@SRR10391187.2 2 length=301
+NANCNCAGCTGACGACNACNNNNNNNCACCTGTCACTTTGTCCCCGAANNNNAAGCTCTGTCTCCAGAGTGGTCNAAGGATNTCAAGACNNNNNNNGGTTCTTNNNNNNNCTTCGAATTAAACCACATNCNCNACCGCTTGTGCGGNCCCCCGTCAATTNNTTNNNGTTTCAACCTNTCNNTCGTACTCCCCAGGCGGAGTGCTTAATGCGTTAGCCGCAGCACTAAGGGGCGGAAACNCACTAACACTTNGCANNNANCNNTTANNGCGNGGACTNCNAGNNNATCTAATCCTGTTTGAN
++SRR10391187.2 2 length=301
+#8#8#=CFGGGEGGGG#=C#######::CFGGGGGGGGG9FFGGGGD@####::DFFGGGGGGGGGEFGGGGGG#:CDFGG#:BFFGGG#######:::DCFG#######4:AFFFGGFGFFGFGGGG#:#9#99D?FGGGFFGGG#6A@DDEEGGDFF##86###88@FFF:>FF#+3##35@FGGFGFGGEGGGGGE9AFFFFFFDF6CEGGCFGG8*8DFFAEFGGFB9GG31>1#2*9>>:FFF<@#-**###(#-##-,)##--(#--/4,#)#(-###(--)4>>7AFDF=<?)#
+@SRR10391187.3 3 length=301
+NANCACGAGCTGACGACAGNNNNNNNGCACCTGTCTCACGGTTCCCGANNNNACATTCTCATCTCTGAAAACTTCCGTGGANGTCAAGACNNGNNNAGGTTCTNNNNNNNGCATCGAATTAAACCACANGNTCCACCGCTTGTGCGNGCCCCCGTCAATTCATNNGAGTTTTAACCTTGCNGCCGTACTCCCCAGGCGGTCTACTTAACGCGTTAGCTCCGGAAGCCACGCCTCAAGGGCACAACCTACCAGTAANNANCNNTTTNNGCGNGGGCCNCNCGNNNGTCTAATCCTGTTTGTN
++SRR10391187.3 3 length=301
+#8#8AFFFGGGGGGGGGGG#######::CFGGGFGGGFFCFGGGGGD@####6:CFGFFGGGGFGGGGFGAFGGGGEGDGC#:BBFCEGG##:###::CFGCF#######9:A+DCFGFFGGGFGGGG#+#+9AFG8EGGGFFFGE#68>EGGECE+6EGGGF##38>EGGG7FGFGF9D#5@E:DCEGGGGDGGGF>EDCECCFGCGFFGG=EECBEFGF).:>7ACDD;0*;)..:CGGGDFFGF08)4)0C*##-#,##--)##-,(#(,/,(#0#(,###(-,)4-)442.-:<:,#
+@SRR10391187.4 4 length=300
+NAACACGAGCTGACGACAANNNNNNNCCACCTGTCACTCTGCCCCCGANNNNGACGTCCTATCTCTAGGATTGTCAGAGGACGTCAAGACCNGGNAAGGTTCTNCNNNNNGCTTCGAATTAAACCACATGCTCCACCGCCTGTGCGGGCCCCCGTCAATTCCTNTGAGTTTCAACCTTGCGGTCGTACCCCCCAGGCGGAGTGCTTAATGCGTTTGCTGCAGCACTGAAGGGCGGAAACCCTCCAACACTTAGCCCNCATNGTTTNCGGCNTGGACNCNCCNNNGTTCACATCCTGTTTG
++SRR10391187.4 4 length=300
+#8ACCGGEGGGGGGD7FFG#######::CFFGGEFCFGGGGG9CCECF####::CFFEGE<EFBFEFFGGFFGGGGA<EFCGGGGG:CFGG#:B#,:B@FGGG#4#####+:+9AEFFGGGGCFEFFCBFG,EFF?EED+@D>DFGGG7BGGGGGG@F9FGG;#66DCFFGFGECDGF9CCC*:@EE8*6BEGCC5DEC8CCCC4>FGFF+>EEGG+096?44)7347**20CC@9((98C*107D4(2*-4*2)0#--(#(-,8#-(,(#(-1-(#)#(-###(((0().42(47CDFF
\ No newline at end of file
diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
new file mode 100644
index 0000000000..f3f8e7ee33
--- /dev/null
+++ b/bio/emu/abundance/wrapper.py
@@ -0,0 +1,48 @@
+__author__ = "Curro Campuzano Jimenez"
+__copyright__ = "Copyright 2024, Curro Campuzano Jimenez"
+__email__ = "campuzanocurro@gmail.com"
+__license__ = "MIT"
+
+
+from snakemake.shell import shell
+import tempfile
+import os
+
+# Redirect both stdout and stderr of the emu call to the rule's log file.
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+extra = snakemake.params.get("extra", "")
+threads = snakemake.threads or 1
+
+# Check input (mandatory): one file of single-end reads, or at most two
+# files of paired-end reads.
+msg_error = "Please provide either one file of single-end 16S reads or two files of short paired-end 16S"
+reads = snakemake.input.get("reads")
+if not reads:
+    raise ValueError(msg_error)
+if isinstance(reads, list) and len(reads) > 2:
+    raise ValueError(msg_error)
+
+# Check database (optional); without it no --db flag is passed to emu.
+database_cmd = ""
+if database := snakemake.input.get("database_dir"):
+    if not os.path.isdir(database):
+        raise ValueError("Please provide a valid Emu database directory")
+    database_cmd = f"--db {database}"
+
+# emu is told to write under a fixed basename in a scratch directory; only the
+# outputs requested by the rule are then moved to their final destination.
+with tempfile.TemporaryDirectory() as tmpdir:
+    shell(
+        "emu abundance {reads} {database_cmd}"
+        " --keep-files --output-dir {tmpdir}"
+        " --output-basename output --output-unclassified"
+        " --threads {threads}"
+        " {extra}"
+        " {log}"
+    )
+    if out_tsv := snakemake.output.get("abundances"):
+        shell("mv {tmpdir}/output_rel-abundance.tsv {out_tsv}")
+    if out_sam := snakemake.output.get("alignments"):
+        shell("mv {tmpdir}/output_emu_alignments.sam {out_sam}")
+    if out_fa := snakemake.output.get("unclassified"):
+        shell("mv {tmpdir}/output_unclassified.fa {out_fa}")
diff --git a/test.py b/test.py
index 844963697f..5d4d748525 100644
--- a/test.py
+++ b/test.py
@@ -5678,3 +5678,33 @@ def test_barrnap():
             "-F",
         ],
     )
+
+@skip_if_not_modified
+def test_emu_abundance():
+    run(
+        "bio/emu/abundance",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "sample_rel-abundance.tsv",
+            "sample_emu_alignments.sam",
+            "sample_unclassified.fa",
+            "--use-conda",
+            "-F",
+        ],
+    )
+
+@skip_if_not_modified
+def test_emu_abundance_paired():
+    run(
+        "bio/emu/abundance",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "short_read_rel-abundance_paired.tsv",
+            "--use-conda",
+            "-F",
+        ],
+    )

From 7949c5e2672a4401939b7ac595d664aeb4ca24c7 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 5 Apr 2024 14:58:35 +0200
Subject: [PATCH 02/26] Add collapse taxonomy

---
 bio/emu/collapse-taxonomy/environment.yaml    |  6 +++
 bio/emu/collapse-taxonomy/meta.yaml           | 12 ++++++
 bio/emu/collapse-taxonomy/test/Snakefile      | 11 ++++++
 .../test/full_length_rel-abundance.tsv        |  3 ++
 bio/emu/collapse-taxonomy/wrapper.py          | 37 +++++++++++++++++++
 test.py                                       | 14 +++++++
 6 files changed, 83 insertions(+)
 create mode 100644 bio/emu/collapse-taxonomy/environment.yaml
 create mode 100644 bio/emu/collapse-taxonomy/meta.yaml
 create mode 100644 bio/emu/collapse-taxonomy/test/Snakefile
 create mode 100644 bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv
 create mode 100644 bio/emu/collapse-taxonomy/wrapper.py

diff --git a/bio/emu/collapse-taxonomy/environment.yaml b/bio/emu/collapse-taxonomy/environment.yaml
new file mode 100644
index 0000000000..a117676039
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - emu=3.4.5
\ No newline at end of file
diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml
new file mode 100644
index 0000000000..ae21e22eef
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/meta.yaml
@@ -0,0 +1,12 @@
+name: emu collapse-taxonomy
+description: Collapse a TSV output file generated with emu at the desired taxonomic rank.
+url: https://github.com/treangenlab/emu
+authors:
+  - Curro Campuzano
+input:
+  - A TSV output file generated with emu.
+output:
+  - Another TSV output file collapsed at the desired taxonomic rank.
+params:
+  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'
+
diff --git a/bio/emu/collapse-taxonomy/test/Snakefile b/bio/emu/collapse-taxonomy/test/Snakefile
new file mode 100644
index 0000000000..1ff624cfc2
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/test/Snakefile
@@ -0,0 +1,11 @@
+rule collapse_taxonomy:
+    input:
+      "full_length_rel-abundance.tsv"
+    output:
+      "full_length_rel-abundance_collapsed.tsv"
+    log:
+        "logs/emu/full_length_collapsed.log"
+    params:
+      rank="genus"
+    wrapper:
+        "master/bio/emu/collapse-taxonomy"
\ No newline at end of file
diff --git a/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv b/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv
new file mode 100644
index 0000000000..fd065e577e
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/test/full_length_rel-abundance.tsv
@@ -0,0 +1,3 @@
+tax_id	abundance	superkingdom	phylum	class	order	family	genus	species	estimated counts
+1	1.0	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;	2.0
+unassigned	0.0								2.0
diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py
new file mode 100644
index 0000000000..5271802867
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/wrapper.py
@@ -0,0 +1,43 @@
+__author__ = "Curro Campuzano Jimenez"
+__copyright__ = "Copyright 2024, Curro Campuzano Jimenez"
+__email__ = "campuzanocurro@gmail.com"
+__license__ = "MIT"
+
+
+from snakemake.shell import shell
+import tempfile
+import os
+
+# Redirect both stdout and stderr of the emu call to the rule's log file.
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+
+# Exactly one input and one output file are supported. The length is checked
+# first so that a missing file gives a clear error instead of an IndexError.
+if len(snakemake.input) != 1 or not isinstance(snakemake.input[0], str):
+    raise ValueError(
+        "Input should be one TSV file generated with emu: "
+        + str(snakemake.input)
+        + "!"
+    )
+input_file = snakemake.input[0]
+
+if len(snakemake.output) != 1 or not isinstance(snakemake.output[0], str):
+    raise ValueError("Output should be one file: " + str(snakemake.output) + "!")
+output_file = snakemake.output[0]
+
+rank = snakemake.params.get("rank")
+if not rank:
+    raise ValueError("Please provide a rank parameter")
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    # Resolve the symbolic link and get the actual path of the input file
+    input_file_path = os.path.realpath(input_file)
+    # Run emu on a symlink placed in the temporary directory; the collapsed
+    # table is then expected next to that symlink, i.e. inside tmpdir.
+    symlink_path = os.path.join(tmpdir, os.path.basename(input_file_path))
+    os.symlink(input_file_path, symlink_path)
+    shell("emu collapse-taxonomy {symlink_path} {rank} {log}")
+    # Get the input file name without extension
+    name = os.path.splitext(os.path.basename(input_file_path))[0]
+    temp_out = os.path.join(tmpdir, f"{name}-{rank}.tsv")  # it is always a tsv
+    shell("mv {temp_out} {output_file}")
diff --git a/test.py b/test.py
index 5d4d748525..92f386b89a 100644
--- a/test.py
+++ b/test.py
@@ -5708,3 +5708,17 @@ def test_emu_abundance_paired():
             "-F",
         ],
     )
+
+@skip_if_not_modified
+def test_emu_collapse_taxonomy():
+    run(
+        "bio/emu/collapse-taxonomy",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "full_length_rel-abundance_collapsed.tsv",
+            "--use-conda",
+            "-F",
+        ],
+    )

From d1d01fa88ef17d67f9d79c93fe944b98acafde38 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 5 Apr 2024 16:16:45 +0200
Subject: [PATCH 03/26] Add minimap combine output

---
 bio/emu/combine-outputs/environment.yaml      |  6 ++
 bio/emu/combine-outputs/meta.yaml             | 14 +++++
 bio/emu/combine-outputs/test/Snakefile        | 32 +++++++++++
 .../test/sample1_rel-abundance.tsv            |  3 +
 .../test/sample2_rel-abundance.tsv            |  3 +
 .../test/sample_rel-abundance.tsv             |  3 +
 bio/emu/combine-outputs/wrapper.py            | 55 +++++++++++++++++++
 test.py                                       | 29 ++++++++++
 8 files changed, 145 insertions(+)
 create mode 100644 bio/emu/combine-outputs/environment.yaml
 create mode 100644 bio/emu/combine-outputs/meta.yaml
 create mode 100644 bio/emu/combine-outputs/test/Snakefile
 create mode 100644 bio/emu/combine-outputs/test/sample1_rel-abundance.tsv
 create mode 100644 bio/emu/combine-outputs/test/sample2_rel-abundance.tsv
 create mode 100644 bio/emu/combine-outputs/test/sample_rel-abundance.tsv
 create mode 100644 bio/emu/combine-outputs/wrapper.py

diff --git a/bio/emu/combine-outputs/environment.yaml b/bio/emu/combine-outputs/environment.yaml
new file mode 100644
index 0000000000..a117676039
--- /dev/null
+++ b/bio/emu/combine-outputs/environment.yaml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - emu=3.4.5
\ No newline at end of file
diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
new file mode 100644
index 0000000000..0a477d5a30
--- /dev/null
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -0,0 +1,14 @@
+name: emu combine-outputs
+description: Combine individual abundance TSV tables into a single TSV at the desired taxonomic rank.
+url: https://github.com/treangenlab/emu
+authors:
+  - Curro Campuzano
+input:
+  - A list of TSV files obtained with emu abundance. They should contain 'rel-abundance' in the filename. 
+output:
+  - A TSV output file collapsed at the desired taxonomic rank.
+params:
+  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'
+  counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'.
+note: The sample columns in the final table will be the finale without the extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). 
+
diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile
new file mode 100644
index 0000000000..129de59324
--- /dev/null
+++ b/bio/emu/combine-outputs/test/Snakefile
@@ -0,0 +1,32 @@
+rule combine_outputs:
+    input:
+      expand(
+        "{sample}_rel-abundance.tsv",
+        sample = ["sample1", "sample2"]
+      )
+    output:
+      "combined_abundances.tsv"
+    log:
+        "logs/emu/combined_abundances.log"
+    params:
+      rank="tax_id",
+      counts=False
+    wrapper:
+        "master/bio/emu/combine-outputs"
+
+rule combine_outputs_split:
+    input:
+      expand(
+        "{sample}_rel-abundance.tsv",
+        sample = ["sample1", "sample2"]
+      )
+    output:
+      abundance = "counts.tsv",
+      taxonomy = "taxonomy.tsv",
+    log:
+        "logs/emu/combined_split.log"
+    params:
+      rank="genus",
+      counts=True
+    wrapper:
+        "master/bio/emu/combine-outputs"
\ No newline at end of file
diff --git a/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv
new file mode 100644
index 0000000000..fd065e577e
--- /dev/null
+++ b/bio/emu/combine-outputs/test/sample1_rel-abundance.tsv
@@ -0,0 +1,3 @@
+tax_id	abundance	superkingdom	phylum	class	order	family	genus	species	estimated counts
+1	1.0	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;	2.0
+unassigned	0.0								2.0
diff --git a/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv
new file mode 100644
index 0000000000..61af5e4f0a
--- /dev/null
+++ b/bio/emu/combine-outputs/test/sample2_rel-abundance.tsv
@@ -0,0 +1,3 @@
+tax_id	abundance	superkingdom	phylum	class	order	family	genus	species
+1	1.0	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;
+unassigned	0.0							
diff --git a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv
new file mode 100644
index 0000000000..89e3ce5712
--- /dev/null
+++ b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv
@@ -0,0 +1,3 @@
+tax_id	abundance	superkingdom	phylum	class	order	family	genus	species	estimated counts
+1	1.0	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;	2.0
+unassigned	0.0								0.0
diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
new file mode 100644
index 0000000000..c5ee3fb170
--- /dev/null
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -0,0 +1,60 @@
+__author__ = "Curro Campuzano Jimenez"
+__copyright__ = "Copyright 2024, Curro Campuzano Jimenez"
+__email__ = "campuzanocurro@gmail.com"
+__license__ = "MIT"
+
+
+from snakemake.shell import shell
+import tempfile
+import os
+
+# Redirect both stdout and stderr of the emu call to the rule's log file.
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+extra = snakemake.params.get("extra", "")
+
+input_files = snakemake.input
+if not isinstance(input_files, list):
+    raise ValueError("Input should be a list of files: " + str(input_files) + "!")
+
+# Either two named outputs (split tables) or a single combined table.
+if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"):
+    split = True
+    taxonomy = snakemake.output.get("taxonomy")
+    abundances = snakemake.output.get("abundance")
+elif isinstance(snakemake.output[0], str):
+    split = False
+    table = snakemake.output[0]
+else:
+    raise ValueError(
+        "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)"
+    )
+
+rank = snakemake.params.get("rank")
+if not rank:
+    raise ValueError("Please provide a rank parameter")
+counts = snakemake.params.get("counts", False)
+
+# Optional flags, and the matching suffix of the file(s) moved afterwards.
+flags = " ".join(
+    flag
+    for flag, enabled in (("--split-tables", split), ("--counts", counts))
+    if enabled
+)
+suffix = "-counts" if counts else ""
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    for infile in input_files:
+        # Link every input into tmpdir as a '.tsv' whose name contains
+        # 'rel-abundance' (the naming this wrapper's meta.yaml describes).
+        # The link target is absolute: a relative target would be resolved
+        # against tmpdir and dangle.
+        stem = os.path.splitext(os.path.basename(infile))[0]
+        if "rel-abundance" not in stem:
+            stem += "_rel-abundance"
+        os.symlink(os.path.abspath(infile), os.path.join(tmpdir, stem + ".tsv"))
+    shell("emu combine-outputs {tmpdir} {rank} {flags} {extra} {log}")
+    if split:
+        shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}")
+        shell("mv {tmpdir}/emu-combined-abundance-{rank}{suffix}.tsv {abundances}")
+    else:
+        shell("mv {tmpdir}/emu-combined-{rank}{suffix}.tsv {table}")
diff --git a/test.py b/test.py
index 92f386b89a..b86239659a 100644
--- a/test.py
+++ b/test.py
@@ -5722,3 +5722,32 @@ def test_emu_collapse_taxonomy():
             "-F",
         ],
     )
+
+@skip_if_not_modified
+def test_emu_combine_output():
+    run(
+        "bio/emu/combine-outputs",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "combined_abundances.tsv",
+            "--use-conda",
+            "-F",
+        ],
+    )
+
+@skip_if_not_modified
+def test_emu_combine_output_split():
+    run(
+        "bio/emu/combine-outputs",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "counts.tsv",
+            "taxonomy.tsv",
+            "--use-conda",
+            "-F",
+        ],
+    )

From 8003ebbf293929cc8a7a841f27607762b0cadc06 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 5 Apr 2024 16:49:15 +0200
Subject: [PATCH 04/26] Update docs

---
 bio/emu/collapse-taxonomy/meta.yaml | 2 +-
 bio/emu/combine-outputs/meta.yaml   | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml
index ae21e22eef..781fe94605 100644
--- a/bio/emu/collapse-taxonomy/meta.yaml
+++ b/bio/emu/collapse-taxonomy/meta.yaml
@@ -6,7 +6,7 @@ authors:
 input:
   - A TSV output file generated with emu.
 output:
-  - Another TSV output file collapsed at the desired taxonomic rank.
+  - A TSV output file collapsed at the desired taxonomic rank.
 params:
   rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'
 
diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
index 0a477d5a30..d59358df91 100644
--- a/bio/emu/combine-outputs/meta.yaml
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -4,11 +4,12 @@ url: https://github.com/treangenlab/emu
 authors:
   - Curro Campuzano
 input:
-  - A list of TSV files obtained with emu abundance. They should contain 'rel-abundance' in the filename. 
+  - A list of TSV files obtained with emu abundance. 
 output:
-  - A TSV output file collapsed at the desired taxonomic rank.
+  - A TSV containing either both abundances and taxonomy or only the abundances.
+  - Optionally, a TSV containing the taxonomy (if splitting the previous file in two).
 params:
-  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'
+  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If no agglomeration is desired, use "tax_id". 
   counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'.
-note: The sample columns in the final table will be the finale without the extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). 
+note: The sample columns in the final table will be each filename without extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). 
 

From 6ba094f5d150b16cbe7b6c50b0b1d67259cd4ec7 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 5 Apr 2024 16:54:40 +0200
Subject: [PATCH 05/26] Lint files

---
 bio/emu/abundance/test/Snakefile         | 29 +++++++++++-----------
 bio/emu/collapse-taxonomy/test/Snakefile | 10 ++++----
 bio/emu/combine-outputs/test/Snakefile   | 31 ++++++++++--------------
 3 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile
index f35eb8b00f..7d6a77276d 100644
--- a/bio/emu/abundance/test/Snakefile
+++ b/bio/emu/abundance/test/Snakefile
@@ -1,29 +1,30 @@
 rule abundance:
     input:
-      reads = "{sample}.fa",
-      database_dir = "database"
+        reads="{sample}.fa",
+        database_dir="database",
     output:
-      abundances = "{sample}_rel-abundance.tsv",
-      alignments = "{sample}_emu_alignments.sam",
-      unclassified = "{sample}_unclassified.fa"
+        abundances="{sample}_rel-abundance.tsv",
+        alignments="{sample}_emu_alignments.sam",
+        unclassified="{sample}_unclassified.fa",
     log:
-        "logs/emu/{sample}_abundance.log"
+        "logs/emu/{sample}_abundance.log",
     params:
-      extra="--type map-ont --keep-counts"
-    threads: 3 # optional, defaults to 1
+        extra="--type map-ont --keep-counts",
+    threads: 3  # optional, defaults to 1
     wrapper:
         "master/bio/emu/abundance"
 
+
 rule abundance_paired:
     input:
-      reads =[ "{sample}_R1.fq", "{sample}_R2.fq" ],
-      database_dir = "database"
+        reads=["{sample}_R1.fq", "{sample}_R2.fq"],
+        database_dir="database",
     output:
-      abundances = "{sample}_rel-abundance_paired.tsv",
+        abundances="{sample}_rel-abundance_paired.tsv",
     log:
-        "logs/emu/{sample}_abundance_paired.log"
+        "logs/emu/{sample}_abundance_paired.log",
     params:
-      extra="--type sr --keep-counts"
-    threads: 3 # optional, defaults to 1
+        extra="--type sr --keep-counts",
+    threads: 3  # optional, defaults to 1
     wrapper:
         "master/bio/emu/abundance"
diff --git a/bio/emu/collapse-taxonomy/test/Snakefile b/bio/emu/collapse-taxonomy/test/Snakefile
index 1ff624cfc2..91ec4a9d20 100644
--- a/bio/emu/collapse-taxonomy/test/Snakefile
+++ b/bio/emu/collapse-taxonomy/test/Snakefile
@@ -1,11 +1,11 @@
 rule collapse_taxonomy:
     input:
-      "full_length_rel-abundance.tsv"
+        "full_length_rel-abundance.tsv",
     output:
-      "full_length_rel-abundance_collapsed.tsv"
+        "full_length_rel-abundance_collapsed.tsv",
     log:
-        "logs/emu/full_length_collapsed.log"
+        "logs/emu/full_length_collapsed.log",
     params:
-      rank="genus"
+        rank="genus",
     wrapper:
-        "master/bio/emu/collapse-taxonomy"
\ No newline at end of file
+        "master/bio/emu/collapse-taxonomy"
diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile
index 129de59324..e8e28e8626 100644
--- a/bio/emu/combine-outputs/test/Snakefile
+++ b/bio/emu/combine-outputs/test/Snakefile
@@ -1,32 +1,27 @@
 rule combine_outputs:
     input:
-      expand(
-        "{sample}_rel-abundance.tsv",
-        sample = ["sample1", "sample2"]
-      )
+        expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]),
     output:
-      "combined_abundances.tsv"
+        "combined_abundances.tsv",
     log:
-        "logs/emu/combined_abundances.log"
+        "logs/emu/combined_abundances.log",
     params:
-      rank="tax_id",
-      counts=False
+        rank="tax_id",
+        counts=False,
     wrapper:
         "master/bio/emu/combine-outputs"
 
+
 rule combine_outputs_split:
     input:
-      expand(
-        "{sample}_rel-abundance.tsv",
-        sample = ["sample1", "sample2"]
-      )
+        expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]),
     output:
-      abundance = "counts.tsv",
-      taxonomy = "taxonomy.tsv",
+        abundance="counts.tsv",
+        taxonomy="taxonomy.tsv",
     log:
-        "logs/emu/combined_split.log"
+        "logs/emu/combined_split.log",
     params:
-      rank="genus",
-      counts=True
+        rank="genus",
+        counts=True,
     wrapper:
-        "master/bio/emu/combine-outputs"
\ No newline at end of file
+        "master/bio/emu/combine-outputs"

From 6d9228621246b8f9b7159564e4c237cbc416a08e Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 10:10:26 +0200
Subject: [PATCH 06/26] Update bio/emu/collapse-taxonomy/wrapper.py

Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com>
---
 bio/emu/collapse-taxonomy/wrapper.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py
index 5271802867..8539621c39 100644
--- a/bio/emu/collapse-taxonomy/wrapper.py
+++ b/bio/emu/collapse-taxonomy/wrapper.py
@@ -20,9 +20,7 @@
 if not isinstance(output_file, str) and len(snakemake.output) != 1:
     raise ValueError("Output should be one file: " + str(output_file) + "!")
 
-if not snakemake.params.get("rank"):
-    raise ValueError("Please provide a rank parameter")
-rank = snakemake.params.get("rank")
+rank = snakemake.params.get("rank", "species")
 
 with tempfile.TemporaryDirectory() as tmpdir:
     # Resolve the symbolic link and get the actual path of the input file

From 8a6e8c133318820d213679006b23d371dd682523 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 10:12:12 +0200
Subject: [PATCH 07/26] Document default behavior

---
 bio/emu/collapse-taxonomy/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bio/emu/collapse-taxonomy/meta.yaml b/bio/emu/collapse-taxonomy/meta.yaml
index 781fe94605..3f56fe88fe 100644
--- a/bio/emu/collapse-taxonomy/meta.yaml
+++ b/bio/emu/collapse-taxonomy/meta.yaml
@@ -8,5 +8,5 @@ input:
 output:
   - A TSV output file collapsed at the desired taxonomic rank.
 params:
-  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'
+  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If omitted, agglomeration will be done at the species level. 
 

From 3b4fd628a99f58e94708b42949786d3a05f91376 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 10:24:49 +0200
Subject: [PATCH 08/26] Remove unnecessary assertions

---
 bio/emu/collapse-taxonomy/wrapper.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/bio/emu/collapse-taxonomy/wrapper.py b/bio/emu/collapse-taxonomy/wrapper.py
index 8539621c39..b155bd4b8a 100644
--- a/bio/emu/collapse-taxonomy/wrapper.py
+++ b/bio/emu/collapse-taxonomy/wrapper.py
@@ -11,15 +11,7 @@
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 
 input_file = snakemake.input[0]
-if not isinstance(input_file, str) and len(snakemake.input) != 1:
-    raise ValueError(
-        "Input should be one TSV file generated with emu: " + str(input_file) + "!"
-    )
-
 output_file = snakemake.output[0]
-if not isinstance(output_file, str) and len(snakemake.output) != 1:
-    raise ValueError("Output should be one file: " + str(output_file) + "!")
-
 rank = snakemake.params.get("rank", "species")
 
 with tempfile.TemporaryDirectory() as tmpdir:

From 3ff00909c060041ab424f03822224075a53178f7 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 11:53:02 +0200
Subject: [PATCH 09/26] Modify params combine-outputs

---
 bio/emu/combine-outputs/meta.yaml      |  4 ++--
 bio/emu/combine-outputs/test/Snakefile |  5 +----
 bio/emu/combine-outputs/wrapper.py     | 16 ++++++++--------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
index d59358df91..228815066b 100644
--- a/bio/emu/combine-outputs/meta.yaml
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -9,7 +9,7 @@ output:
   - A TSV containing either both abundances and taxonomy or only the abundances.
   - Optionally, a TSV containing the taxonomy (if splitting the previous file in two).
 params:
-  rank: Accepted ranks are 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If no agglomeration is desired, use "tax_id". 
-  counts: An optional boolean. If true, counts, rather than relative abundances are produced. It will fail if input files do not contain the column 'estimated counts'.
+  rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If omitted, no agglomeration will be done (that is, the default is 'tax_id'). 
+  extra: Extra arguments (such as '--counts'). 
 note: The sample columns in the final table will be each filename without extension. If file ends with "_rel-abundance.tsv", the word '_rel-abundance' will be removed to (for consistency with the program). 
 
diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile
index e8e28e8626..d446c71419 100644
--- a/bio/emu/combine-outputs/test/Snakefile
+++ b/bio/emu/combine-outputs/test/Snakefile
@@ -5,9 +5,6 @@ rule combine_outputs:
         "combined_abundances.tsv",
     log:
         "logs/emu/combined_abundances.log",
-    params:
-        rank="tax_id",
-        counts=False,
     wrapper:
         "master/bio/emu/combine-outputs"
 
@@ -22,6 +19,6 @@ rule combine_outputs_split:
         "logs/emu/combined_split.log",
     params:
         rank="genus",
-        counts=True,
+        extra="--counts",
     wrapper:
         "master/bio/emu/combine-outputs"
diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index c5ee3fb170..2a4e7f75c2 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -17,6 +17,7 @@
 
 if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"):
     split = True
+    extra += " --split-tables"
     taxonomy = snakemake.output.get("taxonomy")
     abundances = snakemake.output.get("abundance")
 elif isinstance(snakemake.output[0], str):
@@ -27,10 +28,12 @@
         "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)"
     )
 
-if not snakemake.params.get("rank"):
-    raise ValueError("Please provide a rank parameter")
-rank = snakemake.params.get("rank")
-counts = snakemake.params.get("counts", False)
+if "--split-tables" in extra and not split:
+    raise ValueError("You cannot use --split-tables and produce a single output.")
+
+rank = snakemake.params.get("rank", "tax_id")
+counts = "--counts" in extra
+
 
 with tempfile.TemporaryDirectory() as tmpdir:
     for infile in input_files:
@@ -39,17 +42,14 @@
         if not temp.endswith("rel_abundances.tsv"):
             temp = os.path.splitext(infile)[0] + "-rel_abundances.tsv"
         os.symlink(infile, temp)
+    shell("emu combine-outputs {tmpdir} {rank} {extra} {log}")
     if split and counts:
-        shell("emu combine-outputs {tmpdir} {rank} --split-tables --counts {log}")
         shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}")
         shell("mv {tmpdir}/emu-combined-abundance-{rank}-counts.tsv {abundances}")
     elif split and not counts:
-        shell("emu combine-outputs {tmpdir} {rank} --split-tables {log}")
         shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}")
         shell("mv {tmpdir}/emu-combined-abundance-{rank}.tsv {abundances}")
     elif not split and counts:
-        shell("emu combine-outputs {tmpdir} {rank} --counts {log}")
         shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {table}")
     elif not split and not counts:
-        shell("emu combine-outputs {tmpdir} {rank} {extra} {log}")
         shell("mv {tmpdir}/emu-combined-{rank}.tsv {table}")

From 608fdb62e3c00f096fa41544cb749f82438df6e5 Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Mon, 8 Apr 2024 15:15:36 +0200
Subject: [PATCH 10/26] Add names to input and output

---
 bio/emu/abundance/meta.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/bio/emu/abundance/meta.yaml b/bio/emu/abundance/meta.yaml
index 5ddec3ad0c..9106bd2c70 100644
--- a/bio/emu/abundance/meta.yaml
+++ b/bio/emu/abundance/meta.yaml
@@ -4,11 +4,11 @@ url: https://github.com/treangenlab/emu
 authors:
   - Curro Campuzano
 input:
-  - Nucleotide sequence file(s) (either a single ONT or Pac-Bio fasta file, a single fastq file or paired fastq files)
-  - Optional. A emu database (i.e. a directory that contains at least the files "taxonomy.tsv" and "species_taxid.fasta", check documentation for pre-built databases and how to build them).
+  - reads: single fastq file or paired fastq files
+  - db: emu database (optional; check documentation for pre-built databases and how to build them).
 output:
-  - A TSV with relative (and optionally, absolute abundances).
-  - An optional SAM file with the alignments.
-  - An optional FASTA file with unclassified sequences.
+  - abundances: TSV with relative (and optionally, absolute abundances).
+  - alignments: SAM file with the alignments (optional).
+  - unclassified: FASTA file with unclassified sequences (optional).
 params:
   extra: Any optimal parameter such as --type (sequencer) or --min-abundance. Optional flags involving output are handled automatically (e.g. --output-dir, --output-basename ...)

From b803b8aa44d4902725b1ed6ed5b4a7dd6d9d4129 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 15:27:02 +0200
Subject: [PATCH 11/26] Improve abundance emu

Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com>
---
 bio/emu/abundance/wrapper.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
index f3f8e7ee33..af1e0c80ce 100644
--- a/bio/emu/abundance/wrapper.py
+++ b/bio/emu/abundance/wrapper.py
@@ -10,29 +10,16 @@
 
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
-threads = snakemake.threads or 1
-
-# Check input (mandatory)
-msg_error = "Please provide either one file of single-end 16S reads or two files of short paired-end 16S"
-if not snakemake.input.get("reads"):
-    raise ValueError(msg_error)
-reads = snakemake.input.get("reads")
-if isinstance(reads, list) and len(reads) > 2:
-    raise ValueError(msg_error)
-
 # Check database (optional)
-database_cmd = ""
-if database := snakemake.input.get("database_dir"):
-    if not os.path.isdir(database):
-        raise ValueError("Please provide a valid Emu database directory")
-    database_cmd = f"--db {database}"
+if db:= snakemake.input.get("db"):
+    db = f"--db {db}"
 
 with tempfile.TemporaryDirectory() as tmpdir:
     shell(
-        "emu abundance {reads} {database_cmd}"
+        "emu abundance {snakemake.input.reads} {db}"
         " --keep-files --output-dir {tmpdir}"
         " --output-basename output --output-unclassified"
-        " --threads	{threads}"
+        " --threads {snakemake.threads}"
         " {extra}"
         " {log}"
     )

From eb2466d3d5df7c3e91874d57ce27acffa88006c7 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Mon, 8 Apr 2024 15:44:20 +0200
Subject: [PATCH 12/26] Fix typo snakefile

---
 bio/emu/abundance/test/Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bio/emu/abundance/test/Snakefile b/bio/emu/abundance/test/Snakefile
index 7d6a77276d..466de94623 100644
--- a/bio/emu/abundance/test/Snakefile
+++ b/bio/emu/abundance/test/Snakefile
@@ -1,7 +1,7 @@
 rule abundance:
     input:
         reads="{sample}.fa",
-        database_dir="database",
+        db="database",
     output:
         abundances="{sample}_rel-abundance.tsv",
         alignments="{sample}_emu_alignments.sam",
@@ -18,7 +18,7 @@ rule abundance:
 rule abundance_paired:
     input:
         reads=["{sample}_R1.fq", "{sample}_R2.fq"],
-        database_dir="database",
+        db="database",
     output:
         abundances="{sample}_rel-abundance_paired.tsv",
     log:

From cab7906815cc4b5df55ad365f7750e3571344cb0 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Mon, 8 Apr 2024 18:15:12 +0200
Subject: [PATCH 13/26] Linter

---
 bio/emu/abundance/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
index af1e0c80ce..998135990d 100644
--- a/bio/emu/abundance/wrapper.py
+++ b/bio/emu/abundance/wrapper.py
@@ -11,7 +11,7 @@
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
 # Check database (optional)
-if db:= snakemake.input.get("db"):
+if db := snakemake.input.get("db"):
     db = f"--db {db}"
 
 with tempfile.TemporaryDirectory() as tmpdir:

From 16d424a63ef74a4ded8e32f10214df1e11a82871 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Mon, 8 Apr 2024 18:16:56 +0200
Subject: [PATCH 14/26] Add pin-conda

---
 .../abundance/environment.linux-64.pin.txt    | 62 +++++++++++++++++++
 .../environment.linux-64.pin.txt              | 62 +++++++++++++++++++
 .../environment.linux-64.pin.txt              | 62 +++++++++++++++++++
 3 files changed, 186 insertions(+)
 create mode 100644 bio/emu/abundance/environment.linux-64.pin.txt
 create mode 100644 bio/emu/collapse-taxonomy/environment.linux-64.pin.txt
 create mode 100644 bio/emu/combine-outputs/environment.linux-64.pin.txt

diff --git a/bio/emu/abundance/environment.linux-64.pin.txt b/bio/emu/abundance/environment.linux-64.pin.txt
new file mode 100644
index 0000000000..a22b63bcaf
--- /dev/null
+++ b/bio/emu/abundance/environment.linux-64.pin.txt
@@ -0,0 +1,62 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a
+https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4
+https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f
+https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3
+https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8
+https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace
+https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b
+https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a
+https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0
+https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9
+https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc
+https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209
+https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589
+https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a
+https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2
+https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155
+https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8
+https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5
+https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d
+https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d
+https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad
+https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc
+https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2
+https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae
+https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a
+https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04
+https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838
+https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a
+https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67
+https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7
+https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c
+https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b
+https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be
+https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c
+https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40
+https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3
diff --git a/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt b/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt
new file mode 100644
index 0000000000..a22b63bcaf
--- /dev/null
+++ b/bio/emu/collapse-taxonomy/environment.linux-64.pin.txt
@@ -0,0 +1,62 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a
+https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4
+https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f
+https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3
+https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8
+https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace
+https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b
+https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a
+https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0
+https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9
+https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc
+https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209
+https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589
+https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a
+https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2
+https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155
+https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8
+https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5
+https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d
+https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d
+https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad
+https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc
+https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2
+https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae
+https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a
+https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04
+https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838
+https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a
+https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67
+https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7
+https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c
+https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b
+https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be
+https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c
+https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40
+https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3
diff --git a/bio/emu/combine-outputs/environment.linux-64.pin.txt b/bio/emu/combine-outputs/environment.linux-64.pin.txt
new file mode 100644
index 0000000000..a22b63bcaf
--- /dev/null
+++ b/bio/emu/combine-outputs/environment.linux-64.pin.txt
@@ -0,0 +1,62 @@
+# This file may be used to create an environment using:
+# $ conda create --name <env> --file <this file>
+# platform: linux-64
+@EXPLICIT
+https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81
+https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.2.2-hbcca054_0.conda#2f4327a1cbe7f022401b236e915a5fef
+https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.40-h41732ed_0.conda#7aca3059a1729aa76c597603f10b0dd3
+https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-13.2.0-h7e041cc_5.conda#f6f6600d18a4047b54f803cf708b868a
+https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.10-4_cp310.conda#26322ec5d7712c3ded99dd656142b8ce
+https://conda.anaconda.org/conda-forge/noarch/tzdata-2024a-h0c530f3_0.conda#161081fc7cec0bfda0d86d7cb595f8d8
+https://conda.anaconda.org/conda-forge/linux-64/libgomp-13.2.0-h807b86a_5.conda#d211c42b9ce49aee3734fdc828731689
+https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d
+https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-13.2.0-h807b86a_5.conda#d4ff227c46917d3b4565302a2bbb276b
+https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-hd590300_5.conda#69b8b6202a07720f448be700e300ccf4
+https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.28.1-hd590300_0.conda#dcde58ff9a1f30b0037a2315d1846d1f
+https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3
+https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.18-h0b41bf4_0.conda#6aa9c9de5542ecb07fdda9ca626252d8
+https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055
+https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran5-13.2.0-ha4646dd_5.conda#7a6bd7a12a4bd359e2afe6c0fa1acace
+https://conda.anaconda.org/conda-forge/linux-64/libnsl-2.0.1-hd590300_0.conda#30fd6e37fe21f86f4bd26d6ee73eeec7
+https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b
+https://conda.anaconda.org/conda-forge/linux-64/libxcrypt-4.4.36-hd590300_1.conda#5aa797f8787fe7a17d1b0821485b5adc
+https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.2.13-hd590300_5.conda#f36c115f1ee199da648e0597ec2047ad
+https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.4.20240210-h59595ed_0.conda#97da8860a0da5413c7c98a3b3838a645
+https://conda.anaconda.org/conda-forge/linux-64/openssl-3.2.1-hd590300_1.conda#9d731343cff6ee2e5a25c4a091bf8e2a
+https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0
+https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1
+https://conda.anaconda.org/conda-forge/linux-64/libgfortran-ng-13.2.0-h69a702a_5.conda#e73e9cfd1191783392131e6238bdb3e9
+https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.58.0-h47da74e_1.conda#700ac6ea6d53d5510591c4344d5c989a
+https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.45.2-h2797004_0.conda#866983a220e27a80cb75e85cb30466a1
+https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe
+https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4
+https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc
+https://conda.anaconda.org/conda-forge/linux-64/zlib-1.2.13-hd590300_5.conda#68c34ec6149623be41a1933ab996a209
+https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.5-hfc55251_0.conda#04b88013080254850d6c01ed54810589
+https://conda.anaconda.org/bioconda/linux-64/bioawk-1.0-he4a0461_10.tar.bz2#3f4ea155f59ae781753ea76571e8564a
+https://conda.anaconda.org/bioconda/linux-64/k8-0.2.5-hdcf5f25_4.tar.bz2#d3c49a96ae45864706037702775ca7c2
+https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.2-h659d440_0.conda#cd95826dbd331ed1be26bdf401432844
+https://conda.anaconda.org/conda-forge/linux-64/libopenblas-0.3.27-pthreads_h413a1c8_0.conda#a356024784da6dfd4683dc5ecf45b155
+https://conda.anaconda.org/conda-forge/linux-64/python-3.10.14-hd12c33a_0_cpython.conda#2b4ba962994e8bd4be9ff5b64b75aff2
+https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-22_linux64_openblas.conda#1a2a0cd3153464fee6646f3dd6dad9b8
+https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.7.1-hca28451_0.conda#755c7f876815003337d2c61ff5d047e5
+https://conda.anaconda.org/bioconda/linux-64/minimap2-2.28-he4a0461_0.tar.bz2#27d83cfe6bca3eb50aaeb6334371122d
+https://conda.anaconda.org/conda-forge/noarch/python-tzdata-2024.1-pyhd8ed1ab_0.conda#98206ea9954216ee7540f0c773f2104d
+https://conda.anaconda.org/conda-forge/noarch/pytz-2024.1-pyhd8ed1ab_0.conda#3eeeeb9e4827ace8c0c1419c85d590ad
+https://conda.anaconda.org/conda-forge/noarch/setuptools-69.2.0-pyhd8ed1ab_0.conda#da214ecd521a720a9d521c68047682dc
+https://conda.anaconda.org/conda-forge/noarch/six-1.16.0-pyh6c4a22f_0.tar.bz2#e5f25f8dbc060e9a8d912e432202afc2
+https://conda.anaconda.org/conda-forge/noarch/wheel-0.43.0-pyhd8ed1ab_1.conda#0b5293a157c2b5cd513dd1b03d8d3aae
+https://conda.anaconda.org/conda-forge/noarch/zipp-3.17.0-pyhd8ed1ab_0.conda#2e4d6bc0b14e10f895fc6791a7d9b26a
+https://conda.anaconda.org/conda-forge/noarch/importlib-metadata-7.1.0-pyha770c72_0.conda#0896606848b2dc5cebdf111b6543aa04
+https://conda.anaconda.org/conda-forge/linux-64/libcblas-3.9.0-22_linux64_openblas.conda#4b31699e0ec5de64d5896e580389c9a1
+https://conda.anaconda.org/conda-forge/linux-64/liblapack-3.9.0-22_linux64_openblas.conda#b083767b6c877e24ee597d93b87ab838
+https://conda.anaconda.org/conda-forge/linux-64/pathlib2-2.3.7.post1-py310hff52083_3.conda#62d26790749f62b9329425c901d93c3a
+https://conda.anaconda.org/conda-forge/noarch/pip-24.0-pyhd8ed1ab_0.conda#f586ac1e56c8638b64f9c8122a7b8a67
+https://conda.anaconda.org/bioconda/linux-64/pysam-0.22.0-py310h41dec4a_1.tar.bz2#19fdb9301a6debbb7fe9836670e3feb7
+https://conda.anaconda.org/conda-forge/noarch/python-dateutil-2.9.0-pyhd8ed1ab_0.conda#2cf4264fffb9e6eff6031c5b6884d61c
+https://conda.anaconda.org/conda-forge/noarch/flatten-dict-0.4.2-pyhd8ed1ab_1.tar.bz2#ccfb30b92adfeb283d4dcae3d0b6441b
+https://conda.anaconda.org/conda-forge/linux-64/numpy-1.26.4-py310hb13e2d6_0.conda#6593de64c935768b6bad3e19b3e978be
+https://conda.anaconda.org/conda-forge/linux-64/biopython-1.83-py310h2372a71_0.conda#0128595946cebfaaf212cc45d4b9cd3c
+https://conda.anaconda.org/conda-forge/linux-64/pandas-2.2.1-py310hcc13569_0.conda#cf5d315e3601a6a2931f63aa9a84dc40
+https://conda.anaconda.org/bioconda/noarch/emu-3.4.5-hdfd78af_0.tar.bz2#34b067c6f82c3796a40e1b0ecaf094d3

From 36be3377a911ec819fcac80b5bf26cd1d6f75e30 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Thu, 11 Apr 2024 14:31:42 +0200
Subject: [PATCH 15/26] Both outputs named

---
 bio/emu/combine-outputs/meta.yaml      | 5 +++--
 bio/emu/combine-outputs/test/Snakefile | 4 ++--
 bio/emu/combine-outputs/wrapper.py     | 7 +++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
index 228815066b..45903b1ca2 100644
--- a/bio/emu/combine-outputs/meta.yaml
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -6,8 +6,9 @@ authors:
 input:
   - A list of TSV files obtained with emu abundance. 
 output:
-  - A TSV containing either both abundances and taxonomy or only the abundances.
-  - Optionally, a TSV containing the taxonomy (if splitting the previous file in two).
+  - Abundances. A TSV containing either the abundance of different taxa. 
+  - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. 
+  both abundances and taxonomy or only the abundances.
 params:
   rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). 
   extra: Extra arguments (such as '--counts'). 
diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile
index d446c71419..1b74cb8b0e 100644
--- a/bio/emu/combine-outputs/test/Snakefile
+++ b/bio/emu/combine-outputs/test/Snakefile
@@ -2,7 +2,7 @@ rule combine_outputs:
     input:
         expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]),
     output:
-        "combined_abundances.tsv",
+        abundances="combined_abundances.tsv",
     log:
         "logs/emu/combined_abundances.log",
     wrapper:
@@ -13,7 +13,7 @@ rule combine_outputs_split:
     input:
         expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]),
     output:
-        abundance="counts.tsv",
+        abundances="counts.tsv",
         taxonomy="taxonomy.tsv",
     log:
         "logs/emu/combined_split.log",
diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index 2a4e7f75c2..7d7768ebb6 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -15,14 +15,13 @@
 if not isinstance(input_files, list):
     raise ValueError("Input should be a list of files: " + str(input_files) + "!")
 
-if snakemake.output.get("abundance") and snakemake.output.get("taxonomy"):
+if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"):
     split = True
     extra += " --split-tables"
     taxonomy = snakemake.output.get("taxonomy")
-    abundances = snakemake.output.get("abundance")
-elif isinstance(snakemake.output[0], str):
+    abundances = snakemake.output.get("abundances")
+elif table := snakemake.output.get("abundances"):
     split = False
-    table = snakemake.output[0]
 else:
     raise ValueError(
         "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)"

From f1c7edc0d19c36d1bf3e062348f7ca4d8e238a38 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Thu, 11 Apr 2024 14:40:25 +0200
Subject: [PATCH 16/26] Fix broken yaml

---
 bio/emu/combine-outputs/meta.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
index 45903b1ca2..7808070eef 100644
--- a/bio/emu/combine-outputs/meta.yaml
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -8,7 +8,6 @@ input:
 output:
   - Abundances. A TSV containing either the abundance of different taxa. 
   - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. 
-  both abundances and taxonomy or only the abundances.
 params:
   rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). 
   extra: Extra arguments (such as '--counts'). 

From 893edf6df086ba5a7319905fe5def4b3c100978c Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Fri, 12 Apr 2024 13:46:12 +0200
Subject: [PATCH 17/26] Update bio/emu/combine-outputs/meta.yaml

Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com>
---
 bio/emu/combine-outputs/meta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bio/emu/combine-outputs/meta.yaml b/bio/emu/combine-outputs/meta.yaml
index 7808070eef..3ab6e95d92 100644
--- a/bio/emu/combine-outputs/meta.yaml
+++ b/bio/emu/combine-outputs/meta.yaml
@@ -6,8 +6,8 @@ authors:
 input:
   - A list of TSV files obtained with emu abundance. 
 output:
-  - Abundances. A TSV containing either the abundance of different taxa. 
-  - Taxonomy. If specified, a separate TSV containing the taxonomy. Otherwise, taxonomy will be included in the abundance table. 
+  - abundances: TSV file containing the abundance of different taxa.
+  - taxonomy: TSV file containing the taxonomy (optional; otherwise, taxonomy will be included in the abundance table).
 params:
   rank: Accepted ranks are 'tax_id', 'species', 'genus', 'family', 'order', 'class', 'phylum' and 'superkingdom'. If ommited, no agglomeration will be done (that is, the default is 'tax_id'). 
   extra: Extra arguments (such as '--counts'). 

From 16b79d7980d61206ea65a368898a2a806cd436a7 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 12 Apr 2024 13:52:50 +0200
Subject: [PATCH 18/26] Add test case

---
 bio/emu/combine-outputs/test/Snakefile                 | 2 +-
 bio/emu/combine-outputs/test/sample1_rel-abundance.txt | 1 +
 bio/emu/combine-outputs/test/sample2_rel-abundance.txt | 1 +
 bio/emu/combine-outputs/test/sample_rel-abundance.tsv  | 3 ---
 4 files changed, 3 insertions(+), 4 deletions(-)
 create mode 120000 bio/emu/combine-outputs/test/sample1_rel-abundance.txt
 create mode 120000 bio/emu/combine-outputs/test/sample2_rel-abundance.txt
 delete mode 100644 bio/emu/combine-outputs/test/sample_rel-abundance.tsv

diff --git a/bio/emu/combine-outputs/test/Snakefile b/bio/emu/combine-outputs/test/Snakefile
index 1b74cb8b0e..65973721a0 100644
--- a/bio/emu/combine-outputs/test/Snakefile
+++ b/bio/emu/combine-outputs/test/Snakefile
@@ -11,7 +11,7 @@ rule combine_outputs:
 
 rule combine_outputs_split:
     input:
-        expand("{sample}_rel-abundance.tsv", sample=["sample1", "sample2"]),
+        expand("{sample}_rel-abundance.txt", sample=["sample1", "sample2"]),
     output:
         abundances="counts.tsv",
         taxonomy="taxonomy.tsv",
diff --git a/bio/emu/combine-outputs/test/sample1_rel-abundance.txt b/bio/emu/combine-outputs/test/sample1_rel-abundance.txt
new file mode 120000
index 0000000000..6fb3595c9e
--- /dev/null
+++ b/bio/emu/combine-outputs/test/sample1_rel-abundance.txt
@@ -0,0 +1 @@
+sample1_rel-abundance.tsv
\ No newline at end of file
diff --git a/bio/emu/combine-outputs/test/sample2_rel-abundance.txt b/bio/emu/combine-outputs/test/sample2_rel-abundance.txt
new file mode 120000
index 0000000000..c74fd5a73f
--- /dev/null
+++ b/bio/emu/combine-outputs/test/sample2_rel-abundance.txt
@@ -0,0 +1 @@
+sample2_rel-abundance.tsv
\ No newline at end of file
diff --git a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv b/bio/emu/combine-outputs/test/sample_rel-abundance.tsv
deleted file mode 100644
index 89e3ce5712..0000000000
--- a/bio/emu/combine-outputs/test/sample_rel-abundance.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-tax_id	abundance	superkingdom	phylum	class	order	family	genus	species	estimated counts
-1	1.0	Bacteria	Proteobacteria	Gammaproteobacteria	Pseudomonadales	Pseudomonadaceae	Pseudomonas	amygdali;	2.0
-unassigned	0.0								0.0

From ac4157754abc3932208fa75adbfd0c171906fab5 Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:34:12 +0200
Subject: [PATCH 19/26] Code reformat

---
 bio/emu/combine-outputs/wrapper.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index 7d7768ebb6..aa3a477bd1 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -11,10 +11,6 @@
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
 
-input_files = snakemake.input
-if not isinstance(input_files, list):
-    raise ValueError("Input should be a list of files: " + str(input_files) + "!")
-
 if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"):
     split = True
     extra += " --split-tables"

From 73f5a1f4b7ca75aeb01a7a75ee099ab8433085fb Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:34:33 +0200
Subject: [PATCH 20/26] Code update

---
 bio/emu/combine-outputs/wrapper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index aa3a477bd1..fb18ec92db 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -31,7 +31,7 @@
 
 
 with tempfile.TemporaryDirectory() as tmpdir:
-    for infile in input_files:
+    for infile in snakemake.input:
         # Files has to end in tsv, and contain rel_abundances
         temp = os.path.join(tmpdir, os.path.basename(infile))
         if not temp.endswith("rel_abundances.tsv"):

From 34974a9cfb9a1a7a798f5ada399183642c58e4de Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:42:37 +0200
Subject: [PATCH 21/26] Code reformat

---
 bio/emu/combine-outputs/wrapper.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index fb18ec92db..a053c0e95c 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -11,17 +11,13 @@
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
 
-if snakemake.output.get("abundances") and snakemake.output.get("taxonomy"):
+taxonomy = snakemake.output.get("taxonomy", "")
+abundances = snakemake.output.get("abundances", "")
+if taxonomy and abundances:
     split = True
     extra += " --split-tables"
-    taxonomy = snakemake.output.get("taxonomy")
-    abundances = snakemake.output.get("abundances")
-elif table := snakemake.output.get("abundances"):
+elise:
     split = False
-else:
-    raise ValueError(
-        "Please provide either one TSV file, or two named TSV files (abundances and taxonomy)"
-    )
 
 if "--split-tables" in extra and not split:
     raise ValueError("You cannot use --split-tables and produce a single output.")

From 1eeb8419cebe179df5aed78cd258100b216cc5bb Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:42:46 +0200
Subject: [PATCH 22/26] Code reformat

---
 bio/emu/combine-outputs/wrapper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index a053c0e95c..d3d56a2c9b 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -41,6 +41,6 @@
         shell("mv {tmpdir}/emu-combined-taxonomy-{rank}.tsv {taxonomy}")
         shell("mv {tmpdir}/emu-combined-abundance-{rank}.tsv {abundances}")
     elif not split and counts:
-        shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {table}")
+        shell("mv {tmpdir}/emu-combined-{rank}-counts.tsv {abundances}")
     elif not split and not counts:
-        shell("mv {tmpdir}/emu-combined-{rank}.tsv {table}")
+        shell("mv {tmpdir}/emu-combined-{rank}.tsv {abundances}")

From ea0940d830eb25a9b8a71fb2dd393cdda055b5be Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:43:13 +0200
Subject: [PATCH 23/26] Code cleanup

---
 bio/emu/combine-outputs/wrapper.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index d3d56a2c9b..f9426bae9d 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -19,9 +19,6 @@
 elise:
     split = False
 
-if "--split-tables" in extra and not split:
-    raise ValueError("You cannot use --split-tables and produce a single output.")
-
 rank = snakemake.params.get("rank", "tax_id")
 counts = "--counts" in extra
 

From f5acf5146fd7a9ed11c3bf42a3bd3a3b75f5b530 Mon Sep 17 00:00:00 2001
From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:43:44 +0200
Subject: [PATCH 24/26] Code cleanup

---
 bio/emu/abundance/wrapper.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
index 998135990d..98f7c532b6 100644
--- a/bio/emu/abundance/wrapper.py
+++ b/bio/emu/abundance/wrapper.py
@@ -10,7 +10,6 @@
 
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
-# Check database (optional)
 if db := snakemake.input.get("db"):
     db = f"--db {db}"
 

From 04c9dffb179302a836fc806360299d6a90040904 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <69399781+currocam@users.noreply.github.com>
Date: Fri, 12 Apr 2024 15:54:40 +0200
Subject: [PATCH 25/26] Update bio/emu/abundance/wrapper.py

Co-authored-by: Filipe G. Vieira <1151762+fgvieira@users.noreply.github.com>
---
 bio/emu/abundance/wrapper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
index 98f7c532b6..d84af9247d 100644
--- a/bio/emu/abundance/wrapper.py
+++ b/bio/emu/abundance/wrapper.py
@@ -10,7 +10,8 @@
 
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
-if db := snakemake.input.get("db"):
+db = snakemake.input.get("db", ""):
+if db:
     db = f"--db {db}"
 
 with tempfile.TemporaryDirectory() as tmpdir:

From d7c5b6c9c2c2bdfb4bce94d9d220dfeb719b43e4 Mon Sep 17 00:00:00 2001
From: Curro Campuzano <campuzanocurro@gmail.com>
Date: Fri, 12 Apr 2024 15:58:34 +0200
Subject: [PATCH 26/26] Fix typos

---
 bio/emu/abundance/wrapper.py       | 2 +-
 bio/emu/combine-outputs/wrapper.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bio/emu/abundance/wrapper.py b/bio/emu/abundance/wrapper.py
index d84af9247d..28a571f814 100644
--- a/bio/emu/abundance/wrapper.py
+++ b/bio/emu/abundance/wrapper.py
@@ -10,7 +10,7 @@
 
 log = snakemake.log_fmt_shell(stdout=True, stderr=True)
 extra = snakemake.params.get("extra", "")
-db = snakemake.input.get("db", ""):
+db = snakemake.input.get("db", "")
 if db:
     db = f"--db {db}"
 
diff --git a/bio/emu/combine-outputs/wrapper.py b/bio/emu/combine-outputs/wrapper.py
index f9426bae9d..e1d62ea787 100644
--- a/bio/emu/combine-outputs/wrapper.py
+++ b/bio/emu/combine-outputs/wrapper.py
@@ -16,7 +16,7 @@
 if taxonomy and abundances:
     split = True
     extra += " --split-tables"
-elise:
+else:
     split = False
 
 rank = snakemake.params.get("rank", "tax_id")