Merge pull request #63 from neherlab/fix/issue-62

Fix/issue 62
neherlab · Nov 21, 2023 · 54f7f90 · 54f7f90
2 parents b11ef76 + 3ca68f3
commit 54f7f90
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # PanGraph Changelog
 
+## v0.7.3
+
+- bugfix in graph building: a particular edge-case would cause minor inconsistencies in the block alignment when merging graphs, see issue [#62](https://github.com/neherlab/pangraph/issues/62) and PR [#63](https://github.com/neherlab/pangraph/pull/63).
+
 ## v0.7.2
 
 - minor fix in tree midpoint rooting during panX export, see [#59](https://github.com/neherlab/pangraph/issues/59).

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "PanGraph"
 uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814"
 authors = ["Nicholas Noll <nbnoll@eml.cc>"]
-version = "0.7.2"
+version = "0.7.3"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md
@@ -17,6 +17,8 @@ Build a multiple sequence alignment pangraph.
 | alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
 | kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
 | consistency check    | Boolean | t          | test             | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
+| verbose mode         | Boolean | v          | verbose          | toggle to activate verbose mode                                                                               |
+| debug mode           | Boolean | D          | debug            | toggle to activate debug mode: during merging intermediate graphs are saved in the `debug` folder             |
 | random seed          | Int     | r          | random-seed      | random seed for pangraph construction.                                                                        |
 
 ## Arguments

diff --git a/src/align.jl b/src/align.jl
@@ -463,6 +463,7 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct
         merges = preprocess(hits, skip, energy, block)
         length(merges) > 0 || break
 
+        # dictionary block-id => block
         blocks = align_kernel(merges, minblock, replace, verbose)
         merge!(blocks, G₀.block)
 
@@ -593,6 +594,7 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a
 
     merges = preprocess(hits, skip, energy, block)
 
+    # dictionary block-id => block
     blocks   = align_kernel(merges, minblock, replace, verbose)
     sequence = merge(G₁.sequence, G₂.sequence)
 
@@ -618,7 +620,7 @@ end
 # TODO: the associative array is a bit hacky...
 #       can we push it directly into the channel?
 """
-	align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100)
+	align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, debugdir=nothing)
 
 Aligns a collection of graphs `Gs` using the specified `aligner` function to recover hits.
 Graphs are aligned following an internal guide tree, generated using kmer distance.
@@ -631,7 +633,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 
 `compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
 """
-function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0)
+function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0, debugdir=nothing)
     function verify(graph; msg="")
         if reference !== nothing
             for (name,path) ∈ graph.sequence
@@ -718,20 +720,26 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                 # the lock ensures that at most N=Threads.nthreads() processes are
                 # spawning run(`cmd`) instances at the same time
                 G₀ = lock_semaphore(s) do
-                    verbose && log("--> align-pair for clade n. $n_clade")
+                    verbose && log("--> align-pair for clade n. $n_clade from $(Pₗ) and $(Pᵣ)")
                     G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose)
                     verbose && log("--> align-self for clade n. $n_clade")
                     G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter)
                     verbose && log("--> graph merging for clade n. $n_clade completed")
                     G₀
                 end
 
-                # DEBUG : save graph at each iteration in a file
-                # open("issue/comp/graph_iteration_$(n_clade).json", "w") do io
-                #     finalize!(G₀)
-                #     marshal(io, G₀; fmt=:json)
-                # end
-
+                # if debug: save graph at each iteration in a file
+                if debugdir !== nothing
+                    open("$(debugdir)/graph_iteration_$(n_clade).json", "w") do io
+                        finalize!(G₀)
+                        marshal(io, G₀; fmt=:json)
+                    end
+                    # if the file exists, remove input intermediate files that were successfully merged
+                    fl = "$(debugdir)/graph_iteration_$(Pₗ).json"
+                    fr = "$(debugdir)/graph_iteration_$(Pᵣ).json"
+                    isfile(fl) && rm(fl)
+                    isfile(fr) && rm(fr)
+                end
 
                 # advance progress bar in a thread-safe way
                 lock(meter_lock) do

diff --git a/src/block.jl b/src/block.jl
@@ -957,6 +957,22 @@ function reconsensus!(b::Block)
     return true
 end
 
+# """
+#     debug function that saves all of the block information on a text file.
+# """
+# function all_block_info(io::IO, b::Block)
+#     println(io, "Block ID: ", b.uuid)
+#     println(io, "Sequence: ", b.sequence .|> Char |> join)
+#     println(io, "Gaps: ", b.gaps)
+#     for (name, dict) in zip(["Mutations", "Insertions", "Deletions"], [b.mutate, b.insert, b.delete])
+#         println(io, "$name ---")
+#         for (node, value) in dict
+#             println(io, "\tNode: ", hash(node))
+#             println(io, "\t\t", value)
+#         end
+#     end
+# end
+
 # TODO: align consensus sequences within overlapping gaps of qry and ref.
 #       right now we parsimoniously stuff all sequences at the beginning of gaps
 #       problems:
@@ -998,7 +1014,6 @@ function rereference(qry::Block, ref::Block, segments)
                     # TODO: allow for (-) hamming alignments
                     gap = gapconsensus(qry, x.qry-1)
                     pos = hamming_align(gap, ref.sequence[Δ])-1
-
                     newgap = (Δ.stop, 0)
 
                     for node ∈ keys(qry)
@@ -1015,7 +1030,7 @@ function rereference(qry::Block, ref::Block, segments)
                             start = Δ.start + pos + δ
                             stop  = start + length(ins) - 1
 
-                            if 1 ≤ start ≤ Δ.stop 
+                            if 1 ≤ start ≤ Δ.stop
                                 for i ∈ start:min(Δ.stop,stop)
                                     if ins[i-start+1] != ref.sequence[i]
                                         if node ∉ keys(combined.mutate)
@@ -1080,17 +1095,40 @@ function rereference(qry::Block, ref::Block, segments)
                     end
 
                     newgap = nothing
-                else
+                else # no insertions in qry, just add as new deletion
                     newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry))
                     merge!(combined.delete, newdeletes)
                 end
 
                 x = (qry=x.qry, ref=Δ.stop+1)
             end
             (Δ, nothing) => let # sequence in qry consensus not found in ref consensus
-                mutate = translate(lociwithin(qry.mutate,Δ),1-Δ.start)
-                insert = translate(lociwithin(qry.insert,Δ),1-Δ.start)
-                delete = translate(lociwithin(qry.delete,Δ),1-Δ.start)
+                mutate = lociwithin(qry.mutate,Δ)
+                insert = lociwithin(qry.insert,Δ)
+                delete = lociwithin(qry.delete,Δ)
+
+                # delete all query insertions that will be accounted for in this segment
+                del_gaps = Int[]
+                for (node, subdict) in insert
+                    for (locus, nuc) in subdict
+                        # @assert locus[1] in keys(qry.gaps)
+                        # append locus to del_gaps
+                        push!(del_gaps, locus[1])
+                        delete!(qry.insert[node], locus)
+                    end
+                end
+
+                # remove corresponding query gaps
+                for dg in unique(del_gaps)
+                    # @assert dg ∉ [locus[1] for (node, subdict) ∈ qry.insert for (locus, nuc) ∈ keys(subdict)]
+                    delete!(qry.gaps, dg)
+                end
+
+                # shift variations on the segment start
+                mutate = translate(mutate,1-Δ.start)
+                insert = translate(insert,1-Δ.start)
+                delete = translate(delete,1-Δ.start)
+
 
                 if (x.ref-1) ∈ keys(newgaps) # TODO: more sophisticated alignment? have to worry about overriding alignment
                     δ = newgaps[x.ref-1] #hamming_align(qry.sequence[Δ], gapconsensus(combined.insert, newgaps[x.ref-1], x.ref-1)) - 1
@@ -1109,13 +1147,13 @@ function rereference(qry::Block, ref::Block, segments)
                             if (δ+length(seq)) > last(newgap)
                                 newgap = (first(newgap),length(seq)+δ)
                             end
-                            node => InsMap((x.ref-1,δ) => seq) 
+                            node => InsMap((x.ref-1,δ) => seq)
                         else
                             node => InsMap()
                         end
                     end for node ∈ keys(qry)
                 )
-
+                
                 if last(newgap) > 0
                     if newgap[1] ∉ keys(newgaps) || newgap[2] > newgaps[newgap[1]]
                         newgaps[newgap[1]] = newgap[2]
@@ -1143,7 +1181,7 @@ function rereference(qry::Block, ref::Block, segments)
                    ) for node ∈ keys(qry)
                 )
 
-                # XXX: hacky way to ensure deletions are not inclued in newmuts
+                # XXX: hacky way to ensure deletions are not included in newmuts
                 newdels = map(qry.delete,Δq,Δr)
                 for (node, subdict) in newdels
                     for (pos, len) in subdict

diff --git a/src/build.jl b/src/build.jl
@@ -84,6 +84,20 @@ Build = Command(
             "toggle to activate consistency check at each step of the graph merging process",
             false,
         ),
+        Arg(
+            Bool,
+            "debug mode",
+            (short = "-D", long = "--debug"),
+            "toggle to activate debug mode: saves intermediate graphs in `debug` folder",
+            false,
+        ),
+        Arg(
+            Bool,
+            "verbose mode",
+            (short = "-v", long = "--verbose"),
+            "toggle to activate verbose mode",
+            false,
+        ),
         Arg(
             Int,
             "random seed",
@@ -93,16 +107,37 @@ Build = Command(
         ),
     ],
     (args) -> let
+
+        # parse input files
         files = parse(Build, args)
         files = if files === nothing || length(files) == 0
             ["/dev/stdin"]
         else
             files
         end
 
+        # if verbose mode: print all arguments
+        verbose = arg(Build, "-v")
+
+        if verbose
+            println(stderr, "pangraph build command arguments:")
+            for arg in Build.arg
+                println(stderr, "\t", arg.flag.long, " = ", arg.value)
+            end
+        end
+
+        debugdir = nothing
+        if arg(Build, "-D")
+            # create debug directory
+            debugdir = joinpath(pwd(), "debug")
+            isdir(debugdir) || mkdir(debugdir)
+        end
+
+        # parse command arguments
         minblock = arg(Build, "-l")
         circular = arg(Build, "-c")
         uppercase = arg(Build, "-u")
+        verbose = arg(Build, "-v")
 
         α = arg(Build, "-a")
         β = arg(Build, "-b")
@@ -186,8 +221,9 @@ Build = Command(
             minblock = minblock,
             maxiter = maxiter,
             reference = reference,
-            verbose = false,
+            verbose = verbose,
             rand_seed = r_seed,
+            debugdir = debugdir,
         )
         finalize!(graph)