Skip to content

Commit

Permalink
Merge pull request #63 from neherlab/fix/issue-62
Browse files Browse the repository at this point in the history
Fix/issue 62
  • Loading branch information
mmolari committed Nov 21, 2023
2 parents b11ef76 + 3ca68f3 commit 54f7f90
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 20 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,9 @@
# PanGraph Changelog

## v0.7.3

- bugfix in graph building: a particular edge-case would cause minor inconsistencies in the block alignment when merging graphs, see issue [#62](https://github.com/neherlab/pangraph/issues/62) and PR [#63](https://github.com/neherlab/pangraph/pull/63).

## v0.7.2

- minor fix in tree midpoint rooting during panX export, see [#59](https://github.com/neherlab/pangraph/issues/59).
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "PanGraph"
uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814"
authors = ["Nicholas Noll <nbnoll@eml.cc>"]
version = "0.7.2"
version = "0.7.3"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand Down
2 changes: 2 additions & 0 deletions docs/src/cli/build.md
Expand Up @@ -17,6 +17,8 @@ Build a multiple sequence alignment pangraph.
| alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" |
| kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. |
| consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
| verbose mode | Boolean | v | verbose | toggle to activate verbose mode |
| debug mode | Boolean | D | debug | toggle to activate debug mode: during merging intermediate graphs are saved in the `debug` folder |
| random seed | Int | r | random-seed | random seed for pangraph construction. |

## Arguments
Expand Down
26 changes: 17 additions & 9 deletions src/align.jl
Expand Up @@ -463,6 +463,7 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct
merges = preprocess(hits, skip, energy, block)
length(merges) > 0 || break

# dictionary block-id => block
blocks = align_kernel(merges, minblock, replace, verbose)
merge!(blocks, G₀.block)

Expand Down Expand Up @@ -593,6 +594,7 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a

merges = preprocess(hits, skip, energy, block)

# dictionary block-id => block
blocks = align_kernel(merges, minblock, replace, verbose)
sequence = merge(G₁.sequence, G₂.sequence)

Expand All @@ -618,7 +620,7 @@ end
# TODO: the associative array is a bit hacky...
# can we push it directly into the channel?
"""
align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100)
align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, debugdir=nothing)
Aligns a collection of graphs `Gs` using the specified `aligner` function to recover hits.
Graphs are aligned following an internal guide tree, generated using kmer distance.
Expand All @@ -631,7 +633,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
`compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
"""
function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0)
function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0, debugdir=nothing)
function verify(graph; msg="")
if reference !== nothing
for (name,path) graph.sequence
Expand Down Expand Up @@ -718,20 +720,26 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
# the lock ensures that at most N=Threads.nthreads() processes are
# spawning run(`cmd`) instances at the same time
G₀ = lock_semaphore(s) do
verbose && log("--> align-pair for clade n. $n_clade")
verbose && log("--> align-pair for clade n. $n_clade from $(Pₗ) and $(Pᵣ)")
G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose)
verbose && log("--> align-self for clade n. $n_clade")
G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter)
verbose && log("--> graph merging for clade n. $n_clade completed")
G₀
end

# DEBUG : save graph at each iteration in a file
# open("issue/comp/graph_iteration_$(n_clade).json", "w") do io
# finalize!(G₀)
# marshal(io, G₀; fmt=:json)
# end

# if debug: save graph at each iteration in a file
if debugdir !== nothing
open("$(debugdir)/graph_iteration_$(n_clade).json", "w") do io
finalize!(G₀)
marshal(io, G₀; fmt=:json)
end
# if the file exists, remove input intermediate files that were successfully merged
fl = "$(debugdir)/graph_iteration_$(Pₗ).json"
fr = "$(debugdir)/graph_iteration_$(Pᵣ).json"
isfile(fl) && rm(fl)
isfile(fr) && rm(fr)
end

# advance progress bar in a thread-safe way
lock(meter_lock) do
Expand Down
56 changes: 47 additions & 9 deletions src/block.jl
Expand Up @@ -957,6 +957,22 @@ function reconsensus!(b::Block)
return true
end

# """
# debug function that saves all of the block information on a text file.
# """
# function all_block_info(io::IO, b::Block)
# println(io, "Block ID: ", b.uuid)
# println(io, "Sequence: ", b.sequence .|> Char |> join)
# println(io, "Gaps: ", b.gaps)
# for (name, dict) in zip(["Mutations", "Insertions", "Deletions"], [b.mutate, b.insert, b.delete])
# println(io, "$name ---")
# for (node, value) in dict
# println(io, "\tNode: ", hash(node))
# println(io, "\t\t", value)
# end
# end
# end

# TODO: align consensus sequences within overlapping gaps of qry and ref.
# right now we parsimoniously stuff all sequences at the beginning of gaps
# problems:
Expand Down Expand Up @@ -998,7 +1014,6 @@ function rereference(qry::Block, ref::Block, segments)
# TODO: allow for (-) hamming alignments
gap = gapconsensus(qry, x.qry-1)
pos = hamming_align(gap, ref.sequence[Δ])-1

newgap =.stop, 0)

for node keys(qry)
Expand All @@ -1015,7 +1030,7 @@ function rereference(qry::Block, ref::Block, segments)
start = Δ.start + pos + δ
stop = start + length(ins) - 1

if 1 start Δ.stop
if 1 start Δ.stop
for i start:min.stop,stop)
if ins[i-start+1] != ref.sequence[i]
if node keys(combined.mutate)
Expand Down Expand Up @@ -1080,17 +1095,40 @@ function rereference(qry::Block, ref::Block, segments)
end

newgap = nothing
else
else # no insertions in qry, just add as new deletion
newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node keys(qry))
merge!(combined.delete, newdeletes)
end

x = (qry=x.qry, ref=Δ.stop+1)
end
(Δ, nothing) => let # sequence in qry consensus not found in ref consensus
mutate = translate(lociwithin(qry.mutate,Δ),1-Δ.start)
insert = translate(lociwithin(qry.insert,Δ),1-Δ.start)
delete = translate(lociwithin(qry.delete,Δ),1-Δ.start)
mutate = lociwithin(qry.mutate,Δ)
insert = lociwithin(qry.insert,Δ)
delete = lociwithin(qry.delete,Δ)

# delete all query insertions that will be accounted for in this segment
del_gaps = Int[]
for (node, subdict) in insert
for (locus, nuc) in subdict
# @assert locus[1] in keys(qry.gaps)
# append locus to del_gaps
push!(del_gaps, locus[1])
delete!(qry.insert[node], locus)
end
end

# remove corresponding query gaps
for dg in unique(del_gaps)
# @assert dg ∉ [locus[1] for (node, subdict) ∈ qry.insert for (locus, nuc) ∈ keys(subdict)]
delete!(qry.gaps, dg)
end

# shift variations on the segment start
mutate = translate(mutate,1-Δ.start)
insert = translate(insert,1-Δ.start)
delete = translate(delete,1-Δ.start)


if (x.ref-1) keys(newgaps) # TODO: more sophisticated alignment? have to worry about overriding alignment
δ = newgaps[x.ref-1] #hamming_align(qry.sequence[Δ], gapconsensus(combined.insert, newgaps[x.ref-1], x.ref-1)) - 1
Expand All @@ -1109,13 +1147,13 @@ function rereference(qry::Block, ref::Block, segments)
if+length(seq)) > last(newgap)
newgap = (first(newgap),length(seq)+δ)
end
node => InsMap((x.ref-1,δ) => seq)
node => InsMap((x.ref-1,δ) => seq)
else
node => InsMap()
end
end for node keys(qry)
)

if last(newgap) > 0
if newgap[1] keys(newgaps) || newgap[2] > newgaps[newgap[1]]
newgaps[newgap[1]] = newgap[2]
Expand Down Expand Up @@ -1143,7 +1181,7 @@ function rereference(qry::Block, ref::Block, segments)
) for node keys(qry)
)

# XXX: hacky way to ensure deletions are not inclued in newmuts
# XXX: hacky way to ensure deletions are not included in newmuts
newdels = map(qry.delete,Δq,Δr)
for (node, subdict) in newdels
for (pos, len) in subdict
Expand Down
38 changes: 37 additions & 1 deletion src/build.jl
Expand Up @@ -84,6 +84,20 @@ Build = Command(
"toggle to activate consistency check at each step of the graph merging process",
false,
),
Arg(
Bool,
"debug mode",
(short = "-D", long = "--debug"),
"toggle to activate debug mode: saves intermediate graphs in `debug` folder",
false,
),
Arg(
Bool,
"verbose mode",
(short = "-v", long = "--verbose"),
"toggle to activate verbose mode",
false,
),
Arg(
Int,
"random seed",
Expand All @@ -93,16 +107,37 @@ Build = Command(
),
],
(args) -> let

# parse input files
files = parse(Build, args)
files = if files === nothing || length(files) == 0
["/dev/stdin"]
else
files
end

# if verbose mode: print all arguments
verbose = arg(Build, "-v")

if verbose
println(stderr, "pangraph build command arguments:")
for arg in Build.arg
println(stderr, "\t", arg.flag.long, " = ", arg.value)
end
end

debugdir = nothing
if arg(Build, "-D")
# create debug directory
debugdir = joinpath(pwd(), "debug")
isdir(debugdir) || mkdir(debugdir)
end

# parse command arguments
minblock = arg(Build, "-l")
circular = arg(Build, "-c")
uppercase = arg(Build, "-u")
verbose = arg(Build, "-v")

α = arg(Build, "-a")
β = arg(Build, "-b")
Expand Down Expand Up @@ -186,8 +221,9 @@ Build = Command(
minblock = minblock,
maxiter = maxiter,
reference = reference,
verbose = false,
verbose = verbose,
rand_seed = r_seed,
debugdir = debugdir,
)
finalize!(graph)

Expand Down

0 comments on commit 54f7f90

Please sign in to comment.