From 61d63a9ce57ed1c4da61a4046a595feb208e709a Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 2 Jun 2019 16:23:25 +0200 Subject: [PATCH 1/4] greatly reduce RegisteredAlignment's size by removing Parameters object This never should have been included in this structure, as it would have massively increased the size of the RegisteredAlignment. This might have been the cause of memory problems people have encountered! --- src/AlleleParser.cpp | 2 +- src/AlleleParser.h | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp index ca5d67e4..c63d5ace 100644 --- a/src/AlleleParser.cpp +++ b/src/AlleleParser.cpp @@ -2050,7 +2050,7 @@ void AlleleParser::updateAlignmentQueue(long int position, //cerr << "parameters capcoverage " << parameters.capCoverage << " " << rq.size() << endl; if (considerAlignment) { // and insert the registered alignment into that deque - rq.push_front(RegisteredAlignment(currentAlignment, parameters)); + rq.push_front(RegisteredAlignment(currentAlignment)); RegisteredAlignment& ra = rq.front(); registerAlignment(currentAlignment, ra, sampleName, sequencingTech); // backtracking if we have too many mismatches diff --git a/src/AlleleParser.h b/src/AlleleParser.h index 9e700710..89663aff 100644 --- a/src/AlleleParser.h +++ b/src/AlleleParser.h @@ -57,10 +57,8 @@ class RegisteredAlignment { int snpCount; int indelCount; int alleleTypes; - Parameters parameters; - RegisteredAlignment(BAMALIGN& alignment, Parameters parameters) - //: alignment(alignment) + RegisteredAlignment(BAMALIGN& alignment) : start(alignment.POSITION) , end(alignment.ENDPOSITION) , refid(alignment.REFID) @@ -69,7 +67,6 @@ class RegisteredAlignment { , snpCount(0) , indelCount(0) , alleleTypes(0) - , parameters(parameters) { FILLREADGROUP(readgroup, alignment); } From e666b5b374666639626c28ade6ccad1363beccd3 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 2 Jun 2019 17:02:35 +0200 Subject: [PATCH 2/4] correct mistake with registered alignment skip coverage exclusion --- src/AlleleParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp index c63d5ace..ad5b41af 100644 --- a/src/AlleleParser.cpp +++ b/src/AlleleParser.cpp @@ -2084,7 +2084,7 @@ void AlleleParser::removeRegisteredAlignmentsOverlappingPosition(long unsigned i set allelesToErase; while (f != registeredAlignments.end()) { for (deque::iterator d = f->second.begin(); d != f->second.end(); ++d) { - if (d->start >= pos && d->end > pos) { + if (d->start <= pos && d->end > pos) { alignmentsToErase[f->first].insert(d); for (vector::iterator a = d->alleles.begin(); a != d->alleles.end(); ++a) { allelesToErase.insert(&*a); From 4b8845ac78f8e5a8900bbfde9549c16fd66c556d Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Mon, 3 Jun 2019 09:59:51 +0200 Subject: [PATCH 3/4] change test to match current (correct!) behavior --- test/t/01_call_variants.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/01_call_variants.t b/test/t/01_call_variants.t index 40cb02cb..0756b51a 100755 --- a/test/t/01_call_variants.t +++ b/test/t/01_call_variants.t @@ -119,5 +119,5 @@ samtools view -h tiny/NA12878.chr22.tiny.bam | sed s/NA12878D_HiSeqX_R1.fastq.gz is $(freebayes -f tiny/q.fa -F 0.2 tiny/NA12878.chr22.tiny.bam x.sam -A <(echo 1 8; echo 2 13) | grep 'AN=21' | wc -l) 19 "the CNV map may be used to specify per-sample copy numbers" rm -f x.sam -is $(freebayes -f tiny/q.fa --skip-coverage 30 tiny/NA12878.chr22.tiny.bam | grep -v '^#' | wc -l) 27 "freebayes makes the expected number of calls when capping coverage" -is $(freebayes -f tiny/q.fa -g 30 tiny/NA12878.chr22.tiny.bam | vcfkeepinfo - DP | vcf2tsv | cut -f 8 | tail -n+2 | awk '$1 <= 30 { print }' | wc -l) 27 "all coverage capped calls are below the coverage threshold" +is $(freebayes -f tiny/q.fa --skip-coverage 30 tiny/NA12878.chr22.tiny.bam | grep -v '^#' | wc -l) 22 "freebayes makes the expected number of calls when capping coverage" +is $(freebayes -f tiny/q.fa -g 30 tiny/NA12878.chr22.tiny.bam | vcfkeepinfo - DP | vcf2tsv | cut -f 8 | tail -n+2 | awk '$1 <= 30 { print }' | wc -l) 22 "all coverage capped calls are below the coverage threshold" From ffcee0d8719ffdbbf0aa7059b83999fb3fc9d810 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Mon, 3 Jun 2019 15:16:34 +0200 Subject: [PATCH 4/4] avoid hanging in super low entropy sequence We should never be running the entropy calculation past the read end. It won't ever support a haplotype call to do so. This change may very slightly change behavior of the algorithm in low entropy sequence regions. --- src/AlleleParser.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/AlleleParser.cpp b/src/AlleleParser.cpp index ad5b41af..8e47a6aa 100644 --- a/src/AlleleParser.cpp +++ b/src/AlleleParser.cpp @@ -1266,19 +1266,16 @@ Allele AlleleParser::makeAllele(RegisteredAlignment& ra, // a dangerous game int start = pos - currentSequenceStart; double minEntropy = parameters.minRepeatEntropy; - // check first that' wer'e actually ina repeat... TODO - //cerr << "entropy of " << entropy(currentSequence.substr(start, repeatRightBoundary - pos)) << " is too low, " << endl; while (minEntropy > 0 && // ignore if turned off - repeatRightBoundary - currentSequenceStart < currentSequence.size() && //guard + // don't run off the end of the current sequence + repeatRightBoundary - currentSequenceStart < currentSequence.size() && + // there is no point in going past the alignment end + // because we won't make a haplotype call unless we have a covering observation from a read + repeatRightBoundary < alignment.ENDPOSITION && entropy(currentSequence.substr(start, repeatRightBoundary - pos)) < minEntropy) { - //cerr << "entropy of " << entropy(currentSequence.substr(start, repeatRightBoundary - pos)) << " is too low, "; - //cerr << "increasing rought boundary to "; ++repeatRightBoundary; - //cerr << repeatRightBoundary << endl; } - // now we - //cachedRepeatCounts[pos] = repeatCounts(pos - currentSequenceStart, currentSequence, 12); // edge case, the indel is an insertion and matches the reference to the right // this means there is a repeat structure in the read, but not the ref if (currentSequence.substr(pos - currentSequenceStart, length) == readSequence) {