Skip to content

Commit

Permalink
improve performance and reduce false positive
Browse files Browse the repository at this point in the history
  • Loading branch information
sfchen committed Feb 18, 2022
1 parent a52d31b commit 69ddeba
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
4 changes: 4 additions & 0 deletions src/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,9 @@ static const int PACK_SIZE = 1000;
// if the number of in memory packs is full, the producer thread should sleep
static const int PACK_IN_MEM_LIMIT = 100;

// the key dup in normal level will be kept, in high level will be skipped
static const int DUPE_NORMAL_LEVEL = -1;
static const int DUPE_HIGH_LEVEL = -2;


#endif /* COMMON_H */
27 changes: 19 additions & 8 deletions src/indexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,24 @@ void Indexer::indexContig(int ctg, string seq, int start) {
if(mKmerPos.count(kmer) >0 ){
GenePos gp = mKmerPos[kmer];
// already marked as a dupe
if(gp.contig < 0) {
mDupeList[gp.position].push_back(site);
if(gp.contig == DUPE_HIGH_LEVEL){
// skip it since it's already a high level dupe
continue;
} else if(gp.contig == DUPE_NORMAL_LEVEL) {
if(mDupeList[gp.position].size() >= GlobalSettings::skipKeyDupThreshold) {
mKmerPos[kmer].contig = DUPE_HIGH_LEVEL;
mDupeList[gp.position]=vector<GenePos>();
}
else
mDupeList[gp.position].push_back(site);
} else {
// else make a new dupe entry
vector<GenePos> gps;
gps.push_back(gp);
gps.push_back(site);
mDupeList.push_back(gps);
// and mark it as a dupe
mKmerPos[kmer].contig = -1;
mKmerPos[kmer].contig = DUPE_NORMAL_LEVEL;
mKmerPos[kmer].position = mDupeList.size() -1;
mUniquePos--;
mDupePos++;
Expand Down Expand Up @@ -126,18 +134,18 @@ vector<SeqMatch> Indexer::mapRead(Read* r) {
}
GenePos gp = mKmerPos[kmer];
// is a dupe
if(gp.contig < 0) {
if(gp.contig == DUPE_HIGH_LEVEL) {
// too much keys in this dupe, then skip it
if(mDupeList[gp.position].size() > GlobalSettings::skipKeyDupThreshold)
continue;
continue;
} else if(gp.contig == DUPE_NORMAL_LEVEL) {
for(int g=0; g<mDupeList[gp.position].size();g++) {
long gplong = gp2long(shift(mDupeList[gp.position][g], i));
if(kmerStat.count(gplong)==0)
kmerStat[gplong] = 1;
else
kmerStat[gplong] += 1;
}
} else {
} else { // not a dupe
long gplong = gp2long(shift(gp, i));
if(kmerStat.count(gplong)==0)
kmerStat[gplong] = 1;
Expand Down Expand Up @@ -183,7 +191,10 @@ vector<SeqMatch> Indexer::mapRead(Read* r) {
continue;
GenePos gp = mKmerPos[kmer];
// is a dupe
if(gp.contig < 0) {
if(gp.contig == DUPE_HIGH_LEVEL) {
// too much keys in this dupe, then skip it
continue;
} else if(gp.contig == DUPE_NORMAL_LEVEL) {
for(int g=0; g<mDupeList[gp.position].size();g++) {
long gplong = gp2long(shift(mDupeList[gp.position][g], i));
if(abs(gplong - gp1) <= 1)
Expand Down

0 comments on commit 69ddeba

Please sign in to comment.