Skip to content

Commit

Permalink
Merge pull request #80 from dib-lab/aaKmers
Browse files Browse the repository at this point in the history
  • Loading branch information
mr-eyes committed May 23, 2021
2 parents e351568 + e03f554 commit 07a3a5e
Show file tree
Hide file tree
Showing 16 changed files with 416 additions and 111 deletions.
6 changes: 4 additions & 2 deletions include/kProcessor/algorithms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ kmerDecoder* initialize_kmerDecoder(int kmer_size, int hash_mode = 1);
* Mode 1: Integer Hashing | Reversible | Full Hashing
* Mode 2: TwoBitsHashing | Not considered hashing, just store the two bits representation
*/
void kmerDecoder_setHashing(kmerDecoder * KD, int hash_mode, bool canonical = true);
void kmerDecoder_setHashing(kDataFrame * KF, int hash_mode, bool canonical = true);
void kmerDecoder_setHashing(kmerDecoder * KD, hashingModes hash_mode);
void kmerDecoder_setHashing(kDataFrame * KF, hashingModes hash_mode);

/// Perform indexing to a sequences file with predefined kmers decoding mode, returns a colored kDataframe.
colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *frame);
Expand All @@ -104,5 +104,7 @@ colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *fr
colored_kDataFrame *
index(kDataFrame *frame, std::map<std::string, int> parse_params, string filename, int chunks, string names_fileName);

colored_kDataFrame * index(kDataFrame *frame, string filename, int chunks, string names_fileName);

}
#endif
17 changes: 12 additions & 5 deletions include/kProcessor/kDataFrame.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,13 +340,17 @@ class kDataFrameMQF: public kDataFrame{
public:
kDataFrameMQF();
kDataFrameMQF(uint64_t kSize);
kDataFrameMQF(uint64_t kSize, int mode);
kDataFrameMQF(uint64_t kSize, hashingModes hash_mode);
kDataFrameMQF(readingModes RM, hashingModes HM, map<string, int> params);
kDataFrameMQF(uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize
,double falsePositiveRate);

kDataFrameMQF(uint64_t ksize, uint8_t q, int mode);
kDataFrameMQF(uint64_t ksize, uint8_t q, hashingModes hash_mode = integer_hasher);

kDataFrameMQF(QF* mqf,uint64_t ksize,double falsePositiveRate);
kDataFrameMQF(QF *mqf, uint64_t ksize, hashingModes hash_mode);
kDataFrameMQF(QF *mqf, readingModes RM, hashingModes HM, map<string, int> params);

//count histogram is array where count of kmers repeated n times is found at index n. index 0 holds number of distinct kmers.
kDataFrameMQF(uint64_t ksize,vector<uint64_t> countHistogram,uint8_t tagSize
,double falsePositiveRate);
Expand Down Expand Up @@ -439,7 +443,9 @@ class kDataFrameBMQF: public kDataFrame{
kDataFrameIterator* endIterator;
public:
kDataFrameBMQF();
kDataFrameBMQF(uint64_t kSize);
kDataFrameBMQF(readingModes RM, hashingModes HM, map<string, int> params);
kDataFrameBMQF(uint64_t kSize, hashingModes hash_mode = integer_hasher);
kDataFrameBMQF(bufferedMQF* bufferedmqf, readingModes RM, hashingModes HM, map<string, int> params);
kDataFrameBMQF(uint64_t ksize,uint8_t q,uint8_t fixedCounterSize,uint8_t tagSize,double falsePositiveRate);
kDataFrameBMQF(bufferedMQF* bufferedmqf,uint64_t ksize,double falsePositiveRate);
//count histogram is array where count of kmers repeated n times is found at index n. index 0 holds number of distinct kmers.
Expand Down Expand Up @@ -511,6 +517,7 @@ class kDataFrameMAP : public kDataFrame
public:
kDataFrameMAP();
kDataFrameMAP(uint64_t ksize);
kDataFrameMAP(readingModes RM, hashingModes HM, map<string, int> params);
kDataFrameMAP(uint64_t kSize,vector<uint64_t> kmersHistogram);
kDataFrame* getTwin();
void reserve (uint64_t n);
Expand Down Expand Up @@ -588,9 +595,9 @@ class kDataFramePHMAP : public kDataFrame {
flat_hash_map<uint64_t, uint64_t> MAP;
public:
kDataFramePHMAP();

kDataFramePHMAP(uint64_t ksize);
kDataFramePHMAP(uint64_t ksize, int mode);
kDataFramePHMAP(readingModes RM, hashingModes hash_mode, map<string, int> params);
kDataFramePHMAP(uint64_t ksize, hashingModes hash_mode);
kDataFramePHMAP(uint64_t kSize,vector<uint64_t> kmersHistogram);

kDataFrame *getTwin();
Expand Down
216 changes: 201 additions & 15 deletions src/algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ namespace kProcessor {

// Clone the hashing

kmerDecoder_setHashing(KD, kframe->KD->hash_mode, kframe->KD->canonical);
kmerDecoder_setHashing(KD, kframe->KD->hash_mode);

// Processing

Expand Down Expand Up @@ -407,7 +407,7 @@ namespace kProcessor {

// Clone the hashing

kmerDecoder_setHashing(KD, frame->KD->hash_mode, frame->KD->canonical);
kmerDecoder_setHashing(KD, frame->KD->hash_mode);

if (KD->get_kSize() != (int)frame->getkSize()) {
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
Expand Down Expand Up @@ -537,12 +537,12 @@ namespace kProcessor {
}


void kmerDecoder_setHashing(kmerDecoder * KD, int hash_mode, bool canonical){
KD->setHashingMode(hash_mode, canonical);
void kmerDecoder_setHashing(kmerDecoder * KD, hashingModes hash_mode){
KD->setHashingMode(hash_mode, KD->get_kSize());
}

void kmerDecoder_setHashing(kDataFrame * KF, int hash_mode, bool canonical){
KF->KD->setHashingMode(hash_mode, canonical);
void kmerDecoder_setHashing(kDataFrame * KF, hashingModes hash_mode){
KF->KD->setHashingMode(hash_mode, KF->ksize());
}


Expand Down Expand Up @@ -641,18 +641,14 @@ namespace kProcessor {
}
}

kmerDecoder* initialize_kmerDecoder(int kSize, int hash_mode){
// Mode 0: Murmar Hashing | Irreversible
// Mode 1: Integer Hashing | Reversible | Full Hashing
// Mode 2: TwoBitsHashing | Not considered hashing, just store the two bits representation
return new Kmers(kSize, hash_mode);
kmerDecoder* initialize_kmerDecoder(int kSize, hashingModes HM = integer_hasher){
return new Kmers(kSize, HM);
}

colored_kDataFrame *index(kmerDecoder *KD, string names_fileName, kDataFrame *frame) {

if (KD->get_kSize() != (int)frame->ksize()) {
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
exit(1);
if (KD->get_kSize() != (int)frame->ksize() && (KD->hash_mode != protein_hasher || KD->hash_mode != proteinDayhoff_hasher)) {
std::cerr << "WARNING: kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
}

flat_hash_map<string, string> namesMap;
Expand Down Expand Up @@ -847,7 +843,7 @@ namespace kProcessor {

// Clone the hashing

kmerDecoder_setHashing(KD, frame->KD->hash_mode, frame->KD->canonical);
kmerDecoder_setHashing(KD, frame->KD->hash_mode);

// Processing

Expand Down Expand Up @@ -1028,6 +1024,196 @@ namespace kProcessor {
return res;
}

colored_kDataFrame * index(kDataFrame * frame, string filename, int chunk_size, string names_fileName){

// parse_params["mode"] = 1 > Default: Kmers
// parse_params["mode"] = 2 > Skipmers
// parse_params["mode"] = 3 > Minimizers

// Initialize kmerDecoder
map<string, int> params = kmerDecoder::string_to_params(frame->KD->params_to_string());

kmerDecoder * KD = kmerDecoder::getInstance(filename, chunk_size, frame->KD->slicing_mode, frame->KD->hash_mode, params);

// Processing

if (KD->get_kSize() != (int)frame->ksize()) {
std::cerr << "kmerDecoder kSize must be equal to kDataFrame kSize" << std::endl;
exit(1);
}

flat_hash_map<string, string> namesMap;
flat_hash_map<string, uint64_t> tagsMap;
flat_hash_map<string, uint64_t> groupNameMap;
auto *legend = new flat_hash_map<uint64_t, std::vector<uint32_t>>();
flat_hash_map<uint64_t, uint64_t> colorsCount;
uint64_t readID = 0, groupID = 1;
ifstream namesFile(names_fileName.c_str());
string seqName, groupName;
string line;
priority_queue<uint64_t, vector<uint64_t>, std::greater<uint64_t>> freeColors;
flat_hash_map<string, uint64_t> groupCounter;
// while (namesFile >> seqName >> groupName) {
while (std::getline(namesFile, line)) {
std::vector<string> tokens;
std::istringstream iss(line);
std::string token;
while(std::getline(iss, token, '\t')) // but we can specify a different one
tokens.push_back(token);
seqName = tokens[0];
groupName = tokens[1];
namesMap.insert(make_pair(seqName, groupName));
auto it = groupNameMap.find(groupName);
groupCounter[groupName]++;
if (it == groupNameMap.end()) {
groupNameMap.insert(make_pair(groupName, groupID));
tagsMap.insert(make_pair(to_string(groupID), groupID));
vector<uint32_t> tmp;
tmp.clear();
tmp.push_back(groupID);
legend->insert(make_pair(groupID, tmp));
colorsCount.insert(make_pair(groupID, 0));
groupID++;
}
}

flat_hash_map<uint64_t, string> inv_groupNameMap;
for (auto &_ : groupNameMap)
inv_groupNameMap[_.second] = _.first;


vector<kDataFrameMQF *> frames;
int currIndex = 0;
string kmer;
uint64_t tagBits = 0;
uint64_t maxTagValue = (1ULL << tagBits) - 1;
// kDataFrame *frame;
int kSize = KD->get_kSize();


uint64_t lastTag = 0;
readID = 0;
int __batch_count = 0;
while (!KD->end()) {
KD->next_chunk();
cout << "Processing Chunk(" << ++__batch_count << "): " << chunk_size*__batch_count << " seqs ..." << endl;
flat_hash_map<uint64_t, uint64_t> convertMap;

for (const auto &seq : *KD->getKmers()) {
string readName = seq.first;

auto it = namesMap.find(readName);
if (it == namesMap.end()) {
continue;
// cout << "read " << readName << "dont have group. Please check the group names file." << endl;
}
string groupName = it->second;

uint64_t readTag = groupNameMap.find(groupName)->second;


convertMap.clear();
convertMap.insert(make_pair(0, readTag));
convertMap.insert(make_pair(readTag, readTag));
// cout<<readName<<" "<<seq.size()<<endl;
for (const auto &kmer : seq.second) {
uint64_t currentTag = frame->getCount(kmer.hash);
auto itc = convertMap.find(currentTag);
if (itc == convertMap.end()) {
vector<uint32_t> colors = legend->find(currentTag)->second;
auto tmpiT = find(colors.begin(), colors.end(), readTag);
if (tmpiT == colors.end()) {
colors.push_back(readTag);
sort(colors.begin(), colors.end());
}

string colorsString = to_string(colors[0]);
for (int k = 1; k < colors.size(); k++) {
colorsString += ";" + to_string(colors[k]);
}

auto itTag = tagsMap.find(colorsString);
if (itTag == tagsMap.end()) {
uint64_t newColor;
if (freeColors.size() == 0) {
newColor = groupID++;
} else {
newColor = freeColors.top();
freeColors.pop();
}

tagsMap.insert(make_pair(colorsString, newColor));
legend->insert(make_pair(newColor, colors));
itTag = tagsMap.find(colorsString);
colorsCount[newColor] = 0;
// if(groupID>=maxTagValue){
// cerr<<"Tag size is not enough. ids reached "<<groupID<<endl;
// return -1;
// }
}
uint64_t newColor = itTag->second;

convertMap.insert(make_pair(currentTag, newColor));
itc = convertMap.find(currentTag);
}

if (itc->second != currentTag) {

colorsCount[currentTag]--;
if (colorsCount[currentTag] == 0 && currentTag != 0) {

auto _invGroupNameIT = inv_groupNameMap.find(currentTag);
if (_invGroupNameIT == inv_groupNameMap.end()){
freeColors.push(currentTag);
vector<uint32_t> colors = legend->find(currentTag)->second;
string colorsString = to_string(colors[0]);
for (unsigned int k = 1; k < colors.size(); k++) {
colorsString += ";" + to_string(colors[k]);
}
tagsMap.erase(colorsString);
legend->erase(currentTag);
if (convertMap.find(currentTag) != convertMap.end())
convertMap.erase(currentTag);
}

}
colorsCount[itc->second]++;
}

frame->setCount(kmer.hash, itc->second);
if (frame->getCount(kmer.hash) != itc->second) {
//frame->setC(kmer,itc->second);
cout << "Error Founded " << kmer.str << " from sequence " << readName << " expected "
<< itc->second << " found " << frame->getCount(kmer.hash) << endl;
return NULL;
}
}
readID += 1;
groupCounter[groupName]--;
if (colorsCount[readTag] == 0) {
if (groupCounter[groupName] == 0) {
freeColors.push(readTag);
legend->erase(readTag);
}
}
}
}
colorTable *colors = new intVectorsTable();
for (auto it : *legend) {
colors->setColor(it.first, it.second);
}

colored_kDataFrame *res = new colored_kDataFrame();
res->setColorTable(colors);
res->setkDataFrame(frame);
for (auto iit = namesMap.begin(); iit != namesMap.end(); iit++) {
uint32_t sampleID = groupNameMap[iit->second];
res->namesMap[sampleID] = iit->second;
res->namesMapInv[iit->second] = sampleID;
}
return res;
}


vector<uint64_t> estimateKmersHistogram(string fileName, int kSize ,int threads)
{
Expand Down

0 comments on commit 07a3a5e

Please sign in to comment.