forked from lh3/ropebwt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
CGkArray.h
432 lines (387 loc) · 13.3 KB
/
CGkArray.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
/******************************************************************************
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
*****************************************************************************/
#ifndef _CGkArray_H_
#define _CGkArray_H_
#include "BlockArray.h"
#include "ArrayDoc.h"
#include "HuffWT.h"
// Include from RLCSA
#include "bits/deltavector.h"
// Include from libcds
#include <basics.h> // Defines W == 32
#include <static_bitsequence.h>
// Libcds includes will collide with #define W.
// Re-defining the word size to ulong:
#undef W
#define W (CHAR_BIT*sizeof(unsigned long))
#undef bitset
#undef bitget
#include <string>
#include <vector>
#include <set>
/**
* Implementation of the Compressed Gk Arrays
*
* Use TextCollectionBuilder to construct.
*
*/
class CGkArray
{
public:
/**
* Data types for results
*/
// Pair of read number (0-based numbering) and read position (0-based numbering)
typedef std::pair<unsigned, ulong> position_result;
// Vector of read number and read position pairs
typedef std::vector<position_result> position_vector;
// Range of the suffix array
typedef std::pair<ulong,ulong> sa_range;
// Internal pointer for move left (on arbitrary patterns)
typedef std::pair<sa_range,unsigned> internal_pointer;
/**
* Convert from text position to a pair of <read number, read position>
*
* Both the read number and position follow a 0-based numbering.
*
* Input: Text position
* Output: Read number + read position
*/
inline position_result textPosToReadPos(ulong i) const
{
CSA::DeltaVector::Iterator iter(*textStartPos);
unsigned read = iter.rank(i)-1;
return std::make_pair(read, i - iter.select(read));
}
/**
* Convert from a pair of <read number, read position> to text position
*
* Both the read number and position follow a 0-based numbering.
*
* Input: Read number + read position
* Output: Text position
*/
inline ulong readPosToTextPos(unsigned read, ulong i) const
{
CSA::DeltaVector::Iterator iter(*textStartPos);
return iter.select(read) + i;
}
/**
* Check if given text position is valid (i.e. k-mer does not span over a '\0' byte).
*
* Input: Text position, use readPosToTextPos() to convert
*/
inline bool isValidTextPos(ulong i) const
{
CSA::DeltaVector::Iterator iter(*textStartPos);
unsigned read = iter.rank(i);
return (iter.select(read) - i > gk ? true : false);
}
// Return total length of text (including 0-terminators).
inline ulong getLength() const
{ return n; }
// Return the length of the given read (including 0-terminator)
ulong getLength(unsigned i) const
{
CSA::DeltaVector::Iterator iter(*textStartPos);
return iter.select(i+1) - iter.select(i);
}
// Return the k-mer size used in indexing
unsigned getGkSize() const
{ return gk; }
// Return the number of indexed reads
unsigned getNumberOfReads() const
{ return numberOfTexts; }
/**
* Find the suffix array range for the given k-mer
*
* Input: k-mer
* Output: Suffix array range
*/
sa_range kmerToSARange(uchar const *) const;
/**
* Find the suffix array range for the k-mer at the given position
*
* Input: text position, use readPosToTextPos() to convert
* Output: Suffix array range
*/
sa_range textPosToSARange(ulong x) const
{
ulong y = inverseSA(x);
return make_pair(Blcp->prev(y), Blcp->next(y+1)-1);
}
/**
* Move left operation gives an efficient way to step over each k-mer in a read.
*
* Each call to this method returns a SA range of the preceding read position,
* that is, the k-mers of the read are traversed in backwards order.
* This method allows to compute, for example, so called read-coverage profiles.
*
* Input: <internal value>, use initMoveLeft() to initialize.
* Output: Suffix array range of the preceeding read position.
* After the last step, subsequent calls return an empty SA range.
*/
sa_range moveLeft(ulong &) const;
/**
* Move left operation for arbitrary patterns
*
* Allows an efficient way to step over each k-mer in any given string.
*
* Input: <internal pointer>, use initMoveLeft() to initialize, and
* String that was used to initialize the internal pointer.
* Output: Suffix array range of the preceeding position.
* Range can be empty if the k-mer is not found in the index.
* After the last step, subsequent calls return an empty SA range.
*/
sa_range moveLeft(internal_pointer &, uchar const *) const;
/**
* Initialize move left
*
* Returns an internal value that corresponds to the end
* of the given read.
*
* Input: Read number
* Output: <internal value> that points to the last k-mer of the given read.
*/
ulong initMoveLeft(unsigned) const;
/**
* Initialize move left for arbitrary pattern
*
* Returns <internal pointer> that corresponds to the end
* of the given read.
*
* Input: Arbitrary string, assuming '\0'-terminated
* Output: <internal pointer> that points to the last k-mer of the given pattern.
*/
internal_pointer initMoveLeft(uchar const *) const;
/**
* Q1 In which reads does k-mer occur?
*
* Input: text position, use readPosToTextPos() to convert
* Output: Vector of read numbers and positions (i.e. position of the last occurrence of f in each read)
*/
inline position_vector reportReads(ulong x) const
{
ulong y = inverseSA(x);
return reportReads(Blcp->prev(y), Blcp->next(y+1)-1);
}
/**
* Q1 In which reads does k-mer occur?
*
* Input: range in the suffix array, use kmerToSARange() to find
* Output: Vector of read numbers and positions (i.e. position of the last occurrence of f in each read)
*/
inline position_vector reportReads(sa_range const &range) const
{
return reportReads(range.first, range.second);
}
/**
* Q2 In how many reads does k-mer occur?
*
* Input: text position, use readPosToTextPos() to convert
*/
inline unsigned countReads(ulong x) const
{
ulong y = inverseSA(x);
return countReads(Blcp->prev(y), Blcp->next(y+1)-1);
}
/**
* Q2 In how many reads does k-mer occur?
*
* Input: range in the suffix array, use kmerToSARange() to find
*/
inline unsigned countReads(sa_range const &range) const
{
return countReads(range.first, range.second);
}
/**
* Q3 What are the occurrence positions of k-mer?
*
* Input: text position, use readPosToTextPos() to convert
* Output: Vector of read numbers and positions (i.e. all occurrences of f)
*/
inline position_vector reportOccs(ulong x) const
{
ulong y = inverseSA(x);
return reportOccs(make_pair(Blcp->prev(y), Blcp->next(y+1)-1));
}
/**
* Q3 What are the occurrence positions of k-mer?
*
* Input: range of the suffix array, use kmerToSARange() to find
* Output: Vector of read numbers and positions (i.e. all occurrences of f)
*/
position_vector reportOccs(sa_range const &) const;
/**
* Q4 What is the number of occurrences of k-mer?
*
* Input: text position, use readPosToTextPos() to convert
*/
inline unsigned countOccs(ulong x) const
{
ulong y = inverseSA(x);
return (Blcp->next(y+1)-1) - Blcp->prev(y) + 1;
}
/**
* Q4 What is the number of occurrences of k-mer?
*
* Input: range in the suffix array, use kmerToSARange() to find
*/
inline unsigned countOccs(sa_range const &range) const
{
return range.second - range.first + 1;
}
/**
* Returns a copy of an indexed read.
*
* Input: Read number (0-based)
* Output: Copy of the read. Caller must delete [] the buffer.
*/
uchar * getRead(unsigned) const;
/**
* Given suffix i and substring length l, return T[SA[i] ... SA[i]+l].
*
* Caller must delete [] the buffer.
*/
uchar * getSuffix(ulong dest, unsigned l) const;
/**
* Inverse suffix array
*
* Input: Text position, use readPosToTextPos() to convert
*/
ulong inverseSA(ulong i) const
{
ulong skip = samplerate - i % samplerate;
ulong j;
if (i / samplerate + 1 >= n / samplerate)
{
j = bwtEndPos;
skip = n - i;
}
else
j = (*positions)[i/samplerate+1];
ulong tmp_rank_c = 0; // Cache rank value of c.
while (skip > 0)
{
int c = alphabetrank->access(j, tmp_rank_c);
if (c == '\0')
{
unsigned endmarkerRank = tmp_rank_c-1;
j = Doc->access(endmarkerRank); // LF-mapping for '\0'
if (j==0)
j = numberOfTexts-1;
else
--j;
}
else
j = C[c]+tmp_rank_c-1;
skip --;
}
return j;
}
// For given suffix i, return corresponding read number and read position.
position_result getPosition(ulong i) const
{
ulong tmp_rank_c = 0; // Cache rank value of c.
ulong dist = 0;
uchar c = alphabetrank->access(i, tmp_rank_c);
while (c != '\0' && !sampled->access(i))
{
i = C[c]+tmp_rank_c-1;
c = alphabetrank->access(i, tmp_rank_c);
++ dist;
}
if (c == '\0')
{
// Look-up from the rank among the end-markers in BWT
return std::make_pair(Doc->access(tmp_rank_c-1), dist);
}
return textPosToReadPos((*suffixes)[sampled->rank1(i)-1] + dist);
}
CGkArray(uchar *, ulong, unsigned, unsigned, ulong, unsigned, bool);
// Index from/to disk
CGkArray(std::string const &);
void save(std::string const &) const;
~CGkArray();
private:
// Return C[c] + rank_c(L, i) for given c and i
inline ulong LF(uchar c, ulong i) const
{
if (C[(int)c+1]-C[(int)c] == 0) // FIXME fix alphabet
return C[(int)c];
return C[(int)c] + alphabetrank->rank(c, i);
}
// Helper method for Q1
position_vector reportReads(ulong sp, ulong ep) const
{
unsigned nreads = Blast->rank1(ep)-Blast->rank1(sp-1);
position_vector pv;
pv.reserve(nreads+1);
while (sp < ep)
{
sp = Blast->next(sp);
pv.push_back(getPosition(sp++));
}
return pv;
}
// Helper method for Q2
inline unsigned countReads(ulong sp, ulong ep) const
{
return Blast->rank1(ep) - Blast->rank1(sp-1);
}
// Required by getSuffix(), assuming DNA alphabet
static const char ALPHABET_SHIFTED[];
static const uchar versionFlag;
ulong n;
unsigned samplerate;
unsigned C[256];
ulong bwtEndPos;
HuffWT *alphabetrank;
static_bitsequence_brw32 * sampled;
static_bitsequence_brw32 * Blast;
static_bitsequence_brw32 * Blcp;
unsigned gk;
BlockArray * suffixes;
BlockArray * positions;
CSA::DeltaVector * textStartPos;
// Total number of texts in the collection
unsigned numberOfTexts;
// Length of the longest text
ulong maxTextLength;
// Array of document id's in the order of end-markers in BWT
ArrayDoc *Doc;
uchar * BWT(uchar *);
void makewavelet(uchar *);
void maketables(bool);
void traverseBWT(uint *);
void traverseBWT(uint *, ulong, ulong, unsigned);
static_bitsequence_brw32 * buildBlcp();
/**
* Count end-markers in given interval
*/
inline unsigned CountEndmarkers(ulong sp, ulong ep) const
{
if (sp > ep)
return 0;
ulong ranksp = 0;
if (sp != 0)
ranksp = alphabetrank->rank(0, sp - 1);
return alphabetrank->rank(0, ep) - ranksp;
}
}; // class CGkArray
#endif