diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/Doxyfile b/Doxyfile old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/Makefile.in b/Makefile.in old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index e8e163f..7b55e87 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ bft build k treshold_compression {kmers|kmers_comp} list_genome_files output_fil bft load file_bft [-add_genomes {kmers|kmers_comp} list_genome_files output_file] [Options] Options: +[-query_sequences threshold list_sequence_files] [-query_kmers {kmers|kmers_comp} list_kmer_files] [-query_branching {kmers|kmers_comp} list_kmer_files] [-extract_kmers {kmers|kmers_comp} kmers_file] @@ -133,6 +134,7 @@ Command **load** loads a BFT from file *file_bft*. ### Options * **-add_genomes** adds the genomes listed in *list_genome_files* to the BFT stored in *file_bft*, the new BFT is written in *output_file* +* **-query_sequences** queries the BFT for the sequences written in the files of *list_sequence_files*. For each file of *list_sequence_files* is output a CSV file: columns are the genomes represented in the BFT, rows are the queried sequences, the intersection of a column and a row is a binary value indicating if the sequence represented by the row is present in the genome represented by the column. Threshold is a float (0 < threshold <= 1) indicating the percentage of *k*-mers from each query sequence that must occur in sample *x* to be reported present in sample *x*. * **-query_kmers** queries the BFT for *k*-mers written in the files of *list_kmer_files*. For each file of *list_kmer_files* is output a CSV file: columns are the genomes represented in the BFT, rows are the queried *k*-mers, the intersection of a column and a row is a binary value indicating if the *k*-mer represented by the row is present in the genome represented by the column. 
* **-query_branching** queries the BFT for the number of *k*-mers written in the files of *list_kmer_files* that are branching in the colored de-Bruijn graph represented by the BFT. * **-extract_kmers** extracts the *k*-mers stored in the BFT and writes them to a *k*-mers file named *kmers_file* (see below for input file types). diff --git a/configure b/configure index a32da7f..6ad4e70 100755 --- a/configure +++ b/configure @@ -650,6 +650,7 @@ infodir docdir oldincludedir includedir +runstatedir localstatedir sharedstatedir sysconfdir @@ -721,6 +722,7 @@ datadir='${datarootdir}' sysconfdir='${prefix}/etc' sharedstatedir='${prefix}/com' localstatedir='${prefix}/var' +runstatedir='${localstatedir}/run' includedir='${prefix}/include' oldincludedir='/usr/include' docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' @@ -973,6 +975,15 @@ do | -silent | --silent | --silen | --sile | --sil) silent=yes ;; + -runstatedir | --runstatedir | --runstatedi | --runstated \ + | --runstate | --runstat | --runsta | --runst | --runs \ + | --run | --ru | --r) + ac_prev=runstatedir ;; + -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \ + | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \ + | --run=* | --ru=* | --r=*) + runstatedir=$ac_optarg ;; + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) ac_prev=sbindir ;; -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ @@ -1110,7 +1121,7 @@ fi for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ datadir sysconfdir sharedstatedir localstatedir includedir \ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir + libdir localedir mandir runstatedir do eval ac_val=\$$ac_var # Remove trailing slashes. 
@@ -1263,6 +1274,7 @@ Fine tuning of the installation directories: --sysconfdir=DIR read-only single-machine data [PREFIX/etc] --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run] --libdir=DIR object code libraries [EPREFIX/lib] --includedir=DIR C header files [PREFIX/include] --oldincludedir=DIR C header files for non-gcc [/usr/include] diff --git a/doc/doxygen/doxygen_sqlite3.db b/doc/doxygen/doxygen_sqlite3.db old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/Node_8h_source.html b/doc/doxygen/html/Node_8h_source.html old mode 100644 new mode 100755 index fa691a9..7502036 --- a/doc/doxygen/html/Node_8h_source.html +++ b/doc/doxygen/html/Node_8h_source.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: include/Node.h Source File @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -88,251 +88,7 @@
Node.h
-
1 #pragma once
-
2 
-
3 /* ===================================================================================================================================
-
4 * INCLUDES AND DEFINES
-
5 * ===================================================================================================================================
-
6 */
-
7 
-
8 #include <stdio.h>
-
9 #include <stdlib.h>
-
10 #include <stdint.h>
-
11 #include <inttypes.h>
-
12 #include <math.h>
-
13 #include <limits.h>
-
14 #include <string.h>
-
15 
-
16 #include <Judy.h>
-
17 
-
18 #include <omp.h>
-
19 
-
20 #include "default_param.h"
-
21 #include "useful_macros.h"
-
22 #include "UC.h"
-
23 #include "xxhash.h"
-
24 
-
25 /* ===================================================================================================================================
-
26 * STRUCTURES DECLARATION
-
27 * ===================================================================================================================================
-
28 */
-
29 
-
30 // info_per_level is a structure which contains pointers on macro used to manipulate the field children_type of a CC.
-
31 // The structure is used in an array, initialized in create_info_per_level()
-
32 typedef struct{
-
33  int nb_ucs_skp;
-
34  int nb_kmers_uc; //Number of prefixes a CC, at a specific level of the tree, can contain.
-
35  int mask_shift_kmer; //Suffixes are encoded in arrays of 8bits cells: mask_shift_kmer covers only the bits used on the last cell
-
36  int size_kmer_in_bytes; //Size of the suffixes represented at a given level of the tree, in bytes
-
37  int size_kmer_in_bytes_minus_1; //Size of the suffixes represented at a given level of the tree, minus the size of the prefixes ps, in bytes
-
38  int nb_bits_per_cell_skip_filter2;
-
39  int nb_bits_per_cell_skip_filter3;
-
40  int nb_bytes_per_cell_skip_filter2;
-
41  int nb_bytes_per_cell_skip_filter3;
-
42  int modulo_hash;
-
43  int tresh_suf_pref;
-
44  int level_min;
-
45  int root;
-
46 } info_per_level;
-
47 
-
48 // A node is a list of containers of two types: Compressed Containers (CC) and Uncompressed Containers (UC).
-
49 // It can contain 0 or more CCs in CC_array, plus always one UC which can be empty (UC_array.substrings == NULL) or not.
-
50 typedef struct {
-
51  void* CC_array;
-
52  UC UC_array;
-
53 } __attribute__ ((__packed__)) Node;
-
54 
-
55 //resultPresence is a structure produced by presenceKmer(). It contains information about the presence of a prefix p into a given node.
-
56 typedef struct{
-
57  void* node;
-
58  void* container; //Ptr to the container (UC or CC) which contain the prefix p or cc->children that contain the substring we are looking for
-
59  void* link_child; //Ptr to the container (Node or uint8_t*) having potentially the suffix linked to the prefix p
-
60 
-
61  int level_node;
-
62  int pos_container;
-
63 
-
64  int bucket; //Position of the array containing the suffixes linked to prefix p in children
-
65  int pos_sub_bucket; // Position (in term of suffix+annotation) of the first suffix linked to prefix p in children[bucket]
-
66 
-
67  int children_type_leaf; //Boolean indicating that container is a leaf of the tree
-
68  int container_is_UC; //Boolean indicating if container is a UC
-
69 
-
70  int posFilter2; //position of p in filter2 (where it is or where it should be) or size of suffix
-
71  int posFilter3; //position of p in filter3 (where it is or where it should be) or position of match in link_child
-
72  int pos_extra_filter3; //position of p in extra_filter3 (where it is or where it should be) or size of suffix
-
73 
-
74  int pos_children;
-
75  int count_children;
-
76  int count_nodes;
-
77 
-
78  uint8_t substring[SIZE_BYTES_SUF_PREF]; // the prefix p
-
79 
-
80  uint8_t presBF; //Boolean indicating if p is said present in the Bloom Filter
-
81  uint8_t presFilter2; //Boolean indicating if p_u is present in the Second Filter
-
82  uint8_t presFilter3; //Boolean indicating if p_v is present in the Third Filter
-
83 
-
84 } resultPresence;
-
85 
-
91 typedef struct {
-
92  char** filenames;
-
94  uint64_t* hash_v;
-
95 
-
96  uint16_t** skip_sp;
-
97 
-
98  annotation_array_elem* comp_set_colors;
-
99 
-
100  info_per_level* info_per_lvl;
-
101 
-
102  annotation_inform* ann_inf;
-
103 
-
104  resultPresence* res;
-
105 
-
106  int k;
-
107  int r1;
-
108  int r2;
- -
110  int length_comp_set_colors;
- -
113  uint8_t compressed;
-
114  uint8_t marked;
-
115 
-
116  Node node;
-
117 } BFT_Root;
-
118 
-
124 typedef struct{
-
125  char* kmer;
-
126  uint8_t* kmer_comp;
-
127  resultPresence* res;
-
128 } BFT_kmer;
-
129 
-
130 /* ===================================================================================================================================
-
131 * INLINE FUNCTIONS DECLARATION
-
132 * ===================================================================================================================================
-
133 */
-
134 
-
135 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2);
-
136 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2);
-
137 inline Node* createNode(void);
-
138 inline void initiateNode(Node* node);
-
139 inline resultPresence* create_resultPresence();
-
140 inline void initialize_resultPresence(resultPresence* res);
-
141 
-
142 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2){
-
143 
-
144  ASSERT_NULL_PTR(root_1,"get_overlap_genomes_ids()\n")
-
145  ASSERT_NULL_PTR(root_2,"get_overlap_genomes_ids()\n")
-
146 
-
147  if (strcmp(root_1->filenames[root_1->nb_genomes-1], root_2->filenames[0]) == 0)
-
148  return true;
-
149 
-
150  return false;
-
151 }
-
152 
-
153 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2){
-
154 
-
155  int j, nb_bits;
-
156  uint32_t nb_hash_v = pow(4, NB_CHAR_SUF_PREF);
-
157 
-
158  uint64_t* hash_v = malloc(nb_hash_v * 2 * sizeof(uint64_t));
-
159  ASSERT_NULL_PTR(hash_v, "create_hash_v_array()")
-
160 
-
161  uint8_t gen_sub[SIZE_BYTES_SUF_PREF];
-
162 
-
163  for (uint32_t i = 0; i < nb_hash_v; i++){
-
164 
-
165  nb_bits = NB_CHAR_SUF_PREF * 2;
-
166 
-
167  for (j = 0; j < SIZE_BYTES_SUF_PREF; j++){
-
168 
-
169  nb_bits -= SIZE_BITS_UINT_8T;
-
170 
-
171  if (nb_bits >= 0) gen_sub[j] = (i >> nb_bits) & 0xff;
-
172  else gen_sub[j] = (i << (-nb_bits)) & 0xff;
-
173  }
-
174 
-
175  hash_v[i * 2] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed1);
-
176  hash_v[i * 2 + 1] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed2);
-
177  }
-
178 
-
179  return hash_v;
-
180 }
-
181 
-
182 /* ---------------------------------------------------------------------------------------------------------------
-
183 * createNode()
-
184 * ---------------------------------------------------------------------------------------------------------------
-
185 * Allocate and initialize a node
-
186 * ---------------------------------------------------------------------------------------------------------------
-
187 * ---------------------------------------------------------------------------------------------------------------
-
188 */
-
189 inline Node* createNode(void){
-
190  Node* node = malloc(sizeof(Node));
-
191  ASSERT_NULL_PTR(node,"createNode()")
-
192 
-
193  node->CC_array = NULL;
-
194  initializeUC(&(node->UC_array));
-
195 
-
196  return node;
-
197 }
-
198 
-
199 /* ---------------------------------------------------------------------------------------------------------------
-
200 * initiateNode(node)
-
201 * ---------------------------------------------------------------------------------------------------------------
-
202 * Initialize a node
-
203 * ---------------------------------------------------------------------------------------------------------------
-
204 * node: a pointer on a Node structure
-
205 * ---------------------------------------------------------------------------------------------------------------
-
206 */
-
207 inline void initiateNode(Node* node){
-
208 
-
209  ASSERT_NULL_PTR(node,"initiateNode()")
-
210 
-
211  node->CC_array = NULL;
-
212  initializeUC(&(node->UC_array));
-
213 
-
214  return;
-
215 }
-
216 
-
217 inline resultPresence* create_resultPresence(){
-
218 
-
219  resultPresence* res = calloc(1,sizeof(resultPresence));
-
220  ASSERT_NULL_PTR(res,"create_resultPresence()")
-
221 
-
222  res->node = NULL;
-
223  res->container = NULL;
-
224  res->link_child = NULL;
-
225  res->posFilter2 = INT_MAX;
-
226  res->posFilter3 = INT_MAX;
-
227  res->pos_extra_filter3 = INT_MAX;
-
228 
-
229  return res;
-
230 }
-
231 
-
232 inline void initialize_resultPresence(resultPresence* res){
-
233 
-
234  ASSERT_NULL_PTR(res,"initialize_resultPresence()")
-
235 
-
236  res->node = NULL;
-
237  res->container = NULL;
-
238  res->link_child = NULL;
-
239  res->pos_container = 0;
-
240  res->level_node = 0;
-
241  res->bucket = 0;
-
242  res->pos_sub_bucket = 0;
-
243  res->presBF = 0;
-
244  res->presFilter2 = 0;
-
245  res->presFilter3 = 0;
-
246  res->count_children = 0;
-
247  res->count_nodes = 0;
-
248  res->pos_children = 0;
-
249  res->children_type_leaf = 0;
-
250  res->container_is_UC = 0;
-
251  res->posFilter2 = INT_MAX;
-
252  res->posFilter3 = INT_MAX;
-
253  res->pos_extra_filter3 = INT_MAX;
-
254 
-
255  return;
-
256 }
-
char ** filenames
Inserted genome file names.
Definition: Node.h:92
+
1 #pragma once
2 
3 /* ===================================================================================================================================
4 * INCLUDES AND DEFINES
5 * ===================================================================================================================================
6 */
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdint.h>
11 #include <inttypes.h>
12 #include <math.h>
13 #include <limits.h>
14 #include <string.h>
15 
16 #include <Judy.h>
17 
18 #include <omp.h>
19 
20 #include "default_param.h"
21 #include "useful_macros.h"
22 #include "UC.h"
23 #include "xxhash.h"
24 
25 /* ===================================================================================================================================
26 * STRUCTURES DECLARATION
27 * ===================================================================================================================================
28 */
29 
30 // info_per_level is a structure which contains pointers on macro used to manipulate the field children_type of a CC.
31 // The structure is used in an array, initialized in create_info_per_level()
32 typedef struct{
33  int nb_ucs_skp;
34  int nb_kmers_uc; //Number of prefixes a CC, at a specific level of the tree, can contain.
35  int mask_shift_kmer; //Suffixes are encoded in arrays of 8bits cells: mask_shift_kmer covers only the bits used on the last cell
36  int size_kmer_in_bytes; //Size of the suffixes represented at a given level of the tree, in bytes
37  int size_kmer_in_bytes_minus_1; //Size of the suffixes represented at a given level of the tree, minus the size of the prefixes ps, in bytes
38  int nb_bits_per_cell_skip_filter2;
39  int nb_bits_per_cell_skip_filter3;
40  int nb_bytes_per_cell_skip_filter2;
41  int nb_bytes_per_cell_skip_filter3;
42  int modulo_hash;
43  int tresh_suf_pref;
44  int level_min;
45  int root;
46 } info_per_level;
47 
48 // A node is a list of containers of two types: Compressed Containers (CC) and Uncompressed Containers (UC).
49 // It can contain 0 or more CCs in CC_array, plus always one UC which can be empty (UC_array.substrings == NULL) or not.
50 typedef struct {
51  void* CC_array;
52  UC UC_array;
53 } __attribute__ ((__packed__)) Node;
54 
55 //resultPresence is a structure produced by presenceKmer(). It contains information about the presence of a prefix p into a given node.
56 typedef struct{
57  void* node;
58  void* container; //Ptr to the container (UC or CC) which contain the prefix p or cc->children that contain the substring we are looking for
59  void* link_child; //Ptr to the container (Node or uint8_t*) having potentially the suffix linked to the prefix p
60 
61  int level_node;
62  int pos_container;
63 
64  int bucket; //Position of the array containing the suffixes linked to prefix p in children
65  int pos_sub_bucket; // Position (in term of suffix+annotation) of the first suffix linked to prefix p in children[bucket]
66 
67  int children_type_leaf; //Boolean indicating that container is a leaf of the tree
68  int container_is_UC; //Boolean indicating if container is a UC
69 
70  int posFilter2; //position of p in filter2 (where it is or where it should be) or size of suffix
71  int posFilter3; //position of p in filter3 (where it is or where it should be) or position of match in link_child
72  int pos_extra_filter3; //position of p in extra_filter3 (where it is or where it should be) or size of suffix
73 
74  int pos_children;
75  int count_children;
76  int count_nodes;
77 
78  uint8_t substring[SIZE_BYTES_SUF_PREF]; // the prefix p
79 
80  uint8_t presBF; //Boolean indicating if p is said present in the Bloom Filter
81  uint8_t presFilter2; //Boolean indicating if p_u is present in the Second Filter
82  uint8_t presFilter3; //Boolean indicating if p_v is present in the Third Filter
83 
84 } resultPresence;
85 
91 typedef struct {
92  char** filenames;
94  uint64_t* hash_v;
95 
96  uint16_t** skip_sp;
97 
98  annotation_array_elem* comp_set_colors;
99 
100  info_per_level* info_per_lvl;
101 
102  annotation_inform* ann_inf;
103 
104  resultPresence* res;
105 
106  int k;
107  int r1;
108  int r2;
110  int length_comp_set_colors;
113  uint8_t compressed;
114  uint8_t marked;
115 
116  Node node;
117 } BFT_Root;
118 
124 typedef struct{
125  char* kmer;
126  uint8_t* kmer_comp;
127  resultPresence* res;
128 } BFT_kmer;
129 
130 /* ===================================================================================================================================
131 * INLINE FUNCTIONS DECLARATION
132 * ===================================================================================================================================
133 */
134 
135 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2);
136 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2);
137 inline Node* createNode(void);
138 inline void initiateNode(Node* node);
139 inline resultPresence* create_resultPresence();
140 inline void initialize_resultPresence(resultPresence* res);
141 
142 inline bool are_genomes_ids_overlapping(BFT_Root* root_1, BFT_Root* root_2){
143 
144  ASSERT_NULL_PTR(root_1,"get_overlap_genomes_ids()\n")
145  ASSERT_NULL_PTR(root_2,"get_overlap_genomes_ids()\n")
146 
147  if (strcmp(root_1->filenames[root_1->nb_genomes-1], root_2->filenames[0]) == 0)
148  return true;
149 
150  return false;
151 }
152 
153 inline uint64_t* create_hash_v_array(int rand_seed1, int rand_seed2){
154 
155  int j, nb_bits;
156  uint32_t nb_hash_v = pow(4, NB_CHAR_SUF_PREF);
157 
158  uint64_t* hash_v = malloc(nb_hash_v * 2 * sizeof(uint64_t));
159  ASSERT_NULL_PTR(hash_v, "create_hash_v_array()")
160 
161  uint8_t gen_sub[SIZE_BYTES_SUF_PREF];
162 
163  for (uint32_t i = 0; i < nb_hash_v; i++){
164 
165  nb_bits = NB_CHAR_SUF_PREF * 2;
166 
167  for (j = 0; j < SIZE_BYTES_SUF_PREF; j++){
168 
169  nb_bits -= SIZE_BITS_UINT_8T;
170 
171  if (nb_bits >= 0) gen_sub[j] = (i >> nb_bits) & 0xff;
172  else gen_sub[j] = (i << (-nb_bits)) & 0xff;
173  }
174 
175  hash_v[i * 2] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed1);
176  hash_v[i * 2 + 1] = XXH64(gen_sub, SIZE_BYTES_SUF_PREF, rand_seed2);
177  }
178 
179  return hash_v;
180 }
181 
182 /* ---------------------------------------------------------------------------------------------------------------
183 * createNode()
184 * ---------------------------------------------------------------------------------------------------------------
185 * Allocate and initialize a node
186 * ---------------------------------------------------------------------------------------------------------------
187 * ---------------------------------------------------------------------------------------------------------------
188 */
189 inline Node* createNode(void){
190  Node* node = malloc(sizeof(Node));
191  ASSERT_NULL_PTR(node,"createNode()")
192 
193  node->CC_array = NULL;
194  initializeUC(&(node->UC_array));
195 
196  return node;
197 }
198 
199 /* ---------------------------------------------------------------------------------------------------------------
200 * initiateNode(node)
201 * ---------------------------------------------------------------------------------------------------------------
202 * Initialize a node
203 * ---------------------------------------------------------------------------------------------------------------
204 * node: a pointer on a Node structure
205 * ---------------------------------------------------------------------------------------------------------------
206 */
207 inline void initiateNode(Node* node){
208 
209  ASSERT_NULL_PTR(node,"initiateNode()")
210 
211  node->CC_array = NULL;
212  initializeUC(&(node->UC_array));
213 
214  return;
215 }
216 
217 inline resultPresence* create_resultPresence(){
218 
219  resultPresence* res = calloc(1,sizeof(resultPresence));
220  ASSERT_NULL_PTR(res,"create_resultPresence()")
221 
222  res->node = NULL;
223  res->container = NULL;
224  res->link_child = NULL;
225  res->posFilter2 = INT_MAX;
226  res->posFilter3 = INT_MAX;
227  res->pos_extra_filter3 = INT_MAX;
228 
229  return res;
230 }
231 
232 inline void initialize_resultPresence(resultPresence* res){
233 
234  ASSERT_NULL_PTR(res,"initialize_resultPresence()")
235 
236  res->node = NULL;
237  res->container = NULL;
238  res->link_child = NULL;
239  res->pos_container = 0;
240  res->level_node = 0;
241  res->bucket = 0;
242  res->pos_sub_bucket = 0;
243  res->presBF = 0;
244  res->presFilter2 = 0;
245  res->presFilter3 = 0;
246  res->count_children = 0;
247  res->count_nodes = 0;
248  res->pos_children = 0;
249  res->children_type_leaf = 0;
250  res->container_is_UC = 0;
251  res->posFilter2 = INT_MAX;
252  res->posFilter3 = INT_MAX;
253  res->pos_extra_filter3 = INT_MAX;
254 
255  return;
256 }
char ** filenames
Inserted genome file names.
Definition: Node.h:92
int treshold_compression
Color compression is triggered each time BFT_Root::treshold_compression genomes have been inserted.
Definition: Node.h:111
int nb_genomes
Number of genomes inserted.
Definition: Node.h:109
resultPresence * res
Contains information about the location of BFT_kmer::kmer in a BFT_Root.
Definition: Node.h:127
@@ -344,9 +100,9 @@
diff --git a/doc/doxygen/html/annotated.html b/doc/doxygen/html/annotated.html old mode 100644 new mode 100755 index d0b0a58..57010b2 --- a/doc/doxygen/html/annotated.html +++ b/doc/doxygen/html/annotated.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: Data Structures @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -95,9 +95,9 @@ diff --git a/doc/doxygen/html/arrowdown.png b/doc/doxygen/html/arrowdown.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/arrowright.png b/doc/doxygen/html/arrowright.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/bc_s.png b/doc/doxygen/html/bc_s.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/bdwn.png b/doc/doxygen/html/bdwn.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/bft_8h.html b/doc/doxygen/html/bft_8h.html old mode 100644 new mode 100755 index 933071a..1c6fdbf --- a/doc/doxygen/html/bft_8h.html +++ b/doc/doxygen/html/bft_8h.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: include/bft.h File Reference @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -192,12 +192,6 @@ void free_BFT_kmer_content (BFT_kmer *bft_kmer, int nb_bft_kmer)  Function freeing the content of allocated BFT_kmers. More...
  -BFT_kmerget_kmer (const char *kmer, BFT *bft) - Function searching for a k-mer in a BFT. More...
-  -bool is_kmer_in_cdbg (BFT_kmer *bft_kmer) - Function testing if a k-mer is in a BFT. More...
-  void extract_kmers_to_disk (BFT *bft, char *filename_output, bool compressed_output)  Function extracting the k-mers of a BFT in a file. More...
  @@ -207,6 +201,24 @@ size_t write_kmer_comp_to_disk (BFT_kmer *bft_kmer, BFT *bft, va_list args)  Function writing an 2 bits encoded k-mer in a file. More...
  +
Query functions
+

These functions query for k-mers or sequences.

+
+BFT_kmerget_kmer (const char *kmer, BFT *bft) + Function searching for a k-mer in a BFT. More...
+  +bool is_kmer_in_cdbg (BFT_kmer *bft_kmer) + Function testing if a k-mer is in a BFT. More...
+  +uint32_t * query_sequence (BFT *bft, char *sequence, double threshold) + Function querying a BFT for a sequence. More...
+  +
Pattern matching functions
+

These functions provide pattern matching functionalities over the k-mers or paths of a colored de Bruijn graph stored as a BFT.

+
+bool prefix_matching (BFT *bft, char *prefix, BFT_func_ptr f,...) + Function for prefix matching over the k-mers of a BFT. More...
Marking functions

These functions allow to mark k-mers of a colored de Bruijn graph with flags.

@@ -249,12 +261,6 @@ void v_iterate_over_kmers (BFT *bft, BFT_func_ptr f, va_list args)  Function iterating over the k-mers of a BFT. More...
  -
Pattern matching functions
-

These functions provide pattern matching functionalities over the k-mers or paths of a colored de Bruijn graph stored as a BFT.

-
-bool prefix_matching (BFT *bft, char *prefix, BFT_func_ptr f,...) - Function for prefix matching over the k-mers of a BFT. More...
Disk I/O functions

These functions write and load a BFT from disk.

@@ -1224,6 +1230,49 @@

Function Documentation

Returns
a boolean indicating the presence (true) or absence (false) of the k-mer in the genome.
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + +
uint32_t* query_sequence (BFTbft,
char * sequence,
double threshold 
)
+
+ +

Function querying a BFT for a sequence.

+
Parameters
+ + + + +
bft is a BFT to be queried.
sequence is a string to query.
threshold is a float (0 < threshold <= 1) indicating the minimum percentage of k-mers from the queried sequence that must be present in a genome to have the queried sequence reported present in this genome.
+
+
+
Returns
a pointer to a sorted array of genome identifiers in which the queried sequence occurs (according to parameter threshold) or NULL if the queried sequence is not present in at least one genome (according to parameter threshold). The first element of the array (position 0) indicates how many ids are in this array.
+
@@ -1621,9 +1670,9 @@

Function Documentation

diff --git a/doc/doxygen/html/bft_8h_source.html b/doc/doxygen/html/bft_8h_source.html old mode 100644 new mode 100755 index cf4a907..0340473 --- a/doc/doxygen/html/bft_8h_source.html +++ b/doc/doxygen/html/bft_8h_source.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: include/bft.h Source File @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -88,107 +88,7 @@
bft.h
-Go to the documentation of this file.
1 
-
6 #pragma once
-
7 
-
8 #include <stdio.h>
-
9 #include <stdlib.h>
-
10 #include <stdint.h>
-
11 
-
12 #include <stdarg.h>
-
13 #include <stdbool.h>
-
14 
-
15 //#include "UC_annotation.h"
-
16 #include "insertNode.h"
-
17 #include "branchingNode.h"
-
18 #include "fasta.h"
-
19 #include "marking.h"
-
20 #include "replaceAnnotation.h"
-
21 #include "write_to_disk.h"
-
22 #include "extract_kmers.h"
-
23 #include "printMemory.h"
-
24 #include "file_io.h"
-
25 #include "useful_macros.h"
-
26 
-
28 typedef BFT_Root BFT;
-
29 
-
33 typedef struct{
-
34  uint8_t* annot;
-
35  uint8_t* annot_ext;
-
36  uint8_t* annot_cplx;
-
37 
-
38  int size_annot;
-
39  int size_annot_cplx;
-
40 
-
41  uint8_t from_BFT;
- -
43 
-
51 typedef size_t (*BFT_func_ptr) (BFT_kmer* bft_kmer, BFT* bft, va_list args);
-
52 
-
53 inline uint8_t intersection_annots(const uint8_t a, const uint8_t b);
-
54 inline uint8_t union_annots(const uint8_t a, const uint8_t b);
-
55 inline uint8_t sym_difference_annots(const uint8_t a, const uint8_t b);
-
56 
-
61 BFT* create_cdbg(int k, int treshold_compression);
-
63 void free_cdbg(BFT* bft);
-
65 
-
70 void insert_genomes_from_files(int nb_files, char** paths, BFT* bft, char* prefix_bft_filename);
-
72 void insert_kmers_new_genome(int nb_kmers, char** kmers, char* genome_name, BFT* bft);
-
73 void insert_kmers_last_genome(int nb_kmers, char** kmers, BFT* bft);
-
75 
-
80 BFT_kmer* create_kmer(const char* kmer, int k);
- -
83 void free_BFT_kmer(BFT_kmer* bft_kmer, int nb_bft_kmer);
-
84 void free_BFT_kmer_content(BFT_kmer* bft_kmer, int nb_bft_kmer);
-
85 BFT_kmer* get_kmer(const char* kmer, BFT* bft);
-
86 bool is_kmer_in_cdbg(BFT_kmer* bft_kmer);
-
87 void extract_kmers_to_disk(BFT* bft, char* filename_output, bool compressed_output);
-
88 size_t write_kmer_ascii_to_disk(BFT_kmer* bft_kmer, BFT* bft, va_list args);
-
89 size_t write_kmer_comp_to_disk(BFT_kmer* bft_kmer, BFT* bft, va_list args);
-
91 
- -
98 void free_BFT_annotation(BFT_annotation* bft_annot);
- -
100 bool presence_genome(uint32_t id_genome, BFT_annotation* bft_annot, BFT* bft);
-
101 
-
102 inline uint8_t intersection_annots(const uint8_t a, const uint8_t b){
-
103  return a & b;
-
104 }
-
105 
-
106 inline uint8_t union_annots(const uint8_t a, const uint8_t b){
-
107  return a | b;
-
108 }
-
109 
-
110 inline uint8_t sym_difference_annots(const uint8_t a, const uint8_t b){
-
111  return a ^ b;
-
112 }
-
113 
-
114 BFT_annotation* intersection_annotations(BFT* bft, uint32_t nb_annotations, ... );
-
115 BFT_annotation* union_annotations(BFT* bft, uint32_t nb_annotations, ... );
-
116 BFT_annotation* sym_difference_annotations(BFT* bft, uint32_t nb_annotations, ... );
-
117 uint32_t* get_list_id_genomes(BFT_annotation* bft_annot, BFT* bft);
-
118 uint32_t get_count_id_genomes(BFT_annotation* bft_annot, BFT* bft);
-
119 uint32_t* intersection_list_id_genomes(uint32_t* list_a, uint32_t* list_b);
-
121 
-
126 void set_marking(BFT* bft);
-
128 void unset_marking(BFT* bft);
-
129 void set_flag_kmer(uint8_t flag, BFT_kmer* bft_kmer, BFT* bft);
-
130 uint8_t get_flag_kmer(BFT_kmer* bft_kmer, BFT* bft);
-
132 
-
137 void set_neighbors_traversal(BFT* bft);
-
139 void unset_neighbors_traversal(BFT* bft);
-
140 BFT_kmer* get_neighbors(BFT_kmer* bft_kmer, BFT* bft);
-
141 BFT_kmer* get_predecessors(BFT_kmer* bft_kmer, BFT* bft);
-
142 BFT_kmer* get_successors(BFT_kmer* bft_kmer, BFT* bft);
-
144 
-
149 void iterate_over_kmers(BFT* bft, BFT_func_ptr f, ... );
-
151 void v_iterate_over_kmers(BFT* bft, BFT_func_ptr f, va_list args);
-
153 
-
158 bool prefix_matching(BFT* bft, char* prefix, BFT_func_ptr f, ...);
-
161 
-
166 void write_BFT(BFT* bft, char* filename, bool compress_annotations);
-
168 BFT* load_BFT(char* filename);
-
BFT_annotation * intersection_annotations(BFT *bft, uint32_t nb_annotations,...)
Function computing the intersection of a set of annotations.
Definition: bft.c:421
+Go to the documentation of this file.
1 
6 #pragma once
7 
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <stdint.h>
11 
12 #include <stdarg.h>
13 #include <stdbool.h>
14 
15 #include "intersection.h"
16 #include "insertNode.h"
17 #include "branchingNode.h"
18 #include "fasta.h"
19 #include "marking.h"
20 #include "replaceAnnotation.h"
21 #include "write_to_disk.h"
22 #include "extract_kmers.h"
23 #include "printMemory.h"
24 #include "file_io.h"
25 #include "useful_macros.h"
26 
28 typedef BFT_Root BFT;
29 
33 typedef struct{
34  uint8_t* annot;
35  uint8_t* annot_ext;
36  uint8_t* annot_cplx;
37 
38  int size_annot;
39  int size_annot_cplx;
40 
41  uint8_t from_BFT;
43 
51 typedef size_t (*BFT_func_ptr) (BFT_kmer* bft_kmer, BFT* bft, va_list args);
52 
53 inline uint8_t intersection_annots(const uint8_t a, const uint8_t b);
54 inline uint8_t union_annots(const uint8_t a, const uint8_t b);
55 inline uint8_t sym_difference_annots(const uint8_t a, const uint8_t b);
56 
61 BFT* create_cdbg(int k, int treshold_compression);
63 void free_cdbg(BFT* bft);
65 
70 void insert_genomes_from_files(int nb_files, char** paths, BFT* bft, char* prefix_bft_filename);
72 void insert_kmers_new_genome(int nb_kmers, char** kmers, char* genome_name, BFT* bft);
73 void insert_kmers_last_genome(int nb_kmers, char** kmers, BFT* bft);
75 
80 BFT_kmer* create_kmer(const char* kmer, int k);
83 void free_BFT_kmer(BFT_kmer* bft_kmer, int nb_bft_kmer);
84 void free_BFT_kmer_content(BFT_kmer* bft_kmer, int nb_bft_kmer);
85 void extract_kmers_to_disk(BFT* bft, char* filename_output, bool compressed_output);
86 size_t write_kmer_ascii_to_disk(BFT_kmer* bft_kmer, BFT* bft, va_list args);
87 size_t write_kmer_comp_to_disk(BFT_kmer* bft_kmer, BFT* bft, va_list args);
89 
96 void free_BFT_annotation(BFT_annotation* bft_annot);
98 bool presence_genome(uint32_t id_genome, BFT_annotation* bft_annot, BFT* bft);
99 
100 inline uint8_t intersection_annots(const uint8_t a, const uint8_t b){
101  return a & b;
102 }
103 
104 inline uint8_t union_annots(const uint8_t a, const uint8_t b){
105  return a | b;
106 }
107 
108 inline uint8_t sym_difference_annots(const uint8_t a, const uint8_t b){
109  return a ^ b;
110 }
111 
112 BFT_annotation* intersection_annotations(BFT* bft, uint32_t nb_annotations, ... );
113 BFT_annotation* union_annotations(BFT* bft, uint32_t nb_annotations, ... );
114 BFT_annotation* sym_difference_annotations(BFT* bft, uint32_t nb_annotations, ... );
115 uint32_t* get_list_id_genomes(BFT_annotation* bft_annot, BFT* bft);
116 uint32_t get_count_id_genomes(BFT_annotation* bft_annot, BFT* bft);
117 uint32_t* intersection_list_id_genomes(uint32_t* list_a, uint32_t* list_b);
119 
124 BFT_kmer* get_kmer(const char* kmer, BFT* bft);
126 bool is_kmer_in_cdbg(BFT_kmer* bft_kmer);
127 uint32_t* query_sequence(BFT* bft, char* sequence, double threshold);
129 
134 bool prefix_matching(BFT* bft, char* prefix, BFT_func_ptr f, ...);
137 
142 void set_marking(BFT* bft);
144 void unset_marking(BFT* bft);
145 void set_flag_kmer(uint8_t flag, BFT_kmer* bft_kmer, BFT* bft);
146 uint8_t get_flag_kmer(BFT_kmer* bft_kmer, BFT* bft);
148 
153 void set_neighbors_traversal(BFT* bft);
155 void unset_neighbors_traversal(BFT* bft);
156 BFT_kmer* get_neighbors(BFT_kmer* bft_kmer, BFT* bft);
157 BFT_kmer* get_predecessors(BFT_kmer* bft_kmer, BFT* bft);
158 BFT_kmer* get_successors(BFT_kmer* bft_kmer, BFT* bft);
160 
165 void iterate_over_kmers(BFT* bft, BFT_func_ptr f, ... );
167 void v_iterate_over_kmers(BFT* bft, BFT_func_ptr f, va_list args);
169 
174 void write_BFT(BFT* bft, char* filename, bool compress_annotations);
176 BFT* load_BFT(char* filename);
BFT_annotation * intersection_annotations(BFT *bft, uint32_t nb_annotations,...)
Function computing the intersection of a set of annotations.
Definition: bft.c:421
void iterate_over_kmers(BFT *bft, BFT_func_ptr f,...)
Function iterating over the k-mers of a BFT.
Definition: bft.c:1049
bool presence_genome(uint32_t id_genome, BFT_annotation *bft_annot, BFT *bft)
Function testing if a k-mer occurred in a genome.
Definition: bft.c:395
BFT_Root BFT
Root vertex of a BFT.
Definition: bft.h:28
@@ -227,15 +127,16 @@
void unset_neighbors_traversal(BFT *bft)
Function unlocking a locked graph for traversal.
Definition: bft.c:784
BFT * create_cdbg(int k, int treshold_compression)
Function creating a colored de Bruijn graph stored in a BFT.
Definition: bft.c:12
BFT_kmer * get_predecessors(BFT_kmer *bft_kmer, BFT *bft)
Function extracting the predecessors of a k-mer.
Definition: bft.c:891
+
uint32_t * query_sequence(BFT *bft, char *sequence, double threshold)
Function querying a BFT for a sequence.
Definition: bft.c:1237
BFT_annotation * union_annotations(BFT *bft, uint32_t nb_annotations,...)
Function computing the union of a set of annotations.
Definition: bft.c:488
BFT_kmer * get_kmer(const char *kmer, BFT *bft)
Function searching for a k-mer in a BFT.
Definition: bft.c:216
Root vertex of a BFT.
Definition: Node.h:91
diff --git a/doc/doxygen/html/classes.html b/doc/doxygen/html/classes.html old mode 100644 new mode 100755 index b3d017c..e5f2250 --- a/doc/doxygen/html/classes.html +++ b/doc/doxygen/html/classes.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: Data Structure Index @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -86,7 +86,7 @@
B
- +
@@ -97,9 +97,9 @@ diff --git a/doc/doxygen/html/closed.png b/doc/doxygen/html/closed.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html b/doc/doxygen/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html old mode 100644 new mode 100755 index 3ef6ef6..e37a7e2 --- a/doc/doxygen/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html +++ b/doc/doxygen/html/dir_68267d1309a1af8e8297ef4c3efbcdba.html @@ -3,7 +3,7 @@ - +Bloom Filter Trie: src Directory Reference @@ -22,7 +22,7 @@
  B  
BFT_kmer   BFT_Root   
- @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -39,7 +39,7 @@ +

Files

-file  bft.c -  -file  snippets.c
diff --git a/doc/doxygen/html/dir_d44c64559bbebec7f509842c48db8b23.html b/doc/doxygen/html/dir_d44c64559bbebec7f509842c48db8b23.html old mode 100644 new mode 100755 index 2b14007..f5e63fa --- a/doc/doxygen/html/dir_d44c64559bbebec7f509842c48db8b23.html +++ b/doc/doxygen/html/dir_d44c64559bbebec7f509842c48db8b23.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: include Directory Reference @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -39,7 +39,7 @@ +
file  bft.h [code]  Interface containing all functions to use a BFT.
  -file  Node.h [code] -  file  snippets.h [code]  Code snippets using a BFT.
  @@ -97,9 +101,9 @@
diff --git a/doc/doxygen/html/doc.png b/doc/doxygen/html/doc.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/doxygen.css b/doc/doxygen/html/doxygen.css old mode 100644 new mode 100755 index a000833..1425ec5 --- a/doc/doxygen/html/doxygen.css +++ b/doc/doxygen/html/doxygen.css @@ -1,4 +1,4 @@ -/* The standard CSS for doxygen 1.8.9.1 */ +/* The standard CSS for doxygen 1.8.11 */ body, table, div, p, dl { font: 400 14px/22px Roboto,sans-serif; @@ -206,6 +206,11 @@ div.line { transition-duration: 0.5s; } +div.line:after { + content:"\000A"; + white-space: pre; +} + div.line.glow { background-color: cyan; box-shadow: 0 0 10px cyan; @@ -242,7 +247,7 @@ div.ah, span.ah { -webkit-box-shadow: 2px 2px 3px #999; -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px; background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444)); - background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000); + background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000 110%); } div.classindex ul { @@ -832,6 +837,10 @@ address { color: #2A3D61; } +table.doxtable caption { + caption-side: top; +} + table.doxtable { border-collapse:collapse; margin-top: 4px; @@ -997,6 +1006,18 @@ div.summary a white-space: nowrap; } +table.classindex +{ + margin: 10px; + white-space: nowrap; + margin-left: 3%; + margin-right: 3%; + width: 94%; + border: 0; + border-spacing: 0; + padding: 0; +} + div.ingroups { font-size: 8pt; @@ -1108,6 +1129,11 @@ dl.section dd { border: 0px none; } +#projectalign +{ + vertical-align: middle; +} + #projectname { font: 300% Tahoma, Arial,sans-serif; @@ -1191,7 +1217,7 @@ div.toc { border-radius: 7px 7px 7px 7px; float: right; height: auto; - margin: 0 20px 10px 10px; + margin: 0 8px 10px 10px; width: 200px; } diff --git a/doc/doxygen/html/doxygen.png b/doc/doxygen/html/doxygen.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/dynsections.js b/doc/doxygen/html/dynsections.js 
old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/files.html b/doc/doxygen/html/files.html old mode 100644 new mode 100755 index 55a1caf..c1ec95a --- a/doc/doxygen/html/files.html +++ b/doc/doxygen/html/files.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: File List @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -95,9 +95,9 @@ diff --git a/doc/doxygen/html/folderclosed.png b/doc/doxygen/html/folderclosed.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/folderopen.png b/doc/doxygen/html/folderopen.png old mode 100644 new mode 100755 diff --git a/doc/doxygen/html/functions.html b/doc/doxygen/html/functions.html old mode 100644 new mode 100755 index 7c04ec3..e442ed0 --- a/doc/doxygen/html/functions.html +++ b/doc/doxygen/html/functions.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: Data Fields @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -113,9 +113,9 @@ diff --git a/doc/doxygen/html/functions_vars.html b/doc/doxygen/html/functions_vars.html old mode 100644 new mode 100755 index 7a3431c..5588481 --- a/doc/doxygen/html/functions_vars.html +++ b/doc/doxygen/html/functions_vars.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: Data Fields - Variables @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -113,9 +113,9 @@ diff --git a/doc/doxygen/html/globals.html b/doc/doxygen/html/globals.html old mode 100644 new mode 100755 index af3b885..e03ed76 --- a/doc/doxygen/html/globals.html +++ b/doc/doxygen/html/globals.html @@ -3,7 +3,7 @@ - + Bloom Filter Trie: Globals @@ -22,7 +22,7 @@ - @@ -31,7 +31,7 @@
+
Bloom Filter Trie
- + @@ -84,6 +84,7 @@
  • l
  • n
  • p
  • +
  • q
  • s
  • u
  • v
  • @@ -280,6 +281,13 @@

    - p -

    +

    - q -

    + +

    - s -