Add support for large maps/sets (more than 2^32 - 1 elements) through…

… an extra class template parameter (#18)
Tessil · Jul 31, 2018 · 9642242 · 9642242
1 parent cb3564e
commit 9642242
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ The values are stored contiguously in an underlying structure, no holes in-betwe
 
 To resolve collisions on hashes, the library uses robin hood probing with backward shift deletion.
 
-The library provides a behaviour similar to a `std::deque/std::vector` with unique values but with an average time complexity of O(1) for lookups and an amortised time complexity of O(1) for insertions. This comes at the price of a little higher memory footprint (8 bytes per bucket).
+The library provides a behaviour similar to a `std::deque/std::vector` with unique values but with an average time complexity of O(1) for lookups and an amortised time complexity of O(1) for insertions. This comes at the price of a little higher memory footprint (8 bytes per bucket by default).
 
 Two classes are provided: `tsl::ordered_map` and `tsl::ordered_set`.
 
@@ -38,7 +38,7 @@ for(auto it = map.begin(); it != map.end(); ++it) {
     it.value() = 2; // Ok
 }
 ```
-- The map can only hold up to 2<sup>32</sup> - 1 values, that is 4 294 967 295 values.
+- By default the map can only hold up to 2<sup>32</sup> - 1 values, that is 4 294 967 295 values. This can be raised through the `IndexType` class template parameter; see the [API](https://tessil.github.io/ordered-map/classtsl_1_1ordered__map.html#details) for details.
 - No support for some bucket related methods (like bucket_size, bucket, ...).
 
 

diff --git a/tests/ordered_map_tests.cpp b/tests/ordered_map_tests.cpp
@@ -759,6 +759,30 @@ BOOST_AUTO_TEST_CASE(test_modify_value) {
     }
 }
 
+/**
+ * max_size: the limit must fit in IndexType while exceeding the range of the next smaller index type.
+ */
+BOOST_AUTO_TEST_CASE(test_max_size) {
+    // TODO: will not build on systems where sizeof(std::size_t) < sizeof(std::uint32_t) (std::uint32_t is then too wide an IndexType).
+    tsl::ordered_map<int, int, std::hash<int>, std::equal_to<int>, 
+                     std::allocator<std::pair<int, int>>, std::vector<std::pair<int, int>>, std::uint16_t> map;
+    BOOST_CHECK(map.max_size() <= std::numeric_limits<std::uint16_t>::max());
+    BOOST_CHECK(map.max_size() > std::numeric_limits<std::uint8_t>::max());
+
+
+    tsl::ordered_map<int, int, std::hash<int>, std::equal_to<int>, 
+                     std::allocator<std::pair<int, int>>, std::vector<std::pair<int, int>>, std::uint32_t> map2;
+    BOOST_CHECK(map2.max_size() <= std::numeric_limits<std::uint32_t>::max());
+    BOOST_CHECK(map2.max_size() > std::numeric_limits<std::uint16_t>::max());
+
+
+    using max_size_type = std::conditional<sizeof(std::size_t) == sizeof(std::uint64_t), std::uint64_t, std::uint32_t>::type;
+    using min_size_type = std::conditional<sizeof(std::size_t) == sizeof(std::uint64_t), std::uint32_t, std::uint16_t>::type;
+    tsl::ordered_map<int, int, std::hash<int>, std::equal_to<int>, 
+                     std::allocator<std::pair<int, int>>, std::vector<std::pair<int, int>>, max_size_type> map3;
+    BOOST_CHECK(map3.max_size() <= std::numeric_limits<max_size_type>::max());
+    BOOST_CHECK(map3.max_size() > std::numeric_limits<min_size_type>::max());
+}
 
 /**
  * constructor

diff --git a/tsl/ordered_hash.h b/tsl/ordered_hash.h
@@ -114,17 +114,24 @@ struct is_vector<T, typename std::enable_if<
 
 
 /**
- * Each bucket entry stores a 32-bits index which is the index in m_values corresponding to the bucket's value 
- * and a 32 bits hash (truncated if the original was 64-bits) corresponding to the hash of the value.
+ * Each bucket entry stores an index which is the index in m_values corresponding to the bucket's value 
+ * and a hash (which may be truncated to 32 bits depending on IndexType) corresponding to the hash of the value.
  * 
- * The 32-bit index limits the size of the map to 2^32 - 1 elements (-1 due to a reserved value used to mark a
- * bucket as empty).
+ * The size of IndexType limits the size of the hash table to std::numeric_limits<IndexType>::max() - 1 elements (-1 due to 
+ * a reserved value used to mark a bucket as empty).
  */
+template<class IndexType>
 class bucket_entry {
-public:
-    using index_type = std::uint_least32_t;
-    using truncated_hash_type = std::uint_least32_t;
+    static_assert(std::is_unsigned<IndexType>::value, "IndexType must be an unsigned value.");
+    static_assert(std::numeric_limits<IndexType>::max() <= std::numeric_limits<std::size_t>::max(), 
+                  "std::numeric_limits<IndexType>::max() must be <= std::numeric_limits<std::size_t>::max().");
 
+public:
+    using index_type = IndexType;
+    using truncated_hash_type = typename std::conditional<std::numeric_limits<IndexType>::max() <= 
+                                                          std::numeric_limits<std::uint_least32_t>::max(),
+                                                              std::uint_least32_t, 
+                                                              std::size_t>::type;
 
     bucket_entry() noexcept: m_index(EMPTY_MARKER_INDEX), m_hash(0) {
     }
@@ -174,7 +181,7 @@ class bucket_entry {
     }
 
     static std::size_t max_size() noexcept {
-        return std::numeric_limits<index_type>::max() - NB_RESERVED_INDEXES;
+        return static_cast<std::size_t>(std::numeric_limits<index_type>::max()) - NB_RESERVED_INDEXES; // convert to std::size_t before subtracting the reserved indexes
     }
 
 private:
@@ -218,7 +225,8 @@ template<class ValueType,
          class Hash,
          class KeyEqual,
          class Allocator,
-         class ValueTypeContainer>
+         class ValueTypeContainer,
+         class IndexType>
 class ordered_hash: private Hash, private KeyEqual {
 private:
     template<typename U>
@@ -356,6 +364,8 @@ class ordered_hash: private Hash, private KeyEqual {
 
 
 private:
+    using bucket_entry = tsl::detail_ordered_hash::bucket_entry<IndexType>;
+
     using buckets_container_allocator = typename 
                             std::allocator_traits<allocator_type>::template rebind_alloc<bucket_entry>; 
 
@@ -1155,10 +1165,10 @@ class ordered_hash: private Hash, private KeyEqual {
 
         /*
          * m_values.erase shifted all the values on the right of the erased value, 
-         * shift the indexes by 1 in the buckets array for these values.
+         * shift the indexes by -1 in the buckets array for these values.
          */
         if(it_bucket->index() != m_values.size()) {
-            shift_indexes_in_buckets(it_bucket->index(), short(1));
+            shift_indexes_in_buckets(it_bucket->index(), char(-1));
         }        
 
         // Mark the bucket as empty and do a backward shift of the values on the right
@@ -1168,21 +1178,22 @@ class ordered_hash: private Hash, private KeyEqual {
 
     /**
      * Go through each value from [from_ivalue, m_values.size()) in m_values and for each
-     * bucket corresponding to the value, shift the indexes to the left by delta.
+     * bucket corresponding to the value, shift the index by delta (which must be char(1) or char(-1)).
      */
-    void shift_indexes_in_buckets(index_type from_ivalue, short delta) noexcept  {
-        static_assert(std::is_unsigned<index_type>::value && sizeof(index_type) >= sizeof(short), 
-                      "index_type should be unsigned and sizeof(index_type) >= sizeof(short)");
+    void shift_indexes_in_buckets(index_type from_ivalue, char delta) noexcept  {
+        tsl_assert(delta == char(1) || delta == char(-1));
 
         for(std::size_t ivalue = from_ivalue; ivalue < m_values.size(); ivalue++) {
-            std::size_t ibucket = bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
+            // m_values[ivalue] was at index ivalue - delta before the shift. Compare delta against char literals
+            // instead of doing arithmetic with it: plain char may be unsigned, in which case char(-1) is 255.
+            const index_type old_index = static_cast<index_type>(delta == char(1) ? ivalue - 1 : ivalue + 1);
 
-            // Modulo arithmetic, we should be alright for index_type(ivalue + delta). TODO further checks
-            while(m_buckets[ibucket].index() != index_type(ivalue + delta)) {
+            std::size_t ibucket = bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
+            while(m_buckets[ibucket].index() != old_index) {
                 ibucket = next_bucket(ibucket);
             }
 
-            m_buckets[ibucket].set_index(index_type(m_buckets[ibucket].index() - delta));
+            m_buckets[ibucket].set_index(index_type(ivalue));
         }
     }
 
@@ -1289,7 +1300,7 @@ class ordered_hash: private Hash, private KeyEqual {
          * we need to shift the indexes in m_buckets.
          */
         if(index_insert_position != m_values.size() - 1) {
-            shift_indexes_in_buckets(index_insert_position + 1, short(-1));
+            shift_indexes_in_buckets(index_insert_position + 1, char(1));
         }
 
         return std::make_pair(iterator(m_values.begin() + index_insert_position), true);

diff --git a/tsl/ordered_map.h b/tsl/ordered_map.h
@@ -26,6 +26,7 @@
 
 
 #include <cstddef>
+#include <cstdint>
 #include <deque>
 #include <functional>
 #include <initializer_list>
@@ -53,6 +54,10 @@ namespace tsl {
  * 
  * The behaviour of the hash map is undefined if the destructor of Key or T throws an exception.
  * 
+ * By default the maximum size of a map is limited to 2^32 - 1 values; if needed, this can be changed through
+ * the IndexType template parameter. Using a `uint64_t` will raise this limit to 2^64 - 1 values, but each
+ * bucket will then use 16 bytes instead of 8 bytes, in addition to the space needed to store the values.
+ * 
  * Iterators invalidation:
  *  - clear, operator=, reserve, rehash: always invalidate the iterators (also invalidate end()).
  *  - insert, emplace, emplace_hint, operator[]: when a std::vector is used as ValueTypeContainer 
@@ -67,7 +72,8 @@ template<class Key,
          class Hash = std::hash<Key>,
          class KeyEqual = std::equal_to<Key>,
          class Allocator = std::allocator<std::pair<Key, T>>,
-         class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>>
+         class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>,
+         class IndexType = std::uint_least32_t>
 class ordered_map {
 private:
     template<typename U>
@@ -100,7 +106,7 @@ class ordered_map {
     };
 
     using ht = detail_ordered_hash::ordered_hash<std::pair<Key, T>, KeySelect, ValueSelect,
-                                                 Hash, KeyEqual, Allocator, ValueTypeContainer>;
+                                                 Hash, KeyEqual, Allocator, ValueTypeContainer, IndexType>;
 
 public:
     using key_type = typename ht::key_type;

diff --git a/tsl/ordered_set.h b/tsl/ordered_set.h
@@ -26,6 +26,7 @@
 
 
 #include <cstddef>
+#include <cstdint>
 #include <deque>
 #include <functional>
 #include <initializer_list>
@@ -52,6 +53,10 @@ namespace tsl {
  * 
  * The behaviour of the hash set is undefined if the destructor of Key throws an exception.
  * 
+ * By default the maximum size of a set is limited to 2^32 - 1 values; if needed, this can be changed through
+ * the IndexType template parameter. Using a `uint64_t` will raise this limit to 2^64 - 1 values, but each
+ * bucket will then use 16 bytes instead of 8 bytes, in addition to the space needed to store the values.
+ * 
  * Iterators invalidation:
  *  - clear, operator=, reserve, rehash: always invalidate the iterators (also invalidate end()).
  *  - insert, emplace, emplace_hint, operator[]: when a std::vector is used as ValueTypeContainer 
@@ -65,7 +70,8 @@ template<class Key,
          class Hash = std::hash<Key>,
          class KeyEqual = std::equal_to<Key>,
          class Allocator = std::allocator<Key>,
-         class ValueTypeContainer = std::deque<Key, Allocator>>
+         class ValueTypeContainer = std::deque<Key, Allocator>,
+         class IndexType = std::uint_least32_t>
 class ordered_set {
 private:
     template<typename U>
@@ -85,7 +91,7 @@ class ordered_set {
     };
 
     using ht = detail_ordered_hash::ordered_hash<Key, KeySelect, void,
-                                                 Hash, KeyEqual, Allocator, ValueTypeContainer>;
+                                                 Hash, KeyEqual, Allocator, ValueTypeContainer, IndexType>;
 
 public:
     using key_type = typename ht::key_type;