Skip to content

Commit

Permalink
Add support for large maps/sets (more than 2^32 - 1 elements) through…
Browse files Browse the repository at this point in the history
… an extra class template parameter (#18)
  • Loading branch information
Tessil committed Jul 31, 2018
1 parent cb3564e commit 9642242
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 26 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -8,7 +8,7 @@ The values are stored contiguously in an underlying structure, no holes in-betwe

To resolve collisions on hashes, the library uses robin hood probing with backward shift deletion.

The library provides a behaviour similar to a `std::deque/std::vector` with unique values but with an average time complexity of O(1) for lookups and an amortised time complexity of O(1) for insertions. This comes at the price of a little higher memory footprint (8 bytes per bucket).
The library provides a behaviour similar to a `std::deque/std::vector` with unique values but with an average time complexity of O(1) for lookups and an amortised time complexity of O(1) for insertions. This comes at the price of a little higher memory footprint (8 bytes per bucket by default).

Two classes are provided: `tsl::ordered_map` and `tsl::ordered_set`.

Expand Down Expand Up @@ -38,7 +38,7 @@ for(auto it = map.begin(); it != map.end(); ++it) {
it.value() = 2; // Ok
}
```
- The map can only hold up to 2<sup>32</sup> - 1 values, that is 4 294 967 295 values.
- By default the map can only hold up to 2<sup>32</sup> - 1 values, that is 4 294 967 295 values. This limit can be raised through the `IndexType` class template parameter; see the [API](https://tessil.github.io/ordered-map/classtsl_1_1ordered__map.html#details) for details.
- No support for some bucket related methods (like bucket_size, bucket, ...).
Expand Down
24 changes: 24 additions & 0 deletions tests/ordered_map_tests.cpp
Expand Up @@ -759,6 +759,30 @@ BOOST_AUTO_TEST_CASE(test_modify_value) {
}
}

/**
* max_size
*/
BOOST_AUTO_TEST_CASE(test_max_size) {
    // For each index type, max_size() must be at most the maximum of that index type
    // and strictly greater than the maximum of the next smaller unsigned type.
    // TODO not compatible on systems with sizeof(std::size_t) < sizeof(std::uint32_t), will not build.
    using value_t = std::pair<int, int>;
    using hasher_t = std::hash<int>;
    using key_equal_t = std::equal_to<int>;
    using alloc_t = std::allocator<value_t>;
    using storage_t = std::vector<value_t>;

    // 16-bit index type.
    using map_u16_t = tsl::ordered_map<int, int, hasher_t, key_equal_t, alloc_t, storage_t, std::uint16_t>;
    map_u16_t map;
    BOOST_CHECK(map.max_size() <= std::numeric_limits<std::uint16_t>::max());
    BOOST_CHECK(map.max_size() > std::numeric_limits<std::uint8_t>::max());

    // 32-bit index type.
    using map_u32_t = tsl::ordered_map<int, int, hasher_t, key_equal_t, alloc_t, storage_t, std::uint32_t>;
    map_u32_t map2;
    BOOST_CHECK(map2.max_size() <= std::numeric_limits<std::uint32_t>::max());
    BOOST_CHECK(map2.max_size() > std::numeric_limits<std::uint16_t>::max());

    // Index type selected from the width of std::size_t: 64 bits when std::size_t is
    // as wide as std::uint64_t, 32 bits otherwise (with the matching smaller type below it).
    constexpr bool size_t_is_64_bits = sizeof(std::size_t) == sizeof(std::uint64_t);
    using max_size_type = std::conditional<size_t_is_64_bits, std::uint64_t, std::uint32_t>::type;
    using min_size_type = std::conditional<size_t_is_64_bits, std::uint32_t, std::uint16_t>::type;
    using map_max_t = tsl::ordered_map<int, int, hasher_t, key_equal_t, alloc_t, storage_t, max_size_type>;
    map_max_t map3;
    BOOST_CHECK(map3.max_size() <= std::numeric_limits<max_size_type>::max());
    BOOST_CHECK(map3.max_size() > std::numeric_limits<min_size_type>::max());
}

/**
* constructor
Expand Down
51 changes: 31 additions & 20 deletions tsl/ordered_hash.h
Expand Up @@ -114,17 +114,24 @@ struct is_vector<T, typename std::enable_if<


/**
* Each bucket entry stores a 32-bits index which is the index in m_values corresponding to the bucket's value
* and a 32 bits hash (truncated if the original was 64-bits) corresponding to the hash of the value.
* Each bucket entry stores an index which is the index in m_values corresponding to the bucket's value
* and a hash (which may be truncated to 32 bits depending on IndexType) corresponding to the hash of the value.
*
* The 32-bit index limits the size of the map to 2^32 - 1 elements (-1 due to a reserved value used to mark a
* bucket as empty).
* The size of IndexType limits the size of the hash table to std::numeric_limits<IndexType>::max() - 1 elements (-1 due to
* a reserved value used to mark a bucket as empty).
*/
template<class IndexType>
class bucket_entry {
public:
using index_type = std::uint_least32_t;
using truncated_hash_type = std::uint_least32_t;
static_assert(std::is_unsigned<IndexType>::value, "IndexType must be an unsigned value.");
static_assert(std::numeric_limits<IndexType>::max() <= std::numeric_limits<std::size_t>::max(),
"std::numeric_limits<IndexType>::max() must be <= std::numeric_limits<std::size_t>::max().");

public:
using index_type = IndexType;
using truncated_hash_type = typename std::conditional<std::numeric_limits<IndexType>::max() <=
std::numeric_limits<std::uint_least32_t>::max(),
std::uint_least32_t,
std::size_t>::type;

bucket_entry() noexcept: m_index(EMPTY_MARKER_INDEX), m_hash(0) {
}
Expand Down Expand Up @@ -174,7 +181,7 @@ class bucket_entry {
}

static std::size_t max_size() noexcept {
return std::numeric_limits<index_type>::max() - NB_RESERVED_INDEXES;
return static_cast<std::size_t>(std::numeric_limits<index_type>::max()) - NB_RESERVED_INDEXES;
}

private:
Expand Down Expand Up @@ -218,7 +225,8 @@ template<class ValueType,
class Hash,
class KeyEqual,
class Allocator,
class ValueTypeContainer>
class ValueTypeContainer,
class IndexType>
class ordered_hash: private Hash, private KeyEqual {
private:
template<typename U>
Expand Down Expand Up @@ -356,6 +364,8 @@ class ordered_hash: private Hash, private KeyEqual {


private:
using bucket_entry = tsl::detail_ordered_hash::bucket_entry<IndexType>;

using buckets_container_allocator = typename
std::allocator_traits<allocator_type>::template rebind_alloc<bucket_entry>;

Expand Down Expand Up @@ -1155,10 +1165,10 @@ class ordered_hash: private Hash, private KeyEqual {

/*
* m_values.erase shifted all the values on the right of the erased value,
* shift the indexes by 1 in the buckets array for these values.
* shift the indexes by -1 in the buckets array for these values.
*/
if(it_bucket->index() != m_values.size()) {
shift_indexes_in_buckets(it_bucket->index(), short(1));
shift_indexes_in_buckets(it_bucket->index(), char(-1));
}

// Mark the bucket as empty and do a backward shift of the values on the right
Expand All @@ -1168,21 +1178,22 @@ class ordered_hash: private Hash, private KeyEqual {

/**
* Go through each value from [from_ivalue, m_values.size()) in m_values and for each
* bucket corresponding to the value, shift the indexes to the left by delta.
* bucket corresponding to the value, shift the index by delta.
*/
void shift_indexes_in_buckets(index_type from_ivalue, short delta) noexcept {
static_assert(std::is_unsigned<index_type>::value && sizeof(index_type) >= sizeof(short),
"index_type should be unsigned and sizeof(index_type) >= sizeof(short)");
void shift_indexes_in_buckets(index_type from_ivalue, char delta) noexcept {
tsl_assert(delta == 1 || delta == -1);

for(std::size_t ivalue = from_ivalue; ivalue < m_values.size(); ivalue++) {
std::size_t ibucket = bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
// All the values in m_values have been shifted by delta. Find the bucket corresponding
// to the value m_values[ivalue]
const index_type old_index = static_cast<index_type>(ivalue - delta);

// Modulo arithmetic, we should be alright for index_type(ivalue + delta). TODO further checks
while(m_buckets[ibucket].index() != index_type(ivalue + delta)) {
std::size_t ibucket = bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
while(m_buckets[ibucket].index() != old_index) {
ibucket = next_bucket(ibucket);
}

m_buckets[ibucket].set_index(index_type(m_buckets[ibucket].index() - delta));
m_buckets[ibucket].set_index(index_type(ivalue));
}
}

Expand Down Expand Up @@ -1289,7 +1300,7 @@ class ordered_hash: private Hash, private KeyEqual {
* we need to shift the indexes in m_buckets.
*/
if(index_insert_position != m_values.size() - 1) {
shift_indexes_in_buckets(index_insert_position + 1, short(-1));
shift_indexes_in_buckets(index_insert_position + 1, char(1));
}

return std::make_pair(iterator(m_values.begin() + index_insert_position), true);
Expand Down
10 changes: 8 additions & 2 deletions tsl/ordered_map.h
Expand Up @@ -26,6 +26,7 @@


#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <initializer_list>
Expand Down Expand Up @@ -53,6 +54,10 @@ namespace tsl {
*
* The behaviour of the hash map is undefined if the destructor of Key or T throws an exception.
*
* By default the maximum size of a map is limited to 2^32 - 1 values. If needed, this can be changed through
* the IndexType template parameter. Using a `uint64_t` will raise this limit to 2^64 - 1 values, but each
* bucket will use 16 bytes instead of 8 bytes, in addition to the space needed to store the values.
*
* Iterators invalidation:
* - clear, operator=, reserve, rehash: always invalidate the iterators (also invalidate end()).
* - insert, emplace, emplace_hint, operator[]: when a std::vector is used as ValueTypeContainer
Expand All @@ -67,7 +72,8 @@ template<class Key,
class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<std::pair<Key, T>>,
class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>>
class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>,
class IndexType = std::uint_least32_t>
class ordered_map {
private:
template<typename U>
Expand Down Expand Up @@ -100,7 +106,7 @@ class ordered_map {
};

using ht = detail_ordered_hash::ordered_hash<std::pair<Key, T>, KeySelect, ValueSelect,
Hash, KeyEqual, Allocator, ValueTypeContainer>;
Hash, KeyEqual, Allocator, ValueTypeContainer, IndexType>;

public:
using key_type = typename ht::key_type;
Expand Down
10 changes: 8 additions & 2 deletions tsl/ordered_set.h
Expand Up @@ -26,6 +26,7 @@


#include <cstddef>
#include <cstdint>
#include <deque>
#include <functional>
#include <initializer_list>
Expand All @@ -52,6 +53,10 @@ namespace tsl {
*
* The behaviour of the hash set is undefined if the destructor of Key throws an exception.
*
* By default the maximum size of a set is limited to 2^32 - 1 values. If needed, this can be changed through
* the IndexType template parameter. Using a `uint64_t` will raise this limit to 2^64 - 1 values, but each
* bucket will use 16 bytes instead of 8 bytes, in addition to the space needed to store the values.
*
* Iterators invalidation:
* - clear, operator=, reserve, rehash: always invalidate the iterators (also invalidate end()).
* - insert, emplace, emplace_hint, operator[]: when a std::vector is used as ValueTypeContainer
Expand All @@ -65,7 +70,8 @@ template<class Key,
class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator<Key>,
class ValueTypeContainer = std::deque<Key, Allocator>>
class ValueTypeContainer = std::deque<Key, Allocator>,
class IndexType = std::uint_least32_t>
class ordered_set {
private:
template<typename U>
Expand All @@ -85,7 +91,7 @@ class ordered_set {
};

using ht = detail_ordered_hash::ordered_hash<Key, KeySelect, void,
Hash, KeyEqual, Allocator, ValueTypeContainer>;
Hash, KeyEqual, Allocator, ValueTypeContainer, IndexType>;

public:
using key_type = typename ht::key_type;
Expand Down

0 comments on commit 9642242

Please sign in to comment.