/
bloom.go
119 lines (105 loc) · 2.75 KB
/
bloom.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package goestimators
import (
"log"
"math"
)
// Bloom size enum
type bloomSize int
const (
// Size256 indicates a 256-bit Bloom filter, with two hash functions (k=2)
Size256 bloomSize = iota
// Size65536 indicates a 65536-bit Bloom filter, with four hash functions (k=4)
Size65536
)
// Bloom is the bloom filter structure
type Bloom struct {
size bloomSize
nInts int
base []uint64
}
// NewBloom creates a new Bloom filter structure of a given size.
func NewBloom(size bloomSize) *Bloom {
var nInts int
switch size {
case Size256:
nInts = 4
case Size65536:
nInts = 1024
default:
log.Panic("Invalid Bloom size:", size)
}
b := Bloom{
nInts: nInts,
size: size,
base: make([]uint64, nInts),
}
return &b
}
// FalsePositiveProbability returns the upper bounds probability of this Bloom filter
// having a false positive, given the number of elements which would be stored in it.
func (b Bloom) FalsePositiveProbability(n uint64) float64 {
var k float64
var m float64
switch b.size {
case Size256:
k = 2
m = 256
case Size65536:
k = 4
m = 65536
default:
log.Panic("Invalid Bloom size:", b.size)
}
return math.Pow(1-math.Pow(math.E, -((k*(float64(n)+0.5))/(m-1))), k)
}
// Observe includes the given byte slice in the Bloom map.
func (b *Bloom) Observe(buf []byte) {
ints := b.buf2ints(buf)
for i, ii := range ints {
b.base[i] |= ii
}
}
// Add is deprecated: use Observe() to match the HyperLogLog API
func (b *Bloom) Add(buf []byte) {
b.Observe(buf)
}
// Check checks if the given byte slice is in the Bloom map.
// Returns false if there's absolutely no chance that the item is in the buffer.
func (b *Bloom) Check(buf []byte) bool {
ints := b.buf2ints(buf)
for i, ii := range ints {
if b.base[i]&ii != ii {
return false
}
}
return true
}
// This is where the inner loop of the Bloom filter happens.
// See https://en.wikipedia.org/wiki/Bloom_filter for a decent description.
func (b *Bloom) buf2ints(buf []byte) []uint64 {
i := make([]uint64, b.nInts)
h := hash64(buf)
var mask uint64
var shift uint64
// A performance optimization: for a 256-bit Bloom filter, we use 2 bytes of the 64-bit
// hash function as 2 independant hash functions (k=2).
// For a 65536-bit Bloom filter, we split the 64-bit hash function into 4 16-bit hashes (k=4).
if b.size == Size256 {
mask = 0xff
shift = 8
h = h & 0xffff // Set the limit to two 8-bit hash functions (first 2 bytes of the 64-bit hash).
} else if b.size == Size65536 {
mask = 0xffff
shift = 16
} else {
log.Panic("Invalid size:", b.size)
}
for h > 0 {
b := h & mask // Extract one of the k hash functions
//fmt.Println(buf, h, b, shift)
// set the b'th bit in i
i[b/64] |= 1 << (b % 64) // Go is smart enough to do this with shifts.
h = h >> shift
}
return i
}