Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Estimator merging does not work #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Fetch gems over HTTPS rather than plain HTTP so downloads are not exposed
# to tampering in transit.
source "https://rubygems.org"

# Pull the gem's own dependencies from the .gemspec in this directory.
gemspec

# Tooling needed only when working on the gem itself.
group :development do
  gem 'echoe'
  gem 'rspec'
end
68 changes: 31 additions & 37 deletions ext/hyperloglog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ extern "C" void hyperbuilder_set_register(BoolArray<uword64> *registers, uword32
// hyperbuilder_printBits(value);
// cout << "VALUE(S) ";
// hyperbuilder_printBits(value << shift);

registers->setWord( bucketPos, (registers->getWord(bucketPos) & ~(maskBits << shift)) | (value << shift) );
// cout << "AFTER ";
// hyperbuilder_printBits(registers->getWord(bucketPos));
Expand All @@ -61,13 +61,13 @@ extern "C" uword32 hyperbuilder_get_register(BoolArray<uword64> *registers, uwor
return (registers->getWord(bucketPos) & (maskBits << shift)) >> shift;
}

// Hashing and Calculations
// Hashing and Calculations
// Counts the consecutive zero bits of x, scanning downward from bit position
// 8 * sizeof(x) to bit 0 and stopping at the first set bit.
// NOTE(review): the scan starts one position above the top bit of x, so
// (assuming uword32 is 32 bits wide) the first probe is always zero and the
// result is "leading zeros + 1". That starting point is kept unchanged here
// because callers may depend on it -- confirm before altering.
extern "C" uword64 hyperbuilder_clz(uword32 x) {
    uword32 zeros = 0;
    // Check the bound BEFORE shifting: the previous order evaluated
    // (1ULL << msb) with msb == -1 when x == 0, which is undefined behaviour.
    for (int msb = static_cast<int>(8 * sizeof(x));
         msb >= 0 && !(x & (1ULL << msb));
         --msb, ++zeros) {}

    return zeros;
}

Expand All @@ -85,7 +85,7 @@ extern "C" void hyperbuilder_free(HyperBuilder *builder) {

extern "C" VALUE hyperbuilder_new(VALUE klass, VALUE bits) {
HyperBuilder *builder = ALLOC(HyperBuilder);

builder->bits = FIX2INT(bits);
builder->registerCount = static_cast<uword32>(pow(2, FIX2INT(bits)));
builder->registers = new BoolArray<uword64>(static_cast<size_t>( (floor(builder->registerCount / 12) + 1) * 64) );
Expand All @@ -95,13 +95,13 @@ extern "C" VALUE hyperbuilder_new(VALUE klass, VALUE bits) {
extern "C" VALUE hyperbuilder_offer(VALUE self, VALUE item) {
HyperBuilder *builder;
Data_Get_Struct(self, HyperBuilder, builder);

uword32 x = hyperbuilder_hash(item);
uword32 j = x >> (32 - builder->bits);
uword64 r = hyperbuilder_clz( (x << builder->bits) | (1 << (builder->bits - 1)) + 1 );

uword32 registerValue = hyperbuilder_get_register(builder->registers, j);

if(registerValue < r) {
hyperbuilder_set_register(builder->registers, j, r);
return Qtrue;
Expand All @@ -120,12 +120,12 @@ extern "C" VALUE hyperbuilder_reset(VALUE self) {
extern "C" VALUE hyperbuilder_serialize(VALUE self) {
HyperBuilder *builder;
Data_Get_Struct(self, HyperBuilder, builder);

EWAHBoolArray<uword64> ewahBitset;
for(uword32 i = 0; i < floor((builder->registerCount) / 12) + 1; i++) {
ewahBitset.add(builder->registers->getWord(i));
}

stringstream ss;
ewahBitset.write(ss);
return rb_str_new(ss.str().c_str(), ss.str().size());
Expand All @@ -134,30 +134,24 @@ extern "C" VALUE hyperbuilder_serialize(VALUE self) {
// Ruby method: renders the builder's registers as text. The output is "[",
// then each register value followed by ", ", then "]".
extern "C" VALUE hyperbuilder_to_s(VALUE self) {
    HyperBuilder *hb;
    Data_Get_Struct(self, HyperBuilder, hb);

    stringstream out;
    out << "[";
    uword32 idx = 0;
    while (idx < hb->registerCount) {
        out << hyperbuilder_get_register(hb->registers, idx) << ", ";
        ++idx;
    }
    out << "]";

    // Take one copy of the buffer and hand its bytes to Ruby.
    const std::string text = out.str();
    return rb_str_new(text.c_str(), text.size());
}

// Ruby method: returns sizeInBits() of the builder's register array,
// converted with INT2FIX.
extern "C" VALUE hyperbuilder_size_in_bits(VALUE self) {
    HyperBuilder *hb;
    Data_Get_Struct(self, HyperBuilder, hb);

    BoolArray<uword64> *regs = hb->registers;
    return INT2FIX(regs->sizeInBits());
}


// extern "C" VALUE hyperbuilder_merge(VALUE args) {
// // return a new hyperbuilder from merging a bunch of other ones
// return Qnil;
// }

/*
* HyperEstimator
*/
Expand All @@ -168,15 +162,15 @@ extern "C" VALUE hyperbuilder_size_in_bits(VALUE self) {

// Ruby singleton method HyperEstimator.new(bits, serialized).
//   bits       - Fixnum; the estimator gets 2^bits registers.
//   serialized - String holding a compressed register bitmap, read back with
//                EWAHBoolArray::read (presumably the output of the builder's
//                serialize method -- confirm against callers).
// Returns the wrapped HyperEstimator; raises TypeError if `serialized` cannot
// be converted to a String.
extern "C" VALUE hyperestimator_new(VALUE klass, VALUE bits, VALUE serialized) {
    // Validate before allocating so a raised TypeError cannot leak the struct.
    StringValue(serialized);

    HyperEstimator *estimator = ALLOC(HyperEstimator);

    estimator->bits = FIX2INT(bits);
    estimator->registers = new EWAHBoolArray<uword64>();
    estimator->registerCount = static_cast<uword32>(pow(2, FIX2INT(bits)));

    // Use the accessor macros instead of RSTRING(...)->ptr / ->len: the struct
    // fields are not directly accessible on Ruby 1.9 and later.
    stringstream ss;
    ss.write(RSTRING_PTR(serialized), RSTRING_LEN(serialized));
    estimator->registers->read(ss, true);

    return Data_Wrap_Struct(klass, 0, hyperestimator_free, estimator);
}

Expand All @@ -190,20 +184,20 @@ extern "C" VALUE hyperbuilder_estimator(VALUE self) {
extern "C" VALUE hyperestimator_merge(VALUE estimators) {
uword32 bits = 0;
BoolArray<uword64> registers[RARRAY(estimators)->len];

// Collect all the expanded registers
for(int i = 0; i < RARRAY(estimators)->len; i++) {
HyperEstimator *estimator;
Data_Get_Struct(*(RARRAY(estimators)->ptr), HyperEstimator, estimator);
Data_Get_Struct(rb_ary_entry(estimators, i), HyperEstimator, estimator);

if(bits == 0) {
bits = estimator->bits;
} else if(bits != estimator->bits) {
rb_raise(rb_eRuntimeError, "Cannot union estimators that aren't of the same size");
}
registers[i] = estimator->registers->toBoolArray();
}

uword32 registerCount = static_cast<uword32>(pow(2, bits));
BoolArray<uword64> *mergedRegisters = new BoolArray<uword64>((registerCount + 1) * 64);
for(int e = 0; e < RARRAY(estimators)->len; e++) {
Expand All @@ -214,31 +208,31 @@ extern "C" VALUE hyperestimator_merge(VALUE estimators) {
}
}
}

HyperBuilder *builder = ALLOC(HyperBuilder);
VALUE klass = rb_path2class("HyperBuilder");

builder->bits = bits;
builder->registers = mergedRegisters;
builder->registerCount = registerCount;

return Data_Wrap_Struct(klass, 0, hyperbuilder_free, builder);
}

extern "C" VALUE hyperestimator_estimate(VALUE klass, VALUE estimators) {
VALUE merged = hyperestimator_merge(estimators);

HyperBuilder *builder;
Data_Get_Struct(merged, HyperBuilder, builder);

double rSum = 0;
for(uword32 j = 0; j < builder->registerCount; j++) {
rSum += pow(2, (-1 * (int)hyperbuilder_get_register(builder->registers, j)));
}

double alphaM = 0.7213 / (1 + 1.079 / builder->registerCount);
double estimate = alphaM * pow(builder->registerCount, 2) * (1 / rSum);

if(estimate < (5.0/2.0) * builder->registerCount) {
uword32 zeros = 0;
for(uword32 z = 0; z < builder->registerCount; z++) {
Expand All @@ -255,17 +249,17 @@ extern "C" VALUE hyperestimator_estimate(VALUE klass, VALUE estimators) {
// Ruby method: renders the estimator's registers as text. The output is "[",
// then each register value followed by ", ", then "]".
extern "C" VALUE hyperestimator_to_s(VALUE self) {
    HyperEstimator *est;
    Data_Get_Struct(self, HyperEstimator, est);

    // Expand the compressed bitmap into a plain BoolArray so individual
    // registers can be read with hyperbuilder_get_register.
    BoolArray<uword64> expanded = est->registers->toBoolArray();

    stringstream out;
    out << "[";
    uword32 idx = 0;
    while (idx < est->registerCount) {
        out << hyperbuilder_get_register(&expanded, idx) << ", ";
        ++idx;
    }
    out << "]";

    // Take one copy of the buffer and hand its bytes to Ruby.
    const std::string text = out.str();
    return rb_str_new(text.c_str(), text.size());
}

Expand All @@ -286,4 +280,4 @@ extern "C" void Init_hyperloglog() {
rb_define_singleton_method(rbHyperEstimator, "new", (ruby_method*) &hyperestimator_new, 2);
rb_define_singleton_method(rbHyperEstimator, "estimate", (ruby_method*) &hyperestimator_estimate, -2);
rb_define_method(rbHyperEstimator, "to_s", (ruby_method*) &hyperestimator_to_s, 0);
}
}
5 changes: 0 additions & 5 deletions hyperloglog.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,5 @@ Gem::Specification.new do |s|

if s.respond_to? :specification_version then
s.specification_version = 3

if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
else
end
else
end
end
45 changes: 34 additions & 11 deletions spec/hyperloglog_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,23 @@ def load_data(name)
@builder = HyperBuilder.new(11)
0.upto(9) {|index| @builder.offer(index.to_s)}
end

# The builder must be able to render itself as text.
it 'should put to_s' do
  @builder.to_s.should_not be_nil
end

# Serializing a populated builder must yield a value.
it 'should serialize' do
  @builder.serialize.should_not be_nil
end

# A populated builder must be able to hand out an estimator.
it 'should generate an estimator' do
  @builder.estimator.should_not be_nil
end

# The estimator derived from a builder must print the same registers as the
# builder itself.
it 'should create the proper estimator' do
  builder_text = @builder.to_s
  estimator_text = @builder.estimator.to_s
  builder_text.should == estimator_text
end

it 'should reset' do
@builder.reset
@builder.size_in_bits.should == 0
Expand All @@ -36,18 +36,41 @@ def load_data(name)
# For every bit width from 4 to 20, a single builder's estimate must land
# within three times the computed error bound `se` of the true distinct count.
it 'should generate good estimates' do
  items = load_data('small_integers')
  total_items = items.uniq.length

  4.upto(20) do |m|
    se = total_items * (1.04 / Math.sqrt(2**m))

    builder = HyperBuilder.new(m)
    # Offer each item once; the loop was previously written twice in a row,
    # which fed every item to the builder a second time for no benefit.
    items.each { |item| builder.offer(item.to_s) }

    estimate = HyperEstimator.estimate(builder.estimator)

    # puts "For m = #{m} we should have #{estimate} in [#{total_items - (3 * se)}, #{total_items + (3 * se)}]"
    estimate.should be >= total_items - (3 * se)
    estimate.should be <= total_items + (3 * se)
  end
end
end

# Feeds two overlapping slices of the data set (the first 75% and the second
# half) into separate builders and checks that the combined estimate stays
# within three times the computed error bound `se` of the true distinct count.
it 'should merge estimators' do
  items = load_data('small_integers')
  total_items = items.uniq.length

  first_slice = items[0, items.count * 0.75]
  second_slice = items[items.count * 0.5, items.count * 0.5]

  (4..20).each do |m|
    se = total_items * (1.04 / Math.sqrt(2**m))
    lower_bound = total_items - (3 * se)
    upper_bound = total_items + (3 * se)

    # One builder per slice, all with the same bit width.
    builder_a, builder_b = [first_slice, second_slice].map do |slice|
      builder = HyperBuilder.new(m)
      slice.each { |item| builder.offer(item.to_s) }
      builder
    end

    estimate = HyperEstimator.estimate(builder_a.estimator, builder_b.estimator)

    # puts "For m = #{m} we should have #{estimate} in [#{total_items - (3 * se)}, #{total_items + (3 * se)}]"
    estimate.should be >= lower_bound
    estimate.should be <= upper_bound
  end
end
end