Skip to content

Commit

Permalink
Dot plot scatter plot implementation
Browse files Browse the repository at this point in the history
1. Exposed the matplotlib scatter plot API to the user via kwargs
2. Added large-scale scatter plot support
3. Parameterized the heatmap streaming aggregator

In progress:
1. bar chart is still in progress

TODOs:

1. need to remove numpy dependency
2. need to support customizable view style
3. need to support user defined point size for large scale plot
  • Loading branch information
kaiyuzhao committed Apr 28, 2016
1 parent 42f4f5d commit a86b2a4
Show file tree
Hide file tree
Showing 18 changed files with 1,473 additions and 1 deletion.
1 change: 1 addition & 0 deletions oss_local_scripts/conda_requirements.txt
Expand Up @@ -5,6 +5,7 @@ MarkupSafe==0.23
Werkzeug==0.10.4
decorator==3.4.2
itsdangerous==0.24
matplotlib==1.4.3

This comment has been minimized.

Copy link
@znation

znation Apr 28, 2016

Current version is 1.5.1. Why 1.4.3 here?

Also, since matplotlib is a very common package with a slowly-changing API, I wonder if we should be less specific about the dependency version. If users have (for instance) 1.4.6 or 1.5.2, perhaps we should allow it and try to use it, rather than forcing upgrade or downgrade of whatever they have?

This comment has been minimized.

Copy link
@kaiyuzhao

kaiyuzhao Apr 28, 2016

Author Owner

There is a known issue with matplotlib 1.5.1 related to virtualenv. For the dev dependency, I recommend we use 1.4.3.

Here's a solution if we want to use 1.5.1, but I don't like it. http://stackoverflow.com/questions/21784641/installation-issue-with-matplotlib-python

Issue tracker pypa/virtualenv#54

mock==1.0.1
nose==1.3.3
nltk==3.0.0
Expand Down
1 change: 1 addition & 0 deletions oss_local_scripts/pip_requirements.txt
Expand Up @@ -2,6 +2,7 @@ awscli==1.6.2
httpretty==0.8.8
librato-metrics==0.8.2
moto==0.4.1
matplotlib==1.4.3
prettytable==0.7.2
xmltodict==0.9.2
argparse==1.2.1
Expand Down
10 changes: 10 additions & 0 deletions oss_src/unity/extensions/CMakeLists.txt
Expand Up @@ -5,6 +5,16 @@ macro(make_extension NAME)
message(STATUS "Registering Extension: " ${NAME})
make_library(${NAME} ${ARGV} REQUIRES unity boost SHARED)
endmacro()
#----------------------------

This comment has been minimized.

Copy link
@znation

znation Apr 28, 2016

Let's rename this extension to streaming_aggregation.

# Builds the _canvas_streaming extension (via the make_extension macro above)
# from the streaming aggregator sources listed below.
make_extension(_canvas_streaming SOURCES
_canvas/streaming/extrema.cpp
_canvas/streaming/groupby.cpp
_canvas/streaming/heatmap.cpp
_canvas/streaming/histogram/continuous.cpp
_canvas/streaming/hyperloglog.cpp
_canvas/streaming/registration.cpp
)

#----------------------------
make_extension(additional_sframe_utilities SOURCES additional_sframe_utilities.cpp)
make_extension(grouped_sframe SOURCES grouped_sframe.cpp)
Expand Down
44 changes: 44 additions & 0 deletions oss_src/unity/extensions/_canvas/streaming/extrema.cpp
@@ -0,0 +1,44 @@
#include "extrema.hpp"

namespace graphlab {
namespace _canvas {

This comment has been minimized.

Copy link
@znation

znation Apr 28, 2016

Same rename. How about graphlab::streaming_aggregation instead of graphlab::_canvas::streaming?

This comment has been minimized.

Copy link
@kaiyuzhao

kaiyuzhao Apr 28, 2016

Author Owner

I changed it to graphlab::plot::streaming

namespace streaming {

// Largest value recorded so far. Before the first update() this is just the
// default-constructed flexible_type held by m_max.
flexible_type extrema::get_max() const {
  return this->m_max;
}

// Smallest value recorded so far. Before the first update() this is just the
// default-constructed flexible_type held by m_min.
flexible_type extrema::get_min() const {
  return this->m_min;
}

// Folds a single observation into the running min/max.
//
// Returns true when either bound changed (which always includes the very
// first observation), false when `value` lies within the current range.
bool extrema::update(const flexible_type& value) {
  if (m_initialized) {
    bool changed = false;
    if (value > m_max) {
      // New upper bound.
      m_max = value;
      changed = true;
    } else if (value < m_min) {
      // New lower bound.
      m_min = value;
      changed = true;
    }
    return changed;
  }

  // First observation: it is both the minimum and the maximum.
  m_initialized = true;
  m_max = value;
  m_min = value;
  return true;
}

bool extrema::update(const extrema& value) {
return this->update(value.get_min()) || this->update(value.get_max());
}

// Merges another bounding box into this one, axis by axis.
//
// Returns true when the x range or the y range reported a change.
//
// Both axes are always merged. The two calls must not be joined directly with
// `||`: short-circuit evaluation would skip the y axis whenever x changed.
bool bounding_box::update(const bounding_box& value) {
  const bool x_changed = this->x.update(value.x);
  const bool y_changed = this->y.update(value.y);
  return x_changed || y_changed;
}

}}}
37 changes: 37 additions & 0 deletions oss_src/unity/extensions/_canvas/streaming/extrema.hpp
@@ -0,0 +1,37 @@
#include <unity/lib/toolkit_function_macros.hpp>
#include <unity/lib/toolkit_class_macros.hpp>

namespace graphlab {
namespace _canvas {
namespace streaming {

// Streaming tracker for the smallest and largest value observed so far.
// Registered with the toolkit class system as "_canvas.streaming.extrema.1d",
// exposing the getters "max" and "min".
class extrema : public toolkit_class_base {
private:
// False until the first update(const flexible_type&) call; while false,
// m_max and m_min hold only their default-constructed values.
bool m_initialized = false;
flexible_type m_max;
flexible_type m_min;

public:
// Folds another tracker's min and max into this one; returns whether this
// tracker reported a change.
bool update(const extrema& value);
// Folds one observation into the range; returns true when a bound changed.
bool update(const flexible_type& value);
// Current upper bound.
flexible_type get_max() const;
// Current lower bound.
flexible_type get_min() const;

BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.extrema.1d")
REGISTER_GETTER("max", extrema::get_max)
REGISTER_GETTER("min", extrema::get_min)
END_CLASS_MEMBER_REGISTRATION
};

// Two-dimensional extent made of one extrema tracker per axis.
// Registered with the toolkit class system as "_canvas.streaming.extrema.2d",
// exposing x and y as properties.
struct bounding_box : public toolkit_class_base {
// Range of values seen on the x axis.
extrema x;
// Range of values seen on the y axis.
extrema y;
// Merges another box into this one by updating x with value.x and y with
// value.y; returns whether a change was reported.
bool update(const bounding_box& value);

BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.extrema.2d")
REGISTER_PROPERTY(x)
REGISTER_PROPERTY(y)
END_CLASS_MEMBER_REGISTRATION
};

}}}
66 changes: 66 additions & 0 deletions oss_src/unity/extensions/_canvas/streaming/groupby.cpp
@@ -0,0 +1,66 @@
#include "groupby.hpp"

namespace graphlab {
namespace _canvas {
namespace streaming {

void summary_stats::add_element_simple(const flexible_type& value) {
m_average.add_element_simple(value);
m_count.add_element_simple(value);
m_max.add_element_simple(value);
m_min.add_element_simple(value);
m_sum.add_element_simple(value);
m_stdv.add_element_simple(value);
m_variance.add_element_simple(value);
}

void summary_stats::combine(const summary_stats& other) {
m_average.combine(other.m_average);
m_count.combine(other.m_count);
m_max.combine(other.m_max);
m_min.combine(other.m_min);
m_sum.combine(other.m_sum);
m_stdv.combine(other.m_stdv);
m_variance.combine(other.m_variance);
}

void summary_stats::partial_finalize() {
m_average.partial_finalize();
m_count.partial_finalize();
m_max.partial_finalize();
m_min.partial_finalize();
m_sum.partial_finalize();
m_stdv.partial_finalize();
m_variance.partial_finalize();
}

// Packs the current result of every aggregator into one dictionary keyed by
// statistic name: "mean", "count", "max", "min", "sum", "std" and "var".
flexible_type summary_stats::emit() const {
  return flex_dict({
      {"mean",  this->m_average.emit()},
      {"count", this->m_count.emit()},
      {"max",   this->m_max.emit()},
      {"min",   this->m_min.emit()},
      {"sum",   this->m_sum.emit()},
      {"std",   this->m_stdv.emit()},
      {"var",   this->m_variance.emit()}
  });
}

// Tells each underlying aggregator which flexible_type it will receive.
//
// m_count is deliberately left out: the original author found that
// set_input_type is not supported for the count aggregator (reason unknown).
void summary_stats::set_input_type(flex_type_enum type) {
  this->m_average.set_input_type(type);

  this->m_max.set_input_type(type);
  this->m_min.set_input_type(type);

  this->m_sum.set_input_type(type);
  this->m_stdv.set_input_type(type);
  this->m_variance.set_input_type(type);
}

void groupby_quantile_result::insert_category(const flexible_type& category) {
groupby_result<groupby_operators::quantile>::insert_category(category);
auto& agg = m_aggregators.at(category);
agg.init(std::vector<double>({0, 0.01, 0.09, 0.10, 0.25, 0.50, 0.75, 0.90, 0.91, 0.99, 1.0}));
}

}}}
164 changes: 164 additions & 0 deletions oss_src/unity/extensions/_canvas/streaming/groupby.hpp
@@ -0,0 +1,164 @@
#include <sframe/groupby_aggregate_operators.hpp>
#include <unity/lib/toolkit_function_macros.hpp>
#include <unity/lib/toolkit_class_macros.hpp>
#include <unity/lib/gl_sframe.hpp>

#include "transformation.hpp"

namespace graphlab {
namespace _canvas {
namespace streaming {

// Bundle of groupby aggregators (average, count, max, min, sum, stdv,
// variance) that are fed and merged together, so one pass over the data
// yields all summary statistics at once. Provides the member functions that
// groupby_result<> calls on its Aggregation type.
class summary_stats {
private:
groupby_operators::average m_average;
groupby_operators::count m_count;
groupby_operators::max m_max;
groupby_operators::min m_min;
groupby_operators::sum m_sum;
groupby_operators::stdv m_stdv;
groupby_operators::variance m_variance;

public:
// Feeds one value to every aggregator.
void add_element_simple(const flexible_type& value);
// Merges the state of `other` into this object, aggregator by aggregator.
void combine(const summary_stats& other);
// Forwards partial_finalize() to every aggregator.
void partial_finalize();
// Returns a dictionary with keys "mean", "count", "max", "min", "sum",
// "std" and "var".
flexible_type emit() const;
// Sets the input type on every aggregator except the count aggregator.
void set_input_type(flex_type_enum type);
};

// Intended for boxes and whiskers or bar chart (bivariate plot, categorical
// vs. numeric). For now, just groups by one column (x), doing aggregation per
// category on a second column (y). Limited to the first n categories
// encountered in the x column.
// TODO -- pick the limited set of categories intelligently (n most popular
// rather than n first)
template<typename Aggregation>
class groupby_result : public toolkit_class_base {
protected:
// keeps track of one aggregator per category (unique value on first column)
std::unordered_map<flexible_type, Aggregation> m_aggregators;

virtual void insert_category(const flexible_type& category) {
auto inserted = m_aggregators.emplace(category, Aggregation());
DASSERT_TRUE(inserted.second); // emplace should succeed
auto& agg = m_aggregators.at(category);
DASSERT_TRUE(m_type != flex_type_enum::UNDEFINED);
agg.set_input_type(m_type);
}

private:
constexpr static size_t CATEGORY_LIMIT = 1000;
flex_int m_omitted_categories = 0;
flex_type_enum m_type = flex_type_enum::UNDEFINED;

static void update_or_combine(Aggregation& aggregation, const flexible_type& other) {
aggregation.add_element_simple(other);
}
static void update_or_combine(Aggregation& aggregation, const Aggregation& other) {
// TODO this is bad -- we need a non-const Aggregation in order to call
// partial_finalize, but this parameter is deeply const.
const_cast<Aggregation&>(other).partial_finalize();
aggregation.combine(other);
}

protected:
template<typename T>
void update_or_combine(const flexible_type& category, const T& value) {
auto find_key = m_aggregators.find(category);
if (find_key == m_aggregators.end()) {
// insert new category if there is room
if (m_aggregators.size() < CATEGORY_LIMIT) {
this->insert_category(category);
groupby_result::update_or_combine(m_aggregators.at(category), value);
} else {
m_omitted_categories++;
}
} else {
groupby_result::update_or_combine((*find_key).second, value);
}
}
void update(const flexible_type& category, const flexible_type& value) {
const flex_type_enum type = value.get_type();
if (type == flex_type_enum::UNDEFINED) {
return; // ignore undefined values, they don't make sense in groupby
}
this->set_input_type(type);
this->update_or_combine(category, value);
}

public:
void combine(const groupby_result<Aggregation>& other) {
this->set_input_type(other.get_input_type());
for (const auto& pair : other.m_aggregators) {
this->update_or_combine(pair.first, pair.second);
}
}
void update(const std::vector<flexible_type>& values) {
// by convention, values[0] is the grouped column,
// and values[1] is the aggregated column
DASSERT_GE(values.size(), 2);
this->update(values[0], values[1]);
}
std::unordered_map<flexible_type, flexible_type> get_grouped() {
std::unordered_map<flexible_type, flexible_type> ret;
for (const auto& pair : m_aggregators) {
ret.emplace(pair.first, pair.second.emit());
}
return ret;
}
flex_int get_omitted() { return m_omitted_categories; }
void set_input_type(flex_type_enum type) {
if (m_type == flex_type_enum::UNDEFINED) {
m_type = type;
} else {
DASSERT_TRUE(m_type == type);
}
}
flex_type_enum get_input_type() const {
return m_type;
}
};

// Streaming groupby over a gl_sframe whose running result is a Result object.
// NOTE(review): 1000000 is forwarded to transformation<>; presumably a batch
// size -- confirm in transformation.hpp.
template<typename Result>
class groupby : public transformation<gl_sframe, Result, Result, 1000000> {
 protected:
  // Folds every partial result into the transformer held by the base class.
  virtual void merge_results(std::vector<Result>& transformers) override {
    for (auto& partial : transformers) {
      this->m_transformer.combine(partial);
    }
  }
};

// Per-category summary statistics: groupby_result specialised on
// summary_stats. Registered as "_canvas.streaming.groupby.summary.result"
// with the getters "grouped" and "omitted".
class groupby_summary_result : public groupby_result<summary_stats> {
public:
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.summary.result")
REGISTER_GETTER("grouped", groupby_summary_result::get_grouped)
REGISTER_GETTER("omitted", groupby_summary_result::get_omitted)
END_CLASS_MEMBER_REGISTRATION
};

// Groupby transformation whose result type is groupby_summary_result.
// Registered as "_canvas.streaming.groupby.summary"; the members it exposes
// come from TRANSFORMATION_REGISTRATION, which is defined outside this file.
class groupby_summary : public groupby<groupby_summary_result> {
public:
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.summary")
TRANSFORMATION_REGISTRATION(groupby_summary)
END_CLASS_MEMBER_REGISTRATION
};

// Per-category quantiles: groupby_result specialised on
// groupby_operators::quantile. Registered as
// "_canvas.streaming.groupby.quantile.result" with the getters "grouped" and
// "omitted".
class groupby_quantile_result : public groupby_result<groupby_operators::quantile> {
public:
// Overridden so each new category's quantile aggregator gets initialised
// (see the definition in groupby.cpp).
virtual void insert_category(const flexible_type& category) override;
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.quantile.result")
REGISTER_GETTER("grouped", groupby_quantile_result::get_grouped)
REGISTER_GETTER("omitted", groupby_quantile_result::get_omitted)
END_CLASS_MEMBER_REGISTRATION
};

// Groupby transformation whose result type is groupby_quantile_result.
// Registered as "_canvas.streaming.groupby.quantile"; the members it exposes
// come from TRANSFORMATION_REGISTRATION, which is defined outside this file.
class groupby_quantile : public groupby<groupby_quantile_result> {
public:
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.quantile")
TRANSFORMATION_REGISTRATION(groupby_quantile)
END_CLASS_MEMBER_REGISTRATION
};

}}}

0 comments on commit a86b2a4

Please sign in to comment.