forked from turi-code/SFrame
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Dot plot scatter plot implementation
1. Exposed the matplotlib scatter plot API to the user via kwargs. 2. Added large-scale scatter plot support. 3. Parameterized the heatmap streaming aggregator. In progress: 1. The bar chart is still in progress. TODOs: 1. Remove the numpy dependency. 2. Support customizable view styles. 3. Support a user-defined point size for large-scale plots.
- Loading branch information
Showing
18 changed files
with
1,473 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ MarkupSafe==0.23 | |
Werkzeug==0.10.4 | ||
decorator==3.4.2 | ||
itsdangerous==0.24 | ||
matplotlib==1.4.3 | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
kaiyuzhao
Author
Owner
|
||
mock==1.0.1 | ||
nose==1.3.3 | ||
nltk==3.0.0 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#include "extrema.hpp" | ||
|
||
namespace graphlab { | ||
namespace _canvas { | ||
This comment has been minimized.
Sorry, something went wrong.
znation
|
||
namespace streaming { | ||
|
||
// Returns a copy of the largest value recorded so far.
flexible_type extrema::get_max() const {
  return this->m_max;
}
|
||
// Returns a copy of the smallest value recorded so far.
flexible_type extrema::get_min() const {
  return this->m_min;
}
|
||
bool extrema::update(const flexible_type& value) { | ||
if (!m_initialized) { | ||
m_initialized = true; | ||
m_max = value; | ||
m_min = value; | ||
return true; | ||
} | ||
|
||
if (value > m_max) { | ||
m_max = value; | ||
return true; | ||
} | ||
|
||
if (value < m_min) { | ||
m_min = value; | ||
return true; | ||
} | ||
|
||
return false; | ||
} | ||
|
||
bool extrema::update(const extrema& value) { | ||
return this->update(value.get_min()) || this->update(value.get_max()); | ||
} | ||
|
||
bool bounding_box::update(const bounding_box& value) { | ||
return this->x.update(value.x) || this->y.update(value.y); | ||
} | ||
|
||
}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#include <unity/lib/toolkit_function_macros.hpp> | ||
#include <unity/lib/toolkit_class_macros.hpp> | ||
|
||
namespace graphlab { | ||
namespace _canvas { | ||
namespace streaming { | ||
|
||
// Tracks the running minimum and maximum of the values fed to update().
// Registered as "_canvas.streaming.extrema.1d", exposing "max" and "min"
// getters.
class extrema : public toolkit_class_base { | ||
private: | ||
// False until the first value has been recorded; m_max/m_min are only
// meaningful once this is true.
bool m_initialized = false; | ||
flexible_type m_max; | ||
flexible_type m_min; | ||
|
||
public: | ||
// Merges the bounds of another extrema; returns true if a bound changed.
bool update(const extrema& value); | ||
// Records a single value; returns true if a bound changed.
bool update(const flexible_type& value); | ||
flexible_type get_max() const; | ||
flexible_type get_min() const; | ||
|
||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.extrema.1d") | ||
REGISTER_GETTER("max", extrema::get_max) | ||
REGISTER_GETTER("min", extrema::get_min) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
// Two-dimensional extent made of one extrema per axis.
// Registered as "_canvas.streaming.extrema.2d" with x and y as properties.
struct bounding_box : public toolkit_class_base { | ||
extrema x; | ||
extrema y; | ||
// Merges another bounding box into this one; returns true if it changed.
bool update(const bounding_box& value); | ||
|
||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.extrema.2d") | ||
REGISTER_PROPERTY(x) | ||
REGISTER_PROPERTY(y) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#include "groupby.hpp" | ||
|
||
namespace graphlab { | ||
namespace _canvas { | ||
namespace streaming { | ||
|
||
void summary_stats::add_element_simple(const flexible_type& value) { | ||
m_average.add_element_simple(value); | ||
m_count.add_element_simple(value); | ||
m_max.add_element_simple(value); | ||
m_min.add_element_simple(value); | ||
m_sum.add_element_simple(value); | ||
m_stdv.add_element_simple(value); | ||
m_variance.add_element_simple(value); | ||
} | ||
|
||
void summary_stats::combine(const summary_stats& other) { | ||
m_average.combine(other.m_average); | ||
m_count.combine(other.m_count); | ||
m_max.combine(other.m_max); | ||
m_min.combine(other.m_min); | ||
m_sum.combine(other.m_sum); | ||
m_stdv.combine(other.m_stdv); | ||
m_variance.combine(other.m_variance); | ||
} | ||
|
||
void summary_stats::partial_finalize() { | ||
m_average.partial_finalize(); | ||
m_count.partial_finalize(); | ||
m_max.partial_finalize(); | ||
m_min.partial_finalize(); | ||
m_sum.partial_finalize(); | ||
m_stdv.partial_finalize(); | ||
m_variance.partial_finalize(); | ||
} | ||
|
||
// Emits all statistics as a dictionary keyed by "mean", "count", "max",
// "min", "sum", "std" and "var", in that order.
flexible_type summary_stats::emit() const {
  return flex_dict({
    {"mean",  this->m_average.emit()},
    {"count", this->m_count.emit()},
    {"max",   this->m_max.emit()},
    {"min",   this->m_min.emit()},
    {"sum",   this->m_sum.emit()},
    {"std",   this->m_stdv.emit()},
    {"var",   this->m_variance.emit()}
  });
}
|
||
// Propagates the input column type to the underlying aggregators.
void summary_stats::set_input_type(flex_type_enum type) {
  this->m_average.set_input_type(type);
  // m_count is intentionally skipped: per the original author's note,
  // set_input_type is not supported for count (reason unknown).
  //m_count.set_input_type(type);
  this->m_max.set_input_type(type);
  this->m_min.set_input_type(type);
  this->m_sum.set_input_type(type);
  this->m_stdv.set_input_type(type);
  this->m_variance.set_input_type(type);
}
|
||
void groupby_quantile_result::insert_category(const flexible_type& category) { | ||
groupby_result<groupby_operators::quantile>::insert_category(category); | ||
auto& agg = m_aggregators.at(category); | ||
agg.init(std::vector<double>({0, 0.01, 0.09, 0.10, 0.25, 0.50, 0.75, 0.90, 0.91, 0.99, 1.0})); | ||
} | ||
|
||
}}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
#include <sframe/groupby_aggregate_operators.hpp> | ||
#include <unity/lib/toolkit_function_macros.hpp> | ||
#include <unity/lib/toolkit_class_macros.hpp> | ||
#include <unity/lib/gl_sframe.hpp> | ||
|
||
#include "transformation.hpp" | ||
|
||
namespace graphlab { | ||
namespace _canvas { | ||
namespace streaming { | ||
|
||
// Bundles seven groupby_operators aggregators (average, count, max, min,
// sum, stdv, variance) behind a single aggregator-style interface, so one
// pass over the data yields all of them.
class summary_stats { | ||
private: | ||
groupby_operators::average m_average; | ||
groupby_operators::count m_count; | ||
groupby_operators::max m_max; | ||
groupby_operators::min m_min; | ||
groupby_operators::sum m_sum; | ||
groupby_operators::stdv m_stdv; | ||
groupby_operators::variance m_variance; | ||
|
||
public: | ||
// Each method below forwards to the same-named method of the underlying
// aggregators; emit() returns their results as a dictionary.
void add_element_simple(const flexible_type& value); | ||
void combine(const summary_stats& other); | ||
void partial_finalize(); | ||
flexible_type emit() const; | ||
void set_input_type(flex_type_enum type); | ||
}; | ||
|
||
// Intended for boxes and whiskers or bar chart (bivariate plot, categorical | ||
// vs. numeric). For now, just groups by one column (x), doing aggregation per | ||
// category on a second column (y). Limited to the first n categories | ||
// encountered in the x column. | ||
// TODO -- pick the limited set of categories intelligently (n most popular | ||
// rather than n first) | ||
template<typename Aggregation> | ||
class groupby_result : public toolkit_class_base { | ||
protected: | ||
// keeps track of one aggregator per category (unique value on first column) | ||
std::unordered_map<flexible_type, Aggregation> m_aggregators; | ||
|
||
virtual void insert_category(const flexible_type& category) { | ||
auto inserted = m_aggregators.emplace(category, Aggregation()); | ||
DASSERT_TRUE(inserted.second); // emplace should succeed | ||
auto& agg = m_aggregators.at(category); | ||
DASSERT_TRUE(m_type != flex_type_enum::UNDEFINED); | ||
agg.set_input_type(m_type); | ||
} | ||
|
||
private: | ||
constexpr static size_t CATEGORY_LIMIT = 1000; | ||
flex_int m_omitted_categories = 0; | ||
flex_type_enum m_type = flex_type_enum::UNDEFINED; | ||
|
||
static void update_or_combine(Aggregation& aggregation, const flexible_type& other) { | ||
aggregation.add_element_simple(other); | ||
} | ||
static void update_or_combine(Aggregation& aggregation, const Aggregation& other) { | ||
// TODO this is bad -- we need a non-const Aggregation in order to call | ||
// partial_finalize, but this parameter is deeply const. | ||
const_cast<Aggregation&>(other).partial_finalize(); | ||
aggregation.combine(other); | ||
} | ||
|
||
protected: | ||
template<typename T> | ||
void update_or_combine(const flexible_type& category, const T& value) { | ||
auto find_key = m_aggregators.find(category); | ||
if (find_key == m_aggregators.end()) { | ||
// insert new category if there is room | ||
if (m_aggregators.size() < CATEGORY_LIMIT) { | ||
this->insert_category(category); | ||
groupby_result::update_or_combine(m_aggregators.at(category), value); | ||
} else { | ||
m_omitted_categories++; | ||
} | ||
} else { | ||
groupby_result::update_or_combine((*find_key).second, value); | ||
} | ||
} | ||
void update(const flexible_type& category, const flexible_type& value) { | ||
const flex_type_enum type = value.get_type(); | ||
if (type == flex_type_enum::UNDEFINED) { | ||
return; // ignore undefined values, they don't make sense in groupby | ||
} | ||
this->set_input_type(type); | ||
this->update_or_combine(category, value); | ||
} | ||
|
||
public: | ||
void combine(const groupby_result<Aggregation>& other) { | ||
this->set_input_type(other.get_input_type()); | ||
for (const auto& pair : other.m_aggregators) { | ||
this->update_or_combine(pair.first, pair.second); | ||
} | ||
} | ||
void update(const std::vector<flexible_type>& values) { | ||
// by convention, values[0] is the grouped column, | ||
// and values[1] is the aggregated column | ||
DASSERT_GE(values.size(), 2); | ||
this->update(values[0], values[1]); | ||
} | ||
std::unordered_map<flexible_type, flexible_type> get_grouped() { | ||
std::unordered_map<flexible_type, flexible_type> ret; | ||
for (const auto& pair : m_aggregators) { | ||
ret.emplace(pair.first, pair.second.emit()); | ||
} | ||
return ret; | ||
} | ||
flex_int get_omitted() { return m_omitted_categories; } | ||
void set_input_type(flex_type_enum type) { | ||
if (m_type == flex_type_enum::UNDEFINED) { | ||
m_type = type; | ||
} else { | ||
DASSERT_TRUE(m_type == type); | ||
} | ||
} | ||
flex_type_enum get_input_type() const { | ||
return m_type; | ||
} | ||
}; | ||
|
||
template<typename Result> | ||
class groupby : public transformation<gl_sframe, Result, Result, 1000000> { | ||
protected: | ||
virtual void merge_results(std::vector<Result>& transformers) override { | ||
for (auto& result : transformers) { | ||
this->m_transformer.combine(result); | ||
} | ||
} | ||
}; | ||
|
||
// groupby_result that aggregates each category with summary_stats.
// Registered as "_canvas.streaming.groupby.summary.result" with "grouped"
// and "omitted" getters.
class groupby_summary_result : public groupby_result<summary_stats> { | ||
public: | ||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.summary.result") | ||
REGISTER_GETTER("grouped", groupby_summary_result::get_grouped) | ||
REGISTER_GETTER("omitted", groupby_summary_result::get_omitted) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
// groupby transformation producing groupby_summary_result; registered as
// "_canvas.streaming.groupby.summary".
class groupby_summary : public groupby<groupby_summary_result> { | ||
public: | ||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.summary") | ||
TRANSFORMATION_REGISTRATION(groupby_summary) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
// groupby_result that aggregates each category with
// groupby_operators::quantile. Registered as
// "_canvas.streaming.groupby.quantile.result" with "grouped" and "omitted"
// getters.
class groupby_quantile_result : public groupby_result<groupby_operators::quantile> { | ||
public: | ||
// Overridden so each new quantile aggregator is initialized with the list
// of quantiles to compute (see the out-of-line definition).
virtual void insert_category(const flexible_type& category) override; | ||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.quantile.result") | ||
REGISTER_GETTER("grouped", groupby_quantile_result::get_grouped) | ||
REGISTER_GETTER("omitted", groupby_quantile_result::get_omitted) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
// groupby transformation producing groupby_quantile_result; registered as
// "_canvas.streaming.groupby.quantile".
class groupby_quantile : public groupby<groupby_quantile_result> { | ||
public: | ||
BEGIN_CLASS_MEMBER_REGISTRATION("_canvas.streaming.groupby.quantile") | ||
TRANSFORMATION_REGISTRATION(groupby_quantile) | ||
END_CLASS_MEMBER_REGISTRATION | ||
}; | ||
|
||
}}} |
Oops, something went wrong.
Current version is 1.5.1. Why 1.4.3 here?
Also, since matplotlib is a very common package with a slowly-changing API, I wonder if we should be less specific about the dependency version. If users have (for instance) 1.4.6 or 1.5.2, perhaps we should allow it and try to use it, rather than forcing upgrade or downgrade of whatever they have?