Skip to content

Commit

Permalink
WIP: new data preprocessing functionality and regression suite
Browse files Browse the repository at this point in the history
  • Loading branch information
Ravenwater committed May 2, 2023
1 parent 7ef2203 commit 26b3208
Show file tree
Hide file tree
Showing 15 changed files with 373 additions and 103 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ option(BUILD_NUMBER_CONVERSIONS "Set to ON to build conversion test sui

# Basic Linear Algebra tests
option(BUILD_LINEAR_ALGEBRA_BLAS "Set to ON to build the BLAS tests" OFF)
option(BUILD_LINEAR_ALGEBRA_VMATH "Set to ON to build the BLAS vector math lib" OFF)
option(BUILD_LINEAR_ALGEBRA_VMATH "Set to ON to build the vector math lib" OFF)
option(BUILD_LINEAR_ALGEBRA_DATA "Set to ON to build the data prep math lib" OFF)

# benchmarking
option(BUILD_BENCHMARK_ERROR "Set to ON to build error benchmarks" OFF)
Expand Down Expand Up @@ -580,6 +581,7 @@ if(BUILD_ALL)
# build the BLAS test/verification suites
set(BUILD_LINEAR_ALGEBRA_BLAS ON)
set(BUILD_LINEAR_ALGEBRA_VMATH ON)
set(BUILD_LINEAR_ALGEBRA_DATA ON)

# build the C API library
#set(BUILD_C_API_PURE_LIB ON)
Expand Down Expand Up @@ -949,6 +951,9 @@ endif(BUILD_LINEAR_ALGEBRA_BLAS)
if(BUILD_LINEAR_ALGEBRA_VMATH)
add_subdirectory("linalg/vmath")
endif(BUILD_LINEAR_ALGEBRA_VMATH)
if(BUILD_LINEAR_ALGEBRA_DATA)
add_subdirectory("linalg/data")
endif(BUILD_LINEAR_ALGEBRA_DATA)

####
# Configuration summary
Expand Down
45 changes: 0 additions & 45 deletions benchmark/error/blas/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,51 +134,6 @@ void SampleError(unsigned N = 10000, double mean = 0.0, double stddev = 2.0) {
DotProductError< integer<8> >(x, minx, maxx, y, miny, maxy);
}

namespace sw {
namespace universal {
	// data normalization

	// minmaxscaler rescales the elements of a vector from their original
	// range [min, max] to a new target range [lb, ub].
	// Returns an empty vector when the target range is inconsistent (lb >= ub)
	// or when the input vector is constant (max == min), since the scale
	// factor would otherwise be a division by zero.
	template<typename Scalar>
	blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
		blas::vector<Scalar> t;
		if (lb >= ub) {
			std::cerr << "target range is inconsistent\n";
			return t;
		}
		std::pair< Scalar, Scalar> mm = blas::range(v);
		Scalar min = mm.first;
		Scalar max = mm.second;
		if (min == max) { // degenerate input: all elements identical
			std::cerr << "input range is degenerate\n";
			return t;
		}
		auto scale = (ub - lb) / (max - min);
		auto offset = lb - min * scale;
		for (auto e : v) {
			t.push_back( e * scale + offset );
		}
		return t;
	}

	// compress scales a vector of doubles so that its values fit within the
	// dynamic range of the Target number type: after scaling, all magnitudes
	// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
	// (as in a dot product term) cannot overflow maxpos.
	template<typename Target>
	blas::vector<Target> compress(const blas::vector<double>& v) {
		auto maxpos = double(std::numeric_limits<Target>::max());

		auto vminmax = arange(v);
		auto minValue = vminmax.first;
		auto maxValue = vminmax.second;

		sw::universal::blas::vector<Target> t(v.size());
		auto sqrtMaxpos = sqrt(maxpos);
		// scale by the largest magnitude in the vector, not just the maximum:
		// a negative value can dominate the range, and dividing by a negative
		// maxValue would flip the sign of every element
		double absMin = abs(minValue);
		double absMax = abs(maxValue);
		double maxMag = (absMin > absMax) ? absMin : absMax;
		double maxScale = 1.0;
		if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
		t = maxScale * v;

		return t;
	}

}
}

/*
* When we want to take arbitrary vectors and want to faithfully calculate a
* dot product using lower precision types, we need to 'squeeze' the values
Expand Down
16 changes: 16 additions & 0 deletions benchmark/error/sampling/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,22 @@ void SampleError(sw::universal::blas::vector<double>& reals) {
std::cout << "Maximum sampling error : " << maxError << '\n';
}

// Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
#define MANUAL_TESTING 0
// REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
// It is the responsibility of the regression test to organize the tests in a quartile progression.
//#undef REGRESSION_LEVEL_OVERRIDE
#ifndef REGRESSION_LEVEL_OVERRIDE
#undef REGRESSION_LEVEL_1
#undef REGRESSION_LEVEL_2
#undef REGRESSION_LEVEL_3
#undef REGRESSION_LEVEL_4
#define REGRESSION_LEVEL_1 1
#define REGRESSION_LEVEL_2 1
#define REGRESSION_LEVEL_3 1
#define REGRESSION_LEVEL_4 1
#endif

int main()
try {
using namespace sw::universal;
Expand Down
68 changes: 23 additions & 45 deletions benchmark/error/scaling/scaling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,51 +11,7 @@
#include <universal/number/posit/posit.hpp>
#include <universal/number/lns/lns.hpp>
#include <universal/blas/blas.hpp>

namespace sw {
namespace universal {
	// data normalization

	// minmaxscaler rescales the elements of a vector from their original
	// range [min, max] to a new target range [lb, ub].
	// Returns an empty vector when the target range is inconsistent (lb >= ub)
	// or when the input vector is constant (max == min), since the scale
	// factor would otherwise be a division by zero.
	template<typename Scalar>
	blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
		blas::vector<Scalar> t;
		if (lb >= ub) {
			std::cerr << "target range is inconsistent\n";
			return t;
		}
		std::pair< Scalar, Scalar> mm = blas::range(v);
		Scalar min = mm.first;
		Scalar max = mm.second;
		if (min == max) { // degenerate input: all elements identical
			std::cerr << "input range is degenerate\n";
			return t;
		}
		auto scale = (ub - lb) / (max - min);
		auto offset = lb - min * scale;
		for (auto e : v) {
			t.push_back( e * scale + offset );
		}
		return t;
	}

	// compress scales a vector of doubles so that its values fit within the
	// dynamic range of the Target number type: after scaling, all magnitudes
	// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
	// (as in a dot product term) cannot overflow maxpos.
	template<typename Target>
	blas::vector<Target> compress(const blas::vector<double>& v) {
		auto maxpos = double(std::numeric_limits<Target>::max());

		auto vminmax = arange(v);
		auto minValue = vminmax.first;
		auto maxValue = vminmax.second;

		sw::universal::blas::vector<Target> t(v.size());
		auto sqrtMaxpos = sqrt(maxpos);
		// scale by the largest magnitude in the vector, not just the maximum:
		// a negative value can dominate the range, and dividing by a negative
		// maxValue would flip the sign of every element
		double absMin = abs(minValue);
		double absMax = abs(maxValue);
		double maxMag = (absMin > absMax) ? absMin : absMax;
		double maxScale = 1.0;
		if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
		t = maxScale * v;

		return t;
	}

}
}
#include <universal/verification/test_suite.hpp>

/*
* When we want to take arbitrary vectors and want to faithfully calculate a
Expand All @@ -70,11 +26,33 @@ namespace sw {
*
*/

// Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
#define MANUAL_TESTING 0
// REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
// It is the responsibility of the regression test to organize the tests in a quartile progression.
//#undef REGRESSION_LEVEL_OVERRIDE
#ifndef REGRESSION_LEVEL_OVERRIDE
#undef REGRESSION_LEVEL_1
#undef REGRESSION_LEVEL_2
#undef REGRESSION_LEVEL_3
#undef REGRESSION_LEVEL_4
#define REGRESSION_LEVEL_1 1
#define REGRESSION_LEVEL_2 1
#define REGRESSION_LEVEL_3 1
#define REGRESSION_LEVEL_4 1
#endif

int main()
try {
using namespace sw::universal;
using namespace sw::universal::blas;

std::string test_suite = "benchmark error in scaling operations";
std::string test_tag = "data distribution scaling";
bool reportTestCases = true;
int nrOfFailedTestCases = 0;

ReportTestSuiteHeader(test_suite, reportTestCases);
unsigned N{ 10000 };
double mean{ 0.0 }, stddev{ 1.0 };

Expand Down
85 changes: 85 additions & 0 deletions include/universal/blas/scaling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,90 @@ std::pair<typename Vector::value_type, typename Vector::value_type> arange(const
return std::pair(running_min, running_max);
}

// data normalization

/*
X_std = (X - X.min) / (X.max - X.min)
X_scaled = X_std * (ub - lb) + lb
where [lb, ub] = feature_range.
The transformation is calculated as
X_scaled = scale * X + lb - X.min * scale
where scale = (ub - lb) / (X.max - X.min)
*/

// minmaxscaler rescales the elements of a vector from their original
// range [min, max] to a new target range [lb, ub].
// Returns an empty vector when the target range is inconsistent (lb >= ub)
// or when the input vector is constant (max == min), since the scale
// factor would otherwise be a division by zero.
template<typename Scalar>
blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
	blas::vector<Scalar> t;
	if (lb >= ub) {
		std::cerr << "target range is inconsistent\n";
		return t;
	}
	std::pair< Scalar, Scalar> mm = blas::range(v);
	Scalar min = mm.first;
	Scalar max = mm.second;
	if (min == max) { // degenerate input: all elements identical
		std::cerr << "input range is degenerate\n";
		return t;
	}
	auto scale = (ub - lb) / (max - min);
	auto offset = lb - min * scale;
	for (auto e : v) t.push_back(e * scale + offset);

	return t;
}

// compress scales a vector of doubles so that its values fit within the
// dynamic range of the Target number type: after scaling, all magnitudes
// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
// (as in a dot product term) cannot overflow maxpos.
template<typename Target>
blas::vector<Target> compress(const blas::vector<double>& v) {
	auto maxpos = double(std::numeric_limits<Target>::max());

	auto vminmax = arange(v);
	auto minValue = vminmax.first;
	auto maxValue = vminmax.second;

	sw::universal::blas::vector<Target> t(v.size());
	auto sqrtMaxpos = sqrt(maxpos);
	// scale by the largest magnitude in the vector, not just the maximum:
	// a negative value can dominate the range, and dividing by a negative
	// maxValue would flip the sign of every element
	double absMin = abs(minValue);
	double absMax = abs(maxValue);
	double maxMag = (absMin > absMax) ? absMin : absMax;
	double maxScale = 1.0;
	if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
	t = maxScale * v;

	return t;
}


/*
Standardize features by removing the mean and scaling to unit variance.
The standard score of a sample `x` is calculated as:
z = (x - u) / s
where `u` is the mean of the training samples or zero if `with_mean=False`,
and `s` is the standard deviation of the training samples or one if
`with_std=False`.
Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Mean and
standard deviation are then stored to be used on later data using
:meth:`transform`.
Standardization of a dataset is a common requirement for many
machine learning estimators: they might behave badly if the
individual features do not more or less look like standard normally
distributed data (e.g. Gaussian with 0 mean and unit variance).
For instance many elements used in the objective function of
a learning algorithm (such as the RBF kernel of Support Vector
Machines or the L1 and L2 regularizers of linear models) assume that
all features are centered around 0 and have variance in the same
order. If a feature has a variance that is orders of magnitude larger
than others, it might dominate the objective function and make the
estimator unable to learn from other features correctly as expected.
This scaler can also be applied to sparse CSR or CSC matrices by passing
`with_mean=False` to avoid breaking the sparsity structure of the data.
*/

}}} // namespace sw::universal::blas
16 changes: 15 additions & 1 deletion include/universal/blas/vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

#elif defined(_MSC_VER)
/* Microsoft Visual Studio. --------------------------------- */
// already defineds _NODISCARD
// already defines _NODISCARD

#elif defined(__PGI)
/* Portland Group PGCC/PGCPP. ------------------------------- */
Expand Down Expand Up @@ -389,4 +389,18 @@ posit<nbits, es> operator*(const vector< posit<nbits, es> >& a, const vector< po
return p;
}

// element-wise equality: two vectors compare equal when they have the same
// number of elements and every pair of corresponding elements compares equal
template<typename Scalar>
bool operator==(const vector<Scalar>& a, const vector<Scalar>& b) {
	if (a.size() != b.size()) return false;
	unsigned idx = 0;
	while (idx < a.size()) {
		if (a[idx] != b[idx]) return false;
		++idx;
	}
	return true;
}

// element-wise inequality, defined as the logical negation of operator==
template<typename Scalar>
bool operator!=(const vector<Scalar>& a, const vector<Scalar>& b) {
	const bool equal = (a == b);
	return !equal;
}

}}} // namespace sw::universal::blas
2 changes: 1 addition & 1 deletion linalg/blas/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
file (GLOB SOURCES "./*.cpp")

compile_all("true" "blas" "Basic Linear Algebra/blas" "${SOURCES}")
compile_all("true" "blas" "Linear Algebra/blas" "${SOURCES}")
2 changes: 1 addition & 1 deletion linalg/blas/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Basic Linear Algebra Subroutine tests

This directory contains verification tests for the sw::unum::blas library
This directory contains verification tests for the sw::universal::blas library
7 changes: 5 additions & 2 deletions linalg/blas/vector_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,11 @@ try {
using Scalar = posit<nbits, es>;
using Vector = blas::vector<Scalar>;

// error full and error free dot products
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<nbits, es> >::max()), test_tag, "error free posit dot");
std::cout << "error full and error free dot products\n";
// posit<8,0> is failing on 32k sums of epsilon
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<8, 2> >::max()), test_tag, "error free posit<8,2> dot");
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<16, 2> >::max()), test_tag, "error free posit<16,2> dot");
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<32, 2> >::max()), test_tag, "error free posit<32,2> dot");
// TBD: no fdp yet for cfloat or lns
// nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits< bfloat_t >::max()), test_tag, "error free bfloat16 dot");
// nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits< lns<16, 8> >::max()), test_tag, "error free lns dot");
Expand Down
3 changes: 3 additions & 0 deletions linalg/data/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
file (GLOB SOURCES "./*.cpp")

compile_all("true" "data" "Linear Algebra/data" "${SOURCES}")
32 changes: 32 additions & 0 deletions linalg/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Data preparation functions

This directory contains verification tests for the data preprocessing vector functions.

standardization
normalization
minmax_scale
maxabs_scale
robust scaler
binarizer
kernel_centerer
quantile_transformer
power_transformer

layer norm
softmax
max pooling


data preprocessing of images vs language


mixed-precision
layer to layer what do our activations look like?

what are the trouble cases?
- the layer transformation expands the dynamic range of the input activations


log-normal distributions (the gradients are log-normal) normal distributed in log space

what does a log-normal distribution look like in linear space?

0 comments on commit 26b3208

Please sign in to comment.