Skip to content

Commit

Permalink
WIP: new data preprocessing functionality and regression suite
Browse files Browse the repository at this point in the history
  • Loading branch information
Ravenwater committed May 2, 2023
1 parent 7ef2203 commit 26b3208
Show file tree
Hide file tree
Showing 15 changed files with 373 additions and 103 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ option(BUILD_NUMBER_CONVERSIONS "Set to ON to build conversion test sui

# Basic Linear Algebra tests
option(BUILD_LINEAR_ALGEBRA_BLAS "Set to ON to build the BLAS tests" OFF)
option(BUILD_LINEAR_ALGEBRA_VMATH "Set to ON to build the BLAS vector math lib" OFF)
option(BUILD_LINEAR_ALGEBRA_VMATH "Set to ON to build the vector math lib" OFF)
option(BUILD_LINEAR_ALGEBRA_DATA "Set to ON to build the data prep math lib" OFF)

# benchmarking
option(BUILD_BENCHMARK_ERROR "Set to ON to build error benchmarks" OFF)
Expand Down Expand Up @@ -580,6 +581,7 @@ if(BUILD_ALL)
# build the BLAS test/verification suites
set(BUILD_LINEAR_ALGEBRA_BLAS ON)
set(BUILD_LINEAR_ALGEBRA_VMATH ON)
set(BUILD_LINEAR_ALGEBRA_DATA ON)

# build the C API library
#set(BUILD_C_API_PURE_LIB ON)
Expand Down Expand Up @@ -949,6 +951,9 @@ endif(BUILD_LINEAR_ALGEBRA_BLAS)
if(BUILD_LINEAR_ALGEBRA_VMATH)
add_subdirectory("linalg/vmath")
endif(BUILD_LINEAR_ALGEBRA_VMATH)
if(BUILD_LINEAR_ALGEBRA_DATA)
add_subdirectory("linalg/data")
endif(BUILD_LINEAR_ALGEBRA_DATA)

####
# Configuration summary
Expand Down
45 changes: 0 additions & 45 deletions benchmark/error/blas/dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,51 +134,6 @@ void SampleError(unsigned N = 10000, double mean = 0.0, double stddev = 2.0) {
DotProductError< integer<8> >(x, minx, maxx, y, miny, maxy);
}

namespace sw {
namespace universal {
	// data normalization

	// minmaxscaler rescales the elements of a vector from their original
	// range [min, max] to a new target range [lb, ub].
	// Returns an empty vector when the target range is inconsistent (lb >= ub)
	// or when the input vector is constant (max == min), since the scale
	// factor would otherwise be a division by zero.
	template<typename Scalar>
	blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
		blas::vector<Scalar> t;
		if (lb >= ub) {
			std::cerr << "target range is inconsistent\n";
			return t;
		}
		std::pair< Scalar, Scalar> mm = blas::range(v);
		Scalar min = mm.first;
		Scalar max = mm.second;
		if (min == max) { // degenerate input: all elements identical
			std::cerr << "input range is degenerate\n";
			return t;
		}
		auto scale = (ub - lb) / (max - min);
		auto offset = lb - min * scale;
		for (auto e : v) {
			t.push_back( e * scale + offset );
		}
		return t;
	}

	// compress scales a vector of doubles so that its values fit within the
	// dynamic range of the Target number type: after scaling, all magnitudes
	// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
	// (as in a dot product term) cannot overflow maxpos.
	template<typename Target>
	blas::vector<Target> compress(const blas::vector<double>& v) {
		auto maxpos = double(std::numeric_limits<Target>::max());

		auto vminmax = arange(v);
		auto minValue = vminmax.first;
		auto maxValue = vminmax.second;

		sw::universal::blas::vector<Target> t(v.size());
		auto sqrtMaxpos = sqrt(maxpos);
		// scale by the largest magnitude in the vector, not just the maximum:
		// a negative value can dominate the range, and dividing by a negative
		// maxValue would flip the sign of every element
		double absMin = abs(minValue);
		double absMax = abs(maxValue);
		double maxMag = (absMin > absMax) ? absMin : absMax;
		double maxScale = 1.0;
		if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
		t = maxScale * v;

		return t;
	}

}
}

/*
* When we want to take arbitrary vectors and want to faithfully calculate a
* dot product using lower precision types, we need to 'squeeze' the values
Expand Down
16 changes: 16 additions & 0 deletions benchmark/error/sampling/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,22 @@ void SampleError(sw::universal::blas::vector<double>& reals) {
std::cout << "Maximum sampling error : " << maxError << '\n';
}

// Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
#define MANUAL_TESTING 0
// REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
// It is the responsibility of the regression test to organize the tests in a quartile progression.
//#undef REGRESSION_LEVEL_OVERRIDE
#ifndef REGRESSION_LEVEL_OVERRIDE
#undef REGRESSION_LEVEL_1
#undef REGRESSION_LEVEL_2
#undef REGRESSION_LEVEL_3
#undef REGRESSION_LEVEL_4
#define REGRESSION_LEVEL_1 1
#define REGRESSION_LEVEL_2 1
#define REGRESSION_LEVEL_3 1
#define REGRESSION_LEVEL_4 1
#endif

int main()
try {
using namespace sw::universal;
Expand Down
68 changes: 23 additions & 45 deletions benchmark/error/scaling/scaling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,51 +11,7 @@
#include <universal/number/posit/posit.hpp>
#include <universal/number/lns/lns.hpp>
#include <universal/blas/blas.hpp>

namespace sw {
namespace universal {
	// data normalization

	// minmaxscaler rescales the elements of a vector from their original
	// range [min, max] to a new target range [lb, ub].
	// Returns an empty vector when the target range is inconsistent (lb >= ub)
	// or when the input vector is constant (max == min), since the scale
	// factor would otherwise be a division by zero.
	template<typename Scalar>
	blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
		blas::vector<Scalar> t;
		if (lb >= ub) {
			std::cerr << "target range is inconsistent\n";
			return t;
		}
		std::pair< Scalar, Scalar> mm = blas::range(v);
		Scalar min = mm.first;
		Scalar max = mm.second;
		if (min == max) { // degenerate input: all elements identical
			std::cerr << "input range is degenerate\n";
			return t;
		}
		auto scale = (ub - lb) / (max - min);
		auto offset = lb - min * scale;
		for (auto e : v) {
			t.push_back( e * scale + offset );
		}
		return t;
	}

	// compress scales a vector of doubles so that its values fit within the
	// dynamic range of the Target number type: after scaling, all magnitudes
	// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
	// (as in a dot product term) cannot overflow maxpos.
	template<typename Target>
	blas::vector<Target> compress(const blas::vector<double>& v) {
		auto maxpos = double(std::numeric_limits<Target>::max());

		auto vminmax = arange(v);
		auto minValue = vminmax.first;
		auto maxValue = vminmax.second;

		sw::universal::blas::vector<Target> t(v.size());
		auto sqrtMaxpos = sqrt(maxpos);
		// scale by the largest magnitude in the vector, not just the maximum:
		// a negative value can dominate the range, and dividing by a negative
		// maxValue would flip the sign of every element
		double absMin = abs(minValue);
		double absMax = abs(maxValue);
		double maxMag = (absMin > absMax) ? absMin : absMax;
		double maxScale = 1.0;
		if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
		t = maxScale * v;

		return t;
	}

}
}
#include <universal/verification/test_suite.hpp>

/*
* When we want to take arbitrary vectors and want to faithfully calculate a
Expand All @@ -70,11 +26,33 @@ namespace sw {
*
*/

// Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
#define MANUAL_TESTING 0
// REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
// It is the responsibility of the regression test to organize the tests in a quartile progression.
//#undef REGRESSION_LEVEL_OVERRIDE
#ifndef REGRESSION_LEVEL_OVERRIDE
#undef REGRESSION_LEVEL_1
#undef REGRESSION_LEVEL_2
#undef REGRESSION_LEVEL_3
#undef REGRESSION_LEVEL_4
#define REGRESSION_LEVEL_1 1
#define REGRESSION_LEVEL_2 1
#define REGRESSION_LEVEL_3 1
#define REGRESSION_LEVEL_4 1
#endif

int main()
try {
using namespace sw::universal;
using namespace sw::universal::blas;

std::string test_suite = "benchmark error in scaling operations";
std::string test_tag = "data distribution scaling";
bool reportTestCases = true;
int nrOfFailedTestCases = 0;

ReportTestSuiteHeader(test_suite, reportTestCases);
unsigned N{ 10000 };
double mean{ 0.0 }, stddev{ 1.0 };

Expand Down
85 changes: 85 additions & 0 deletions include/universal/blas/scaling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,90 @@ std::pair<typename Vector::value_type, typename Vector::value_type> arange(const
return std::pair(running_min, running_max);
}

// data normalization

/*
X_std = (X - X.min) / (X.max - X.min)
X_scaled = X_std * (ub - lb) + lb
where [lb, ub] = feature_range.
The transformation is calculated as
X_scaled = scale * X + lb - X.min * scale
where scale = (ub - lb) / (X.max - X.min)
*/

// minmaxscaler rescales the elements of a vector from their original
// range [min, max] to a new target range [lb, ub].
// Returns an empty vector when the target range is inconsistent (lb >= ub)
// or when the input vector is constant (max == min), since the scale
// factor would otherwise be a division by zero.
template<typename Scalar>
blas::vector<Scalar> minmaxscaler(const blas::vector<Scalar>& v, Scalar lb = 0, Scalar ub = 1) {
	blas::vector<Scalar> t;
	if (lb >= ub) {
		std::cerr << "target range is inconsistent\n";
		return t;
	}
	std::pair< Scalar, Scalar> mm = blas::range(v);
	Scalar min = mm.first;
	Scalar max = mm.second;
	if (min == max) { // degenerate input: all elements identical
		std::cerr << "input range is degenerate\n";
		return t;
	}
	auto scale = (ub - lb) / (max - min);
	auto offset = lb - min * scale;
	for (auto e : v) t.push_back(e * scale + offset);

	return t;
}

// compress scales a vector of doubles so that its values fit within the
// dynamic range of the Target number type: after scaling, all magnitudes
// are bounded by sqrt(maxpos(Target)), so products of pairs of elements
// (as in a dot product term) cannot overflow maxpos.
template<typename Target>
blas::vector<Target> compress(const blas::vector<double>& v) {
	auto maxpos = double(std::numeric_limits<Target>::max());

	auto vminmax = arange(v);
	auto minValue = vminmax.first;
	auto maxValue = vminmax.second;

	sw::universal::blas::vector<Target> t(v.size());
	auto sqrtMaxpos = sqrt(maxpos);
	// scale by the largest magnitude in the vector, not just the maximum:
	// a negative value can dominate the range, and dividing by a negative
	// maxValue would flip the sign of every element
	double absMin = abs(minValue);
	double absMax = abs(maxValue);
	double maxMag = (absMin > absMax) ? absMin : absMax;
	double maxScale = 1.0;
	if (maxMag > sqrtMaxpos) maxScale = sqrtMaxpos / maxMag;
	t = maxScale * v;

	return t;
}


/*
Standardize features by removing the mean and scaling to unit variance.
The standard score of a sample `x` is calculated as:
z = (x - u) / s
where `u` is the mean of the training samples or zero if `with_mean=False`,
and `s` is the standard deviation of the training samples or one if
`with_std=False`.
Centering and scaling happen independently on each feature by computing
the relevant statistics on the samples in the training set. Mean and
standard deviation are then stored to be used on later data using
:meth:`transform`.
Standardization of a dataset is a common requirement for many
machine learning estimators: they might behave badly if the
individual features do not more or less look like standard normally
distributed data (e.g. Gaussian with 0 mean and unit variance).
For instance many elements used in the objective function of
a learning algorithm (such as the RBF kernel of Support Vector
Machines or the L1 and L2 regularizers of linear models) assume that
all features are centered around 0 and have variance in the same
order. If a feature has a variance that is orders of magnitude larger
than others, it might dominate the objective function and make the
estimator unable to learn from other features correctly as expected.
This scaler can also be applied to sparse CSR or CSC matrices by passing
`with_mean=False` to avoid breaking the sparsity structure of the data.
*/

}}} // namespace sw::universal::blas
16 changes: 15 additions & 1 deletion include/universal/blas/vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

#elif defined(_MSC_VER)
/* Microsoft Visual Studio. --------------------------------- */
// already defineds _NODISCARD
// already defines _NODISCARD

#elif defined(__PGI)
/* Portland Group PGCC/PGCPP. ------------------------------- */
Expand Down Expand Up @@ -389,4 +389,18 @@ posit<nbits, es> operator*(const vector< posit<nbits, es> >& a, const vector< po
return p;
}

// element-wise equality: two vectors compare equal when they have the same
// number of elements and every pair of corresponding elements compares equal
template<typename Scalar>
bool operator==(const vector<Scalar>& a, const vector<Scalar>& b) {
	if (a.size() != b.size()) return false;
	unsigned idx = 0;
	while (idx < a.size()) {
		if (a[idx] != b[idx]) return false;
		++idx;
	}
	return true;
}

// element-wise inequality, defined as the logical negation of operator==
template<typename Scalar>
bool operator!=(const vector<Scalar>& a, const vector<Scalar>& b) {
	const bool equal = (a == b);
	return !equal;
}

}}} // namespace sw::universal::blas
2 changes: 1 addition & 1 deletion linalg/blas/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
file (GLOB SOURCES "./*.cpp")

compile_all("true" "blas" "Basic Linear Algebra/blas" "${SOURCES}")
compile_all("true" "blas" "Linear Algebra/blas" "${SOURCES}")
2 changes: 1 addition & 1 deletion linalg/blas/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Basic Linear Algebra Subroutine tests

This directory contains verification tests for the sw::unum::blas library
This directory contains verification tests for the sw::universal::blas library
7 changes: 5 additions & 2 deletions linalg/blas/vector_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,11 @@ try {
using Scalar = posit<nbits, es>;
using Vector = blas::vector<Scalar>;

// error full and error free dot products
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<nbits, es> >::max()), test_tag, "error free posit dot");
std::cout << "error full and error free dot products\n";
// posit<8,0> is failing on 32k sums of epsilon
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<8, 2> >::max()), test_tag, "error free posit<8,2> dot");
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<16, 2> >::max()), test_tag, "error free posit<16,2> dot");
nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits<posit<32, 2> >::max()), test_tag, "error free posit<32,2> dot");
// TBD: no fdp yet for cfloat or lns
// nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits< bfloat_t >::max()), test_tag, "error free bfloat16 dot");
// nrOfFailedTestCases += ReportTestResult(VerifyErrorFreeFusedDotProduct(std::numeric_limits< lns<16, 8> >::max()), test_tag, "error free lns dot");
Expand Down
3 changes: 3 additions & 0 deletions linalg/data/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
file (GLOB SOURCES "./*.cpp")

compile_all("true" "data" "Linear Algebra/data" "${SOURCES}")
32 changes: 32 additions & 0 deletions linalg/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Data preparation functions

This directory contains verification tests for the data preprocessing vector functions.

standardization
normalization
minmax_scale
maxabs_scale
robust scaler
binarizer
kernel_centerer
quantile_transformer
power_transformer

layer norm
softmax
max pooling


data preprocessing of images vs language


mixed-precision
layer to layer what do our activations look like?

what are the trouble cases?
- the layer transformation expands the dynamic range of the input activations


log-normal distributions (the gradients are log-normal) normal distributed in log space

what does a log-normal distribution look like in linear space?

0 comments on commit 26b3208

Please sign in to comment.