Skip to content

Speed comparison for vec3 vs vec4 arithmetic

Hleb Valoshka edited this page Feb 4, 2023 · 5 revisions

The code in comparison is based on galaxy.cpp (point to quad conversion in render function).

Four modes were tested:

- 4 → 4: Both Blob::position and GalaxyVertex::position are vec4, all calculations are done using vec4.
- 3 → 3: Both Blob::position and GalaxyVertex::position are vec3, all calculations are done using vec3.
- 4 → 3: Blob::position is vec4, GalaxyVertex::position is vec3; calculations are done using vec4 and the results are then stored to vec3.
- 3 → 4 → 3: Both are stored as vec3 but converted to vec4 for the calculation.

The test program was compiled in 3 modes for ARM64 (Oracle cloud VM):

  1. -O2
  2. -O3
  3. -O3 -march=native

And in 5 modes for Intel (i5 mobile 8th gen):

  1. -O2
  2. -O2 -msse
  3. -O3
  4. -O3 -march=native -msse4
  5. -O3 -march=native -mavx

Results (ms):

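| CPU   | Optimization             | 4 → 4    | 4 → 3    | 3 → 3    | 3 → 4 → 3 |
|-------|--------------------------|----------|----------|----------|-----------|
| ARM   | -O2                      | 15050.80 | 10115.40 | 9513.18  | 10185.00  |
| ARM   | -O3                      | 7760.55  | 8150.00  | 8403.77  | 8846.66   |
| ARM   | -O3 -march=native        | 7768.69  | 8082.59  | 8366.02  | 8818.15   |
| Intel | -O2                      | 10093.30 | 10476.60 | 10367.00 | 10387.40  |
| Intel | -O2 -msse                | 9992.38  | 10496.80 | 10416.40 | 10426.10  |
| Intel | -O3                      | 9711.28  | 11479.80 | 11213.50 | 11108.90  |
| Intel | -O3 -march=native -msse4 | 9522.09  | 10475.90 | 11250.10 | 11687.20  |
| Intel | -O3 -march=native -mavx  | 9515.35  | 10484.40 | 11100.40 | 11623.70  |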

Conclusion:

  1. For code targeting new hardware with heavy optimizations and vector extensions enabled prefer vec4 and then convert to vec3 if required. Generic ARM64 prefers vec3.
  2. For code targeting old hardware or without vector extensions enabled and with medium optimizations vec3 is sufficient (performance drop is less than 3%) and behaves better than vec4 math and conversion to vec3.
  3. Conversion from vec3 to vec4 in simple calculations doesn’t give any speedup.

Test code:


#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <Eigen/Core>

#ifndef USEVEC3OUT
#error “Define USEVEC3OUT 0/1”
#endif

#ifndef USEVEC3IN
#error “Define USEVEC3IN 0/1”
#endif

using std::chrono::duration;
using std::chrono::duration_cast;
using std::chrono::high_resolution_clock;
using std::chrono::milliseconds;

/// Input element of the benchmark: a single point that fn() expands into four vertices.
struct Blob
{
    // Width of the stored position is chosen at compile time through USEVEC3IN.
#if USEVEC3IN
    Eigen::Vector3f position;
#else
    Eigen::Vector4f position;
#endif
    std::uint8_t colorIndex;  // copied by fn() into texCoord.z of each emitted vertex
    std::uint8_t brightness;  // filled in by main(); not read by fn()
};

/// Output element of the benchmark: one corner of the quad generated for a Blob.
struct GalaxyVertex
{
    // Width of the stored position is chosen at compile time through USEVEC3OUT.
#if USEVEC3OUT
    Eigen::Vector3f position;
#else
    Eigen::Vector4f position;
#endif
    // Packed per-vertex attributes:
    //   texCoord.x = x, texCoord.y = y, texCoord.z = color index, texCoord.w = alpha
    Eigen::Matrix<std::uint8_t, 4, 1> texCoord;
};

/**
 * Expands every input blob into four GalaxyVertex entries (one quad per blob).
 *
 * @param m          4x4 matrix applied to each blob position.
 * @param viewMat    3x3 matrix applied to the four unit corner offsets.
 * @param blobs      input array; elements [0, N) are read.
 * @param size       scale factor applied to the corner offsets.
 * @param alpha      value written to texCoord.w of every emitted vertex.
 * @param g_vertices output array; elements [0, 4 * N) are overwritten.
 * @param N          number of blobs to process.
 */
void fn(const Eigen::Matrix4f &m, const Eigen::Matrix3f &viewMat, Blob blobs[], float size, uint8_t alpha, GalaxyVertex g_vertices[], std::size_t N)
{
    // The four corner offsets do not depend on the blob, so they are built once
    // up front, in the same vector width that the per-blob arithmetic uses.
#if USEVEC3IN
    Eigen::Vector3f off0 = viewMat * Eigen::Vector3f(-1, -1, 0) * size;
    Eigen::Vector3f off1 = viewMat * Eigen::Vector3f( 1, -1, 0) * size;
    Eigen::Vector3f off2 = viewMat * Eigen::Vector3f( 1,  1, 0) * size;
    Eigen::Vector3f off3 = viewMat * Eigen::Vector3f(-1,  1, 0) * size;
#else
    // vec4 arithmetic: w stays 0 so that adding an offset leaves the position's w untouched.
    Eigen::Vector4f off0(Eigen::Vector4f::Zero());
    Eigen::Vector4f off1(Eigen::Vector4f::Zero());
    Eigen::Vector4f off2(Eigen::Vector4f::Zero());
    Eigen::Vector4f off3(Eigen::Vector4f::Zero());
    off0.head(3) = viewMat * Eigen::Vector3f(-1, -1, 0) * size;
    off1.head(3) = viewMat * Eigen::Vector3f( 1, -1, 0) * size;
    off2.head(3) = viewMat * Eigen::Vector3f( 1,  1, 0) * size;
    off3.head(3) = viewMat * Eigen::Vector3f(-1,  1, 0) * size;
#endif

    // `out` is the index of the first of the four vertices belonging to blob `idx`.
    for (std::size_t idx = 0, out = 0; idx < N; ++idx, out += 4)
    {
        Blob &blob = blobs[idx];

#if USEVEC3IN
        // vec3 storage: widen to vec4 for the 4x4 transform, then keep x/y/z.
        Eigen::Vector3f center = (m * Eigen::Vector4f(blob.position.x(), blob.position.y(), blob.position.z(), 1.0f)).head(3);
#else
        Eigen::Vector4f center = m * blob.position;
#endif

#if USEVEC3OUT && !USEVEC3IN
        // vec4 arithmetic with vec3 storage: only the first three components are stored.
        g_vertices[out + 0] = { (center + off0).head(3), { std::uint8_t(0),   std::uint8_t(0),   blob.colorIndex, alpha } };
        g_vertices[out + 1] = { (center + off1).head(3), { std::uint8_t(255), std::uint8_t(0),   blob.colorIndex, alpha } };
        g_vertices[out + 2] = { (center + off2).head(3), { std::uint8_t(255), std::uint8_t(255), blob.colorIndex, alpha } };
        g_vertices[out + 3] = { (center + off3).head(3), { std::uint8_t(0),   std::uint8_t(255), blob.colorIndex, alpha } };
#else
        // Arithmetic width and storage width agree: store the sum directly.
        g_vertices[out + 0] = { center + off0, { std::uint8_t(0),   std::uint8_t(0),   blob.colorIndex, alpha } };
        g_vertices[out + 1] = { center + off1, { std::uint8_t(255), std::uint8_t(0),   blob.colorIndex, alpha } };
        g_vertices[out + 2] = { center + off2, { std::uint8_t(255), std::uint8_t(255), blob.colorIndex, alpha } };
        g_vertices[out + 3] = { center + off3, { std::uint8_t(0),   std::uint8_t(255), blob.colorIndex, alpha } };
#endif
    }
}

int main()
{
Eigen::Matrix4f m(Eigen::Matrix4f::Identity());
Eigen::Matrix3f viewMat(Eigen::Matrix3f::Identity());

constexpr std::size_t N = 1024 * 10; Blob *blobs = new Blob[N]; for (std::size_t i = 0; i < N; i++) {

#if USEVEC3IN
Eigen::Vector3f position(i / 3.0, i / 2.0f, i / 3.0f);
#else
Eigen::Vector4f position(i / 5.0f, i / 4.0f, i / 3.0f, 1.0f);
#endif
blobs[i] = {position, i%15, i%17};
}

GalaxyVertex g_vertices = new GalaxyVertex[N4];

#if USEVEC3OUT
#if USEVEC3IN
std::cout << “vec3 → vec3\n”;
#else
std::cout << “vec4 → vec3\n”;
#endif
#else
std::cout << “vec4 → vec4\n”;
#endif

auto t1 = high_resolution_clock::now(); for (int i = 0; i < 100000; i++) fn(m, viewMat, blobs, 10.1f, 16, g_vertices, N); auto t2 = high_resolution_clock::now(); /* Getting number of milliseconds as a double. */ duration<double, std::milli> ms_double = t2 – t1; std::cout << ms_double.count() << “\n”; return 0;

}

To compile:


g++ -I /usr/include/eigen3 -o galaxy-test-4-3-sse -O2 -msse galaxy-test.cc -DUSEVEC3OUT=1 -DUSEVEC3IN=0