Speed comparison for vec3 vs vec4 arithmetic

The code in comparison is based on galaxy.cpp (point to quad conversion in render function).

Four modes were tested:

4 → 4	Both Blob::position and GalaxyVertex::position are vec4, all calculations done using vec4
3 → 3	Both Blob::position and GalaxyVertex::position are vec3, all calculations done using vec3
4 → 3	Blob::position is vec4, GalaxyVertex::position is vec3; calculations are done using vec4 and the results are then stored to vec3
3 → 4 → 3	Both stored as vec3 but for calculation converted to vec4

The test program was compiled in 3 modes for ARM64 (Oracle cloud VM):

-O2
-O3
-O3 -march=native

And in 5 modes for Intel (i5 mobile 8th gen):

-O2
-O2 -msse
-O3
-O3 -march=native -msse4
-O3 -march=native -mavx

Results (ms):

CPU	Optimization	4 → 4	4 → 3	3 → 3	3 → 4 → 3
ARM	-O2	15050.80	10115.40	9513.18	10185.00
	-O3	7760.55	8150.00	8403.77	8846.66
	-O3 -march=native	7768.69	8082.59	8366.02	8818.15
Intel	-O2	10093.30	10476.60	10367.00	10387.40
	-O2 -msse	9992.38	10496.80	10416.40	10426.10
	-O3	9711.28	11479.80	11213.50	11108.90
	-O3 -march=native -msse4	9522.09	10475.90	11250.10	11687.20
	-O3 -march=native -mavx	9515.35	10484.40	11100.40	11623.70

Conclusion:

For code targeting new hardware with heavy optimizations and vector extensions enabled, prefer vec4 and then convert to vec3 if required. Generic ARM64 prefers vec3.
For code targeting old hardware, or built without vector extensions and with medium optimizations, vec3 is sufficient (the performance drop is less than 3%) and behaves better than vec4 math followed by conversion to vec3.
Conversion from vec3 to vec4 in simple calculations doesn’t give any speedup.

Test code:


#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iostream>

#include <Eigen/Core>
#ifndef USEVEC3OUT

#error “Define USEVEC3OUT 0/1”

#endif
#ifndef USEVEC3IN

#error “Define USEVEC3IN 0/1”

#endif
using std::chrono::duration;

using std::chrono::duration_cast;

using std::chrono::high_resolution_clock;

using std::chrono::milliseconds;
struct Blob

{

#if USEVEC3IN

    Eigen::Vector3f position;

#else

    Eigen::Vector4f position;

#endif

    std::uint8_t colorIndex, brightness;

};
struct GalaxyVertex

{

#if USEVEC3OUT

    Eigen::Vector3f position;

#else

    Eigen::Vector4f position;

#endif

    Eigen::Matrix<uint8_t, 4, 1> texCoord; // texCoord.x = x, texCoord.y = y, texCoord.z = color index, texCoord.w = alpha

};
// Benchmark kernel: expands each of the N input blobs (points) into a quad of
// four vertices.
//
// For every blob the position is transformed by `m`, and four precomputed
// corner offsets are added to it. The corner offsets are `viewMat` applied to
// (-1,-1,0), (1,-1,0), (1,1,0) and (-1,1,0), each scaled by `size`.
//
// Parameters:
//   m          4x4 transform applied to every blob position.
//   viewMat    3x3 matrix used to orient the four corner offsets.
//   blobs      input array with at least N elements; only `position` and
//              `colorIndex` are read (`brightness` is not used here).
//   size       scale factor applied to the corner offsets.
//   alpha      value stored in texCoord.w of every produced vertex.
//   g_vertices output array; elements [0, 4*N) are overwritten, so it must
//              hold at least 4*N elements.
//   N          number of blobs to process.
//
// Each produced vertex gets texCoord = (x, y, colorIndex, alpha) where x and y
// are 0 or 255 depending on the corner.
void fn(const Eigen::Matrix4f &m, const Eigen::Matrix3f &viewMat, Blob blobs[], float size, uint8_t alpha, GalaxyVertex g_vertices[], std::size_t N)

{

    // Corner offsets are loop-invariant, so they are computed once up front.
#if USEVEC3IN

    // vec3 input mode: offsets are kept as three-component vectors.
    Eigen::Vector3f v0 = viewMat * Eigen::Vector3f(-1, -1, 0) * size;

    Eigen::Vector3f v1 = viewMat * Eigen::Vector3f( 1, -1, 0) * size;

    Eigen::Vector3f v2 = viewMat * Eigen::Vector3f( 1,  1, 0) * size;

    Eigen::Vector3f v3 = viewMat * Eigen::Vector3f(-1,  1, 0) * size;

#else

    // vec4 input mode: offsets are four-component vectors whose w stays 0,
    // so adding them to a transformed position leaves that position's w unchanged.
    Eigen::Vector4f v0(Eigen::Vector4f::Zero());

    Eigen::Vector4f v1(Eigen::Vector4f::Zero());

    Eigen::Vector4f v2(Eigen::Vector4f::Zero());

    Eigen::Vector4f v3(Eigen::Vector4f::Zero());

    v0.head(3) = viewMat * Eigen::Vector3f(-1, -1, 0) * size;

    v1.head(3) = viewMat * Eigen::Vector3f( 1, -1, 0) * size;

    v2.head(3) = viewMat * Eigen::Vector3f( 1,  1, 0) * size;

    v3.head(3) = viewMat * Eigen::Vector3f(-1,  1, 0) * size;

#endif
// i indexes the input blobs, j the output vertices; j advances by 4 per blob,
// so j == 4 * i at the top of every iteration.
for (std::size_t i = 0, j = 0; i < N; i++)
{
Blob &b = blobs[i];
#if USEVEC3IN

        // Promote the stored vec3 to a vec4 with w = 1 for the 4x4 transform,
        // then keep only the first three components of the result.
        Eigen::Vector3f p  = (m * Eigen::Vector4f(b.position.x(), b.position.y(), b.position.z(), 1.0f)).head(3);

#else

        // The stored vec4 is transformed directly.
        Eigen::Vector4f p  = m * b.position;

#endif
#if USEVEC3OUT

#if USEVEC3IN

        // vec3 in, vec3 out: the sums are already three-component.
        g_vertices[j+0] = { p + v0, { std::uint8_t(0),   std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+1] = { p + v1, { std::uint8_t(255), std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+2] = { p + v2, { std::uint8_t(255), std::uint8_t(255), b.colorIndex, alpha } };

        g_vertices[j+3] = { p + v3, { std::uint8_t(0),   std::uint8_t(255), b.colorIndex, alpha } };

#else

        // vec4 in, vec3 out: add in four components, store the first three.
        g_vertices[j+0] = { (p + v0).head(3), { std::uint8_t(0),   std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+1] = { (p + v1).head(3), { std::uint8_t(255), std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+2] = { (p + v2).head(3), { std::uint8_t(255), std::uint8_t(255), b.colorIndex, alpha } };

        g_vertices[j+3] = { (p + v3).head(3), { std::uint8_t(0),   std::uint8_t(255), b.colorIndex, alpha } };

#endif

#else

        // vec4 out: the sum is stored as-is. This branch requires p and v0..v3
        // to be four-component, i.e. it only matches the USEVEC3IN == 0 path above.
        g_vertices[j+0] = { p + v0, { std::uint8_t(0),   std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+1] = { p + v1, { std::uint8_t(255), std::uint8_t(0),   b.colorIndex, alpha } };

        g_vertices[j+2] = { p + v2, { std::uint8_t(255), std::uint8_t(255), b.colorIndex, alpha } };

        g_vertices[j+3] = { p + v3, { std::uint8_t(0),   std::uint8_t(255), b.colorIndex, alpha } };

#endif

        j+=4;

    }

}
int main()

{

    Eigen::Matrix4f m(Eigen::Matrix4f::Identity());

    Eigen::Matrix3f viewMat(Eigen::Matrix3f::Identity());
constexpr std::size_t N = 1024 * 10;
Blob *blobs = new Blob[N];
for (std::size_t i = 0; i < N; i++)
{
#if USEVEC3IN

        Eigen::Vector3f position(i / 3.0, i / 2.0f, i / 3.0f);

#else

        Eigen::Vector4f position(i / 5.0f, i / 4.0f, i / 3.0f, 1.0f);

#endif

        blobs[i] = {position, i%15, i%17};

    }
GalaxyVertex g_vertices = new GalaxyVertex[N4];
#if USEVEC3OUT

#if USEVEC3IN

    std::cout << “vec3 → vec3\n”;

#else

    std::cout << “vec4 → vec3\n”;

#endif

#else

    std::cout << “vec4 → vec4\n”;

#endif
auto t1 = high_resolution_clock::now();
for (int i = 0; i < 100000; i++)
fn(m, viewMat, blobs, 10.1f, 16, g_vertices, N);
auto t2 = high_resolution_clock::now();
/* Getting number of milliseconds as a double. */
duration<double, std::milli> ms_double = t2 – t1;
std::cout << ms_double.count() << “\n”;
return 0;
}

To compile:


g++ -I /usr/include/eigen3 -o galaxy-test-4-3-sse -O2 -msse galaxy-test.cc -DUSEVEC3OUT=1 -DUSEVEC3IN=0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Speed comparison for vec3 vs vec4 arithmetic

Clone this wiki locally