Different results on different GPUs (Stress test) #8191

soufianekhiat · 2024-04-12T15:57:07Z

I am trying to reproduce this:
www.shadertoy.com/view/DdKBzy
https://docs.google.com/spreadsheets/d/1xgawYWYQlfzUGyZgxV9Ug9qwfCCQPKszhYBCimyBqWE/edit
The goal is to stress-test transcendental functions on GPUs, and maybe add a new test.
I got this on D3D12:

And this with OpenCL (or CPU):

With a 4090.
Compared to the ShaderToy version:

What surprises me are the discontinuities.
Do we have an API to disable all the internal simplification, to help me debug?
Let me know if I missed something (or made a mistake in my implementation).

Full source code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <iostream>
#include <vector>

#include <Halide.h>

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include <stb_image_write.h>

#pragma comment( lib, "Halide.lib" )

using namespace Halide;

typedef float f32;

// Two-component vector whose lanes are Halide expressions, so that
// GLSL-style vec2 code can be written almost verbatim.
class float2
{
public:
	// Both lanes start out as undef<f32>().
	float2()
		: x(undef< f32 >()), y(undef< f32 >())
	{
	}

	// Splat: the same expression is stored in both lanes.
	float2(Expr _x)
		: x(_x), y(_x)
	{
	}

	// Lane-by-lane construction.
	float2(Expr _x, Expr _y)
		: x(_x), y(_y)
	{
	}

	// Copies both lanes of another vector.
	float2(float2 const& v)
		: x(v.x), y(v.y)
	{
	}

	// Assigning a single expression overwrites both lanes with it.
	float2& operator=(Expr t)
	{
		x = t;
		y = t;
		return *this;
	}

	// Lane-wise assignment from another vector (taken by value).
	float2& operator=(float2 v)
	{
		x = v.x;
		y = v.y;
		return *this;
	}

	Expr x; // first lane
	Expr y; // second lane
};

// Component-wise arithmetic between two float2 values.
float2 operator+(float2 a, float2 b)
{
	return { a.x + b.x, a.y + b.y };
}

float2 operator-(float2 a, float2 b)
{
	return { a.x - b.x, a.y - b.y };
}

float2 operator*(float2 a, float2 b)
{
	return { a.x * b.x, a.y * b.y };
}

float2 operator/(float2 a, float2 b)
{
	return { a.x / b.x, a.y / b.y };
}

// Mixed vector/scalar arithmetic: the scalar expression is applied to each
// lane, with the operand order of the call preserved.
float2 operator+(float2 a, Expr t)
{
	return { a.x + t, a.y + t };
}

float2 operator+(Expr t, float2 a)
{
	return { t + a.x, t + a.y };
}

float2 operator-(float2 a, Expr t)
{
	return { a.x - t, a.y - t };
}

float2 operator-(Expr t, float2 a)
{
	return { t - a.x, t - a.y };
}

float2 operator*(float2 a, Expr t)
{
	return { a.x * t, a.y * t };
}

float2 operator*(Expr t, float2 a)
{
	return { t * a.x, t * a.y };
}

float2 operator/(float2 a, Expr t)
{
	return { a.x / t, a.y / t };
}

float2 operator/(Expr t, float2 a)
{
	return { t / a.x, t / a.y };
}

// Inner product of two float2 values: a.x * b.x + a.y * b.y.
Expr dot(float2 a, float2 b)
{
	Expr xx = a.x * b.x;
	Expr yy = a.y * b.y;
	return xx + yy;
}

// Lane-wise trigonometric functions; each lane goes through the scalar Expr overload.
float2 cos(float2 v)
{
	return { cos(v.x), cos(v.y) };
}

float2 sin(float2 v)
{
	return { sin(v.x), sin(v.y) };
}

float2 tan(float2 v)
{
	return { tan(v.x), tan(v.y) };
}
// Lane-wise fractional part with GLSL semantics: x - floor(x), which is
// non-negative for negative inputs as well.
// The scalar fract(Expr) is deliberately NOT used here: Halide documents it as
// returning a value with the same sign as its argument, which differs from
// GLSL's fract for negative inputs (the hash in noiseMethod feeds it
// sin(...) * k, which can be negative) and so would diverge from the
// ShaderToy reference this file ports.
float2 fract(float2 v)
{
	return float2(v.x - floor(v.x), v.y - floor(v.y));
}
// Lane-wise floor.
float2 floor(float2 v)
{
	return { floor(v.x), floor(v.y) };
}

// Euclidean length: sqrt(dot(v, v)).
Expr length(float2 v)
{
	Expr squared = dot(v, v);
	return sqrt(squared);
}

// Raises each lane to the same exponent t.
float2 pow(float2 v, Expr t)
{
	return { pow(v.x, t), pow(v.y, t) };
}
// Scalar interpolation helper. Flip the preprocessor condition to switch
// between the built-in lerp and the hand-written formula.
#if 1
Expr lerp2(Expr a, Expr b, Expr t)
{
	// Delegate directly to the built-in lerp.
	Expr blended = lerp(a, b, t);
	return blended;
}
#else
Expr lerp2(Expr a, Expr b, Expr t)
{
	// Mimic the GPU behavior: (1 - t) * a + t * b, with 1 cast to a's type.
	Expr one = cast(a.type(), 1.0f);
	return (one - t) * a + t * b;
}
#endif
// Lane-wise interpolation; both lanes share the same weight t.
float2 lerp2(float2 a, float2 b, Expr t)
{
	return { lerp2(a.x, b.x, t), lerp2(a.y, b.y, t) };
}

// Clamps both lanes to the same scalar bounds.
float2 clamp(float2 a, Expr min, Expr max)
{
	return { clamp(a.x, min, max), clamp(a.y, min, max) };
}

// Clamps each lane to the matching lane of the bounds.
float2 clamp(float2 a, float2 min, float2 max)
{
	return { clamp(a.x, min.x, max.x), clamp(a.y, min.y, max.y) };
}
// Three-way sign: 1 for x > 0, -1 for x < 0, otherwise 0.
// All constants are cast to the type of x.
// (A two-way variant returning -1 for every non-positive x was tried and
// left disabled by the original author.)
Expr sign(Expr x)
{
	const auto ty = x.type();
	Expr zero = cast(ty, 0.0f);

	Expr negativeOrZero = select(x < zero, cast(ty, -1.0f), cast(ty, 0.0f));
	return select(x > zero, cast(ty, 1.0f), negativeOrZero);
}

// Cubic smoothstep: t*t*(3 - 2t), where t is (x - edge0) / (edge1 - edge0)
// clamped to [0, 1].
Expr smoothstep(Expr edge0, Expr edge1, Expr x)
{
	Expr span = edge1 - edge0;
	Expr t = clamp((x - edge0) / span, 0.0f, 1.0f);
	Expr cubic = 3.0f - 2.0f * t;

	return t * t * cubic;
}

// Radial falloff mask: one minus a smoothstep (edges 0.2 and 0.5) of the
// distance from uv to (0.5, 0.5), with the smoothstep result clamped to [0, 1].
Expr getFalloff(float2 uv)
{
	float2 centred = uv - 0.5f;
	Expr land0 = smoothstep(0.2f, 0.5f, length(centred));
	Expr clamped = clamp(land0, 0.0f, 1.0f);

	return 1.0f - clamped;
}

// Single-octave gradient noise built from a sin-based hash, remapped to [0, 1].
// The lattice gradients come from fract(sin(dot(...)) * large constant), which
// is the part that stresses transcendental-function precision.
Expr noiseMethod(float2 uv)
{
	// Tunables. (Lacunarity = 2.0 and gain = 0.5 existed in the original but
	// were disabled.)
	Expr amplitude = 1.5f; // scales the raw noise before clamping
	Expr frequency = 2.0f; // scales the input coordinates
	Expr power = 1.0f;     // exponent applied to the remapped result

	// Hash constants.
	Expr fpA = 127.1f;
	Expr fpB = 311.7f;
	Expr fpC = 269.5f;
	Expr fpD = 183.3f;
	Expr fpE = 43758.5453123f;

	// Pseudo-random gradient, with lanes in [-1, 1] for a fract() in [0, 1],
	// for one lattice corner.
	auto gradientAt = [&](float2 corner) -> float2 {
		Expr hx = dot(corner, float2(fpA, fpB));
		Expr hy = dot(corner, float2(fpC, fpD));
		return -1.0f + 2.0f * fract(sin(float2(hx, hy)) * fpE);
	};

	// Split the scaled coordinate into lattice cell and position within it.
	float2 scaled = uv * frequency;
	float2 cell = floor(scaled);
	float2 frac = fract(scaled);

	// Quintic fade curve: 6f^5 - 15f^4 + 10f^3.
	float2 fade = frac * frac * frac * (frac * (frac * 6.0f - 15.0f) + 10.0f);

	// Gradients at the four corners of the cell.
	float2 g00 = gradientAt(cell + float2(0.0f, 0.0f));
	float2 g10 = gradientAt(cell + float2(1.0f, 0.0f));
	float2 g01 = gradientAt(cell + float2(0.0f, 1.0f));
	float2 g11 = gradientAt(cell + float2(1.0f, 1.0f));

	// Projection of each gradient onto the offset from its corner.
	Expr A = dot(g00, frac - float2(0.0f, 0.0f));
	Expr B = dot(g10, frac - float2(1.0f, 0.0f));
	Expr C = dot(g01, frac - float2(0.0f, 1.0f));
	Expr D = dot(g11, frac - float2(1.0f, 1.0f));

	// Bilinear blend using the fade weights.
	Expr noise = (lerp2(lerp2(A, B, fade.x), lerp2(C, D, fade.x), fade.y));

	Expr value = 1.0f;
	value *= amplitude * noise;
	value = clamp(value, -1.0f, 1.0f);

	// Remap [-1, 1] to [0, 1], then apply the sharpness exponent.
	return pow(value * 0.5f + 0.5f, power);
}

// Builds the Halide Func that renders the test image.
//   resolution: image width/height as expressions; pixel coordinates are
//               divided by (resolution - 1).
//   gpu:        true  -> schedule with 8x8 GPU tiles (GuardWithIf tails),
//               false -> parallelise over y.
// Returns the scheduled Func (named "main").
Func mainImage(float2 resolution, bool gpu = true)
{
	Var x{ "x" };
	Var y{ "y" };

	// Normalised coordinates: pixel index divided by (resolution - 1).
	Expr u = cast<f32>(x) / (resolution.x - 1.0f);
	Expr v = cast<f32>(y) / (resolution.y - 1.0f);
	float2 uv(u, v);

	Expr ScaleValue = 2.0f;
	Expr OffsetValue = 100.0f;

	// Noise sampled at scaled/offset coordinates, masked by the radial falloff.
	Expr a = noiseMethod(uv * ScaleValue + OffsetValue);
	Expr b = getFalloff(uv) * a;

	// Threshold the masked noise at 0.5, then add 0.83 * (sign(u - 1) * 0.5 + 0.5).
	Expr col = 1.0f;
	col *= select(b < .5f, 0.f, 1.f);
	col += 0.83f * (sign(uv.x - 1.0f) * 0.5f + 0.5f);

	Func image{ "main" };
	image(x, y) = col;

	// Trivial scheduling
	Var tx{ "tx" };
	Var ty{ "ty" };
	if (!gpu)
	{
		image.parallel(y);
	}
	else
	{
		image.gpu_tile(x, y, tx, ty, 8, 8, TailStrategy::GuardWithIf);
	}

	return image;
}

int main()
{
	try
	{
		//
		constexpr bool gpu = true;

		constexpr int width = 512;
		constexpr f32 fwidth = static_cast<f32>(width);
		constexpr int width2 = width * width;
		std::vector<f32> output(width2);

		halide_dimension_t dim[] = {
			{ 0, width, 1, 0 },
			{ 0, width, width, 0 }
		};
		Buffer<f32> hOutput(output.data(), 2, &dim[0], "output");

		Func main = mainImage(float2(fwidth, fwidth), gpu);

		Pipeline p(main);

		Target t = get_target_from_environment();
		if (gpu)
			t = t.with_feature(Target::OpenCL);
			//t = t.with_feature(Target::D3D12Compute);

		std::cout << t.to_string() << std::endl;

		main.realize(hOutput, t);
		if (gpu)
			hOutput.copy_to_host();

		std::vector<uint8_t> imgOut(width2 * 4);
		for (int i = 0; i < width2; ++i)
		{
			imgOut[4 * i + 0] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 1] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 2] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 3] = 255;
		}

		stbi_write_png("output.png", width, width, 4, imgOut.data(), 0);
	}
	catch (Halide::Error& e)
	{
		std::cerr << e.what() << std::endl;
	}
	catch (std::exception& e)
	{
		std::cerr << e.what() << std::endl;
	}

	return 0;
}

abadams · 2024-04-12T17:06:37Z

Have you tried a CPU schedule? That might help narrow it down.

soufianekhiat · 2024-04-13T09:05:31Z

CPU gave me the same as OpenCL.
CUDA gives me (close to OpenCL):

Side note: Vulkan gives me:
Internal Error at D:\git\Halide\src\CodeGen_Vulkan_Dev.cpp:2527 triggered by user code at : map_simt_builtin called on bad variable name: main.s0.y.y.block_id_y

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Different results on different GPUs (Stress test) #8191

Different results on different GPUs (Stress test) #8191

soufianekhiat commented Apr 12, 2024 •

edited

abadams commented Apr 12, 2024

soufianekhiat commented Apr 13, 2024

Different results on different GPUs (Stress test) #8191

Different results on different GPUs (Stress test) #8191

Comments

soufianekhiat commented Apr 12, 2024 • edited

abadams commented Apr 12, 2024

soufianekhiat commented Apr 13, 2024

soufianekhiat commented Apr 12, 2024 •

edited