Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Different results on different GPUs (Stress test) #8191

Open
soufianekhiat opened this issue Apr 12, 2024 · 2 comments
Open

Different results on different GPUs (Stress test) #8191

soufianekhiat opened this issue Apr 12, 2024 · 2 comments

Comments

@soufianekhiat
Copy link

soufianekhiat commented Apr 12, 2024

I try to reproduce this:
www.shadertoy.com/view/DdKBzy
https://docs.google.com/spreadsheets/d/1xgawYWYQlfzUGyZgxV9Ug9qwfCCQPKszhYBCimyBqWE/edit
To stress out transendental functions on GPUs, to maybe add a new test.
I got this on D3D12:
D3D12
And this with OpenCL (or CPU):
OpenCL
With a 4090.
Compared to a a ShaderToy version:
image
What surprises me are the discontinuities.
Do we have an API to disable all the internal simplication to help me to debug?
Let me know if I missed something (or in my implementation).

Full source code:

#include <Halide.h>

#define STB_IMAGE_WRITE_IMPLEMENTATION
#include <stb_image_write.h>

#pragma comment( lib, "Halide.lib" )

using namespace Halide;

typedef float f32;

class float2
{
public:
	float2() :
		x(undef< f32 >())
	,	y(undef< f32 >())
	{}

	float2(Expr _x) :
		x(_x)
	,	y(_x)
	{}

	float2(Expr _x, Expr _y) :
		x(_x)
	,	y(_y)
	{}

	float2(float2 const& v) :
		x(v.x)
	,	y(v.y)
	{}

	float2& operator=(Expr t)
	{
		x = t;
		y = t;

		return *this;
	}

	float2& operator=(float2 v)
	{
		x = v.x;
		y = v.y;

		return *this;
	}

	Expr x;
	Expr y;
};

float2 operator+(float2 a, float2 b) { return float2(a.x + b.x, a.y + b.y); }
float2 operator-(float2 a, float2 b) { return float2(a.x - b.x, a.y - b.y); }
float2 operator*(float2 a, float2 b) { return float2(a.x * b.x, a.y * b.y); }
float2 operator/(float2 a, float2 b) { return float2(a.x / b.x, a.y / b.y); }

float2 operator+(float2 a, Expr t) { return float2(a.x + t, a.y + t); }
float2 operator+(Expr t, float2 a) { return float2(t + a.x, t + a.y); }

float2 operator-(float2 a, Expr t) { return float2(a.x - t, a.y - t); }
float2 operator-(Expr t, float2 a) { return float2(t - a.x, t - a.y); }

float2 operator*(float2 a, Expr t) { return float2(a.x * t, a.y * t); }
float2 operator*(Expr t, float2 a) { return float2(t * a.x, t * a.y); }

float2 operator/(float2 a, Expr t) { return float2(a.x / t, a.y / t); }
float2 operator/(Expr t, float2 a) { return float2(t / a.x, t / a.y); }

Expr dot(float2 a, float2 b) { return a.x * b.x + a.y * b.y; }

float2 cos(float2 v) { return float2(cos(v.x), cos(v.y)); }
float2 sin(float2 v) { return float2(sin(v.x), sin(v.y)); }
float2 tan(float2 v) { return float2(tan(v.x), tan(v.y)); }
float2 fract(float2 v) { return float2(fract(v.x), fract(v.y)); }
float2 floor(float2 v) { return float2(floor(v.x), floor(v.y)); }
Expr length(float2 v) { return sqrt(dot(v, v)); }

float2 pow(float2 v, Expr t) { return float2(pow(v.x, t), pow(v.y, t)); }
#if 1
Expr lerp2(Expr a, Expr b, Expr t)
{
	return lerp(a, b, t);
}
#else
Expr lerp2(Expr a, Expr b, Expr t)
{
	// Mimic the GPU behavior
	return (cast(a.type(), 1.0f) - t) * a + t * b;
}
#endif
float2 lerp2(float2 a, float2 b, Expr t) { return float2(lerp2(a.x, b.x, t), lerp2(a.y, b.y, t)); }
float2 clamp(float2 a, Expr min, Expr max) { return float2(clamp(a.x, min, max), clamp(a.y, min, max)); }
float2 clamp(float2 a, float2 min, float2 max) { return float2(clamp(a.x, min.x, max.x), clamp(a.y, min.y, max.y)); }
Expr sign(Expr x)
{
	return select(x > cast(x.type(), 0.0f), cast(x.type(), 1.0f),
		select(x < cast(x.type(), 0.0f), cast(x.type(), -1.0f), cast(x.type(), 0.0f))
	);
	//return select(x > cast(x.type(), 0.0f), cast(x.type(), 1.0f), cast(x.type(), -1.0f));
}

Expr smoothstep(Expr edge0, Expr edge1, Expr x)
{
	Expr t = clamp((x - edge0) / (edge1 - edge0), 0.0f, 1.0f);

	return t * t * (3.0f - 2.0f * t);
}

Expr getFalloff(float2 uv)
{
	Expr land0 = smoothstep(0.2f, 0.5f, length(uv - 0.5f));

	return 1.0f - clamp(land0, 0.0f, 1.0f);
}

Expr noiseMethod(float2 uv)
{
	//Expr _lacunarity = 2.0f; // Noiseness
	//Expr _gain = 0.5f; // Crispyness
	Expr _amplitude = 1.5f; // How much is does
	Expr _frequency = 2.0f; // Another kind of scale?
	Expr _power = 1.0f; // Color sharpness

	Expr _finalValue = 1.0f;

	float2 p = uv;

	Expr fpA = 127.1f; // 127.1
	Expr fpB = 311.7f; // 311.7
	Expr fpC = 269.5f; // 269.5
	Expr fpD = 183.3f; // 183.3
	Expr fpE = 43758.5453123f; // 43758.5453123

	float2 i = floor(p * _frequency);
	float2 f = fract(p * _frequency);
	float2 t = f * f * f * (f * (f * 6.0f - 15.0f) + 10.0f);
	float2 a = i + float2(0.0f, 0.0f);
	float2 b = i + float2(1.0f, 0.0f);
	float2 c = i + float2(0.0f, 1.0f);
	float2 d = i + float2(1.0f, 1.0f);
	a = -1.0f + 2.0f * fract(sin(float2(dot(a, float2(fpA, fpB)), dot(a, float2(fpC, fpD)))) * fpE);
	b = -1.0f + 2.0f * fract(sin(float2(dot(b, float2(fpA, fpB)), dot(b, float2(fpC, fpD)))) * fpE);
	c = -1.0f + 2.0f * fract(sin(float2(dot(c, float2(fpA, fpB)), dot(c, float2(fpC, fpD)))) * fpE);
	d = -1.0f + 2.0f * fract(sin(float2(dot(d, float2(fpA, fpB)), dot(d, float2(fpC, fpD)))) * fpE);
	Expr A = dot(a, f - float2(0.0f, 0.0f));
	Expr B = dot(b, f - float2(1.0f, 0.0f));
	Expr C = dot(c, f - float2(0.0f, 1.0f));
	Expr D = dot(d, f - float2(1.0f, 1.0f));
	Expr noise = (lerp2(lerp2(A, B, t.x), lerp2(C, D, t.x), t.y));
	_finalValue *= _amplitude * noise;

	_finalValue = clamp(_finalValue, -1.0f, 1.0f);
	return pow(_finalValue * 0.5f + 0.5f, _power);
}

Func mainImage(float2 resolution, bool gpu = true)
{
	Var x{ "x" };
	Var y{ "y" };

	float2 uv = { cast<f32>(x) / (resolution.x - 1.0f), cast<f32>(y) / (resolution.y - 1.0f) };

	Expr ScaleValue = 2.0f;
	Expr OffsetValue = 100.0f;

	Expr col = 1.0f;

	Expr a = noiseMethod(uv * ScaleValue + OffsetValue);
	Expr b = getFalloff(uv) * a;

	col *= select(b < .5f, 0.f, 1.f);
	col += 0.83f * (sign(uv.x - 1.0f) * 0.5f + 0.5f);

	Func main{ "main" };
	main(x, y) = col;

	Var tx{ "tx" };
	Var ty{ "ty" };
	// Trivial scheduling
	if (gpu)
		main.gpu_tile(x, y, tx, ty, 8, 8, TailStrategy::GuardWithIf);
	else
		main.parallel(y);

	return main;
}

int main()
{
	try
	{
		//
		constexpr bool gpu = true;

		constexpr int width = 512;
		constexpr f32 fwidth = static_cast<f32>(width);
		constexpr int width2 = width * width;
		std::vector<f32> output(width2);

		halide_dimension_t dim[] = {
			{ 0, width, 1, 0 },
			{ 0, width, width, 0 }
		};
		Buffer<f32> hOutput(output.data(), 2, &dim[0], "output");

		Func main = mainImage(float2(fwidth, fwidth), gpu);

		Pipeline p(main);

		Target t = get_target_from_environment();
		if (gpu)
			t = t.with_feature(Target::OpenCL);
			//t = t.with_feature(Target::D3D12Compute);

		std::cout << t.to_string() << std::endl;

		main.realize(hOutput, t);
		if (gpu)
			hOutput.copy_to_host();

		std::vector<uint8_t> imgOut(width2 * 4);
		for (int i = 0; i < width2; ++i)
		{
			imgOut[4 * i + 0] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 1] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 2] = static_cast<uint8_t>(std::round(std::clamp(output[i], 0.0f, 1.0f) * 255.0f));
			imgOut[4 * i + 3] = 255;
		}

		stbi_write_png("output.png", width, width, 4, imgOut.data(), 0);
	}
	catch (Halide::Error& e)
	{
		std::cerr << e.what() << std::endl;
	}
	catch (std::exception& e)
	{
		std::cerr << e.what() << std::endl;
	}

	return 0;
}
@abadams
Copy link
Member

abadams commented Apr 12, 2024

Have you tried a CPU schedule? That might help narrow it down.

@soufianekhiat
Copy link
Author

CPU gave me the same as OpenCL.
CUDA gives me (close to OpenCL):
CUDA
Sidenote vulkan give me:
Internal Error at D:\git\Halide\src\CodeGen_Vulkan_Dev.cpp:2527 triggered by user code at : map_simt_builtin called on bad variable name: main.s0.y.y.block_id_y

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants