Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clang-18: Aarch64: macos: memset pattern & always_inline attribute prevents copy elision of float constants in Neon code #91863

Open
angushewlett opened this issue May 11, 2024 · 1 comment

Comments

@angushewlett
Copy link

clang 18.1 (homebrew) generates memset_pattern16 function calls when assigning a float to multiple Neon f32x4 elements in an array. This causes a serious performance regression in the scenario outlined below.

clang 17 (homebrew) and clang 18 (trunk, 18.1.0rc, aarch64-unknown-linux-gnu) do not do this, and instead perform copy elision which generates much more performant code.

The behaviour only seems to happen when __attribute__((always_inline)) is set.

The two output examples below demonstrate the bug. You can see that the second output example is much less performant, due to larger size and calls out to memset etc.

clang 17 does not demonstrate this behaviour.

Compile with:

clang simdtest2.cpp -std=c++20 -stdlib=libc++ -O3 -funroll-loops -g  -target aarch64-unknown-macos -o test.o && objdump -dS test.o

Example program:

#include <arm_neon.h>

// Repro switch.
//   0: helpers are plain `inline` and loops carry no unroll hint.
//   1: helpers are always_inline + nodebug and every loop is tagged with
//      opencl_unroll_hint; this is the configuration that produces the
//      memset_pattern16 calls shown in the second disassembly listing.
#define force_inline_unroll 0

#if force_inline_unroll
    // Applied to every vf member function.
    #define    simd_forceinline inline __attribute__((always_inline, nodebug))
    // Prefixed to every per-vector `for` loop inside vf.
    #define    unroll_n __attribute__((opencl_unroll_hint))
#else
    #define    simd_forceinline inline
    // Expands to nothing, leaving the loops un-annotated.
    #define    unroll_n
#endif


// Define an array-of-Neon-vector class with a conversion from float, and a multiply operator.
// vf<N>: an array of N Neon float32x4_t vectors, aligned to 16 bytes.
// The constructor taking a float is non-explicit, so a scalar operand such as
// the 5.f in `a * 5.f` is implicitly converted to a temporary vf.
template <int N> class alignas(16) vf
{
public:
    // Storage: N four-lane single-precision vectors.
    float32x4_t m[N];
    // Broadcast constructor: fills every lane of every vector with x
    // (vdupq_n_f32 duplicates the scalar across the four lanes).
    simd_forceinline vf (float x)
    {
        unroll_n for (int i = 0; i < N; i++) m[i] = vdupq_n_f32(x);     // <= correct: detect potential copy elision. incorrect: memset_pattern16
    }
    
    // Binary constructor: m[i] = oper(q1.m[i], q2.m[i]) for each i in [0, N).
    // oper is an ordinary function pointer taking and returning float32x4_t.
    simd_forceinline vf(const vf& q1, const vf&  q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) {   unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
    
    // In-place multiply: each m[i] becomes vmulq_f32(m[i], other.m[i]).
    // Returns a const reference to *this.
    simd_forceinline const vf& operator*=(const vf<N>& other)
    {
         unroll_n for (int i = 0; i < N; i++) m[i] = vmulq_f32(m[i], other.m[i]);
         return *this;
    }
    
    // Multiply: returns a new vf built by the binary constructor with vmulq_f32;
    // neither operand is modified.
    simd_forceinline vf  operator*(const vf& m2) const  {   return vf (*this, m2, vmulq_f32);   }
};

// simd: vf instantiated with N = 4, i.e. four float32x4_t vectors (16 floats, 64 bytes).
typedef vf<4> simd;

// Repro kernel: computes (a * 5) * (b * 3) * c, vector by vector.
// The scalars 5.f, 3.f and c are each converted to a temporary simd through
// the vf(float) constructor before being multiplied.
// Parameter d is not used.
simd dosomething (simd a, simd b, float c, float d)
{
        return (a * 5.f) * (b * 3.f) * c;
}

// Entry point for the repro binary: does not call dosomething and
// unconditionally returns 1.
int main()
{
   return 1;
}

Output with #define force_inline_unroll 0:

test.o:	file format mach-o arm64

Disassembly of section __TEXT,__text:

0000000100003f50 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003f50: bd4003f0    	ldr	s16, [sp]
;     simd_forceinline vf(const vf& q1, const vf&  q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) {   unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
100003f54: 4f00f691    	fmov.4s	v17, #5.00000000
100003f58: 6e31dc00    	fmul.4s	v0, v0, v17
100003f5c: 6e31dc21    	fmul.4s	v1, v1, v17
100003f60: 6e31dc42    	fmul.4s	v2, v2, v17
100003f64: 6e31dc63    	fmul.4s	v3, v3, v17
100003f68: 4f00f511    	fmov.4s	v17, #3.00000000
100003f6c: 6e31dc84    	fmul.4s	v4, v4, v17
100003f70: 6e31dca5    	fmul.4s	v5, v5, v17
100003f74: 6e31dcc6    	fmul.4s	v6, v6, v17
100003f78: 6e31dce7    	fmul.4s	v7, v7, v17
100003f7c: 6e24dc00    	fmul.4s	v0, v0, v4
100003f80: 6e25dc21    	fmul.4s	v1, v1, v5
100003f84: 6e26dc42    	fmul.4s	v2, v2, v6
100003f88: 6e27dc63    	fmul.4s	v3, v3, v7
100003f8c: 4f909000    	fmul.4s	v0, v0, v16[0]
100003f90: 4f909021    	fmul.4s	v1, v1, v16[0]
100003f94: 4f909042    	fmul.4s	v2, v2, v16[0]
100003f98: 4f909063    	fmul.4s	v3, v3, v16[0]
; 	return (a * 5.f) * (b * 3.f) * c;
100003f9c: d65f03c0    	ret

0000000100003fa0 <_main>:
;    return 1;
100003fa0: 52800020    	mov	w0, #1
100003fa4: d65f03c0    	ret

Output with #define force_inline_unroll 1:

test.o:	file format mach-o arm64

Disassembly of section __TEXT,__text:

0000000100003e78 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003e78: d104c3ff    	sub	sp, sp, #304
100003e7c: a9116ffc    	stp	x28, x27, [sp, #272]
100003e80: a9127bfd    	stp	x29, x30, [sp, #288]
100003e84: 910483fd    	add	x29, sp, #288
100003e88: ad039fe6    	stp	q6, q7, [sp, #112]
100003e8c: ad0217e4    	stp	q4, q5, [sp, #64]
100003e90: ad000fe2    	stp	q2, q3, [sp]
100003e94: ad0103e1    	stp	q1, q0, [sp, #32]
100003e98: bd4013a0    	ldr	s0, [x29, #16]
100003e9c: 3d801be0    	str	q0, [sp, #96]
; 	return (a * 5.f) * (b * 3.f) * c;
100003ea0: 90000001    	adrp	x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x28>
100003ea4: 913e0021    	add	x1, x1, #3968
100003ea8: d10143a0    	sub	x0, x29, #80
100003eac: 52800802    	mov	w2, #64
100003eb0: 94000031    	bl	0x100003f74 <_memset_pattern16+0x100003f74>
100003eb4: ad7d87a0    	ldp	q0, q1, [x29, #-80]
100003eb8: 3dc00fe2    	ldr	q2, [sp, #48]
100003ebc: 6e20dc40    	fmul.4s	v0, v2, v0
100003ec0: 3d800fe0    	str	q0, [sp, #48]
100003ec4: 3dc00be0    	ldr	q0, [sp, #32]
100003ec8: 6e21dc00    	fmul.4s	v0, v0, v1
100003ecc: 3d800be0    	str	q0, [sp, #32]
; 	return (a * 5.f) * (b * 3.f) * c;
100003ed0: ad7e87a0    	ldp	q0, q1, [x29, #-48]
100003ed4: 3dc003e2    	ldr	q2, [sp]
100003ed8: 6e20dc40    	fmul.4s	v0, v2, v0
100003edc: 3d8003e0    	str	q0, [sp]
100003ee0: 3dc007e0    	ldr	q0, [sp, #16]
100003ee4: 6e21dc00    	fmul.4s	v0, v0, v1
100003ee8: 3d8007e0    	str	q0, [sp, #16]
; 	return (a * 5.f) * (b * 3.f) * c;
100003eec: 90000001    	adrp	x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x74>
100003ef0: 913e4021    	add	x1, x1, #3984
100003ef4: 910243e0    	add	x0, sp, #144
100003ef8: 52800802    	mov	w2, #64
100003efc: 9400001e    	bl	0x100003f74 <_memset_pattern16+0x100003f74>
100003f00: ad4487e0    	ldp	q0, q1, [sp, #144]
100003f04: 3dc013e2    	ldr	q2, [sp, #64]
100003f08: 6e20dc40    	fmul.4s	v0, v2, v0
100003f0c: 3dc017e2    	ldr	q2, [sp, #80]
100003f10: 6e21dc41    	fmul.4s	v1, v2, v1
100003f14: ad458fe2    	ldp	q2, q3, [sp, #176]
100003f18: 3dc01fe4    	ldr	q4, [sp, #112]
; 	return (a * 5.f) * (b * 3.f) * c;
100003f1c: 6e22dc82    	fmul.4s	v2, v4, v2
100003f20: 3dc023e4    	ldr	q4, [sp, #128]
100003f24: 6e23dc83    	fmul.4s	v3, v4, v3
100003f28: 3dc00fe4    	ldr	q4, [sp, #48]
100003f2c: 6e20dc80    	fmul.4s	v0, v4, v0
100003f30: 3dc00be4    	ldr	q4, [sp, #32]
100003f34: 6e21dc81    	fmul.4s	v1, v4, v1
100003f38: 3dc003e4    	ldr	q4, [sp]
100003f3c: 6e22dc82    	fmul.4s	v2, v4, v2
100003f40: 3dc007e4    	ldr	q4, [sp, #16]
100003f44: 6e23dc83    	fmul.4s	v3, v4, v3
100003f48: 3dc01be4    	ldr	q4, [sp, #96]
100003f4c: 4f849000    	fmul.4s	v0, v0, v4[0]
100003f50: 4f849021    	fmul.4s	v1, v1, v4[0]
100003f54: 4f849042    	fmul.4s	v2, v2, v4[0]
100003f58: 4f849063    	fmul.4s	v3, v3, v4[0]
100003f5c: a9527bfd    	ldp	x29, x30, [sp, #288]
100003f60: a9516ffc    	ldp	x28, x27, [sp, #272]
100003f64: 9104c3ff    	add	sp, sp, #304
100003f68: d65f03c0    	ret

0000000100003f6c <_main>:
;    return 1;
100003f6c: 52800020    	mov	w0, #1
100003f70: d65f03c0    	ret

Disassembly of section __TEXT,__stubs:

0000000100003f74 <__stubs>:
100003f74: b0000010    	adrp	x16, 0x100004000 <__stubs+0x4>
100003f78: f9400210    	ldr	x16, [x16]
100003f7c: d61f0200    	br	x16
@llvmbot
Copy link
Collaborator

llvmbot commented May 11, 2024

@llvm/issue-subscribers-backend-aarch64

Author: None (angushewlett)

clang 18.1 (homebrew) generates memset_pattern16 function calls when assigning a float to multiple Neon f32x4 elements in an array. This causes a serious performance regression in the scenario outlined below.

clang 17 (homebrew) and clang 18 (trunk, 18.1.0rc, aarch64-unknown-linux-gnu) do not do this, and instead perform copy elision which generates much more performant code.

The behaviour only seems to happen when __attribute__((always_inline)) is set.

The two output examples below demonstrate the bug. You can see that the second output example is much less performant, due to larger size and calls out to memset etc.

clang 17 does not demonstrate this behaviour.

Compile with:

clang simdtest2.cpp -std=c++20 -stdlib=libc++ -O3 -funroll-loops -g  -target aarch64-unknown-macos -o test.o && objdump -dS test.o

Example program:

#include <arm_neon.h>

// Repro switch.
//   0: helpers are plain `inline` and loops carry no unroll hint.
//   1: helpers are always_inline + nodebug and every loop is tagged with
//      opencl_unroll_hint; this is the configuration that produces the
//      memset_pattern16 calls shown in the second disassembly listing.
#define force_inline_unroll 0

#if force_inline_unroll
    // Applied to every vf member function.
    #define    simd_forceinline inline __attribute__((always_inline, nodebug))
    // Prefixed to every per-vector `for` loop inside vf.
    #define    unroll_n __attribute__((opencl_unroll_hint))
#else
    #define    simd_forceinline inline
    // Expands to nothing, leaving the loops un-annotated.
    #define    unroll_n
#endif


// Define an array-of-Neon-vector class with a conversion from float, and a multiply operator.
// vf<N>: an array of N Neon float32x4_t vectors, aligned to 16 bytes.
// The constructor taking a float is non-explicit, so a scalar operand such as
// the 5.f in `a * 5.f` is implicitly converted to a temporary vf.
template <int N> class alignas(16) vf
{
public:
    // Storage: N four-lane single-precision vectors.
    float32x4_t m[N];
    // Broadcast constructor: fills every lane of every vector with x
    // (vdupq_n_f32 duplicates the scalar across the four lanes).
    simd_forceinline vf (float x)
    {
        unroll_n for (int i = 0; i < N; i++) m[i] = vdupq_n_f32(x);     // <= correct: detect potential copy elision. incorrect: memset_pattern16
    }
    
    // Binary constructor: m[i] = oper(q1.m[i], q2.m[i]) for each i in [0, N).
    // oper is an ordinary function pointer taking and returning float32x4_t.
    simd_forceinline vf(const vf& q1, const vf&  q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) {   unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
    
    // In-place multiply: each m[i] becomes vmulq_f32(m[i], other.m[i]).
    // Returns a const reference to *this.
    simd_forceinline const vf& operator*=(const vf<N>& other)
    {
         unroll_n for (int i = 0; i < N; i++) m[i] = vmulq_f32(m[i], other.m[i]);
         return *this;
    }
    
    // Multiply: returns a new vf built by the binary constructor with vmulq_f32;
    // neither operand is modified.
    simd_forceinline vf  operator*(const vf& m2) const  {   return vf (*this, m2, vmulq_f32);   }
};

// simd: vf instantiated with N = 4, i.e. four float32x4_t vectors (16 floats, 64 bytes).
typedef vf<4> simd;

// Repro kernel: computes (a * 5) * (b * 3) * c, vector by vector.
// The scalars 5.f, 3.f and c are each converted to a temporary simd through
// the vf(float) constructor before being multiplied.
// Parameter d is not used.
simd dosomething (simd a, simd b, float c, float d)
{
        return (a * 5.f) * (b * 3.f) * c;
}

// Entry point for the repro binary: does not call dosomething and
// unconditionally returns 1.
int main()
{
   return 1;
}

Output with #define force_inline_unroll 0:

test.o:	file format mach-o arm64

Disassembly of section __TEXT,__text:

0000000100003f50 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003f50: bd4003f0    	ldr	s16, [sp]
;     simd_forceinline vf(const vf& q1, const vf&  q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) {   unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
100003f54: 4f00f691    	fmov.4s	v17, #5.00000000
100003f58: 6e31dc00    	fmul.4s	v0, v0, v17
100003f5c: 6e31dc21    	fmul.4s	v1, v1, v17
100003f60: 6e31dc42    	fmul.4s	v2, v2, v17
100003f64: 6e31dc63    	fmul.4s	v3, v3, v17
100003f68: 4f00f511    	fmov.4s	v17, #3.00000000
100003f6c: 6e31dc84    	fmul.4s	v4, v4, v17
100003f70: 6e31dca5    	fmul.4s	v5, v5, v17
100003f74: 6e31dcc6    	fmul.4s	v6, v6, v17
100003f78: 6e31dce7    	fmul.4s	v7, v7, v17
100003f7c: 6e24dc00    	fmul.4s	v0, v0, v4
100003f80: 6e25dc21    	fmul.4s	v1, v1, v5
100003f84: 6e26dc42    	fmul.4s	v2, v2, v6
100003f88: 6e27dc63    	fmul.4s	v3, v3, v7
100003f8c: 4f909000    	fmul.4s	v0, v0, v16[0]
100003f90: 4f909021    	fmul.4s	v1, v1, v16[0]
100003f94: 4f909042    	fmul.4s	v2, v2, v16[0]
100003f98: 4f909063    	fmul.4s	v3, v3, v16[0]
; 	return (a * 5.f) * (b * 3.f) * c;
100003f9c: d65f03c0    	ret

0000000100003fa0 <_main>:
;    return 1;
100003fa0: 52800020    	mov	w0, #1
100003fa4: d65f03c0    	ret

Output with #define force_inline_unroll 1:

test.o:	file format mach-o arm64

Disassembly of section __TEXT,__text:

0000000100003e78 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003e78: d104c3ff    	sub	sp, sp, #304
100003e7c: a9116ffc    	stp	x28, x27, [sp, #272]
100003e80: a9127bfd    	stp	x29, x30, [sp, #288]
100003e84: 910483fd    	add	x29, sp, #288
100003e88: ad039fe6    	stp	q6, q7, [sp, #112]
100003e8c: ad0217e4    	stp	q4, q5, [sp, #64]
100003e90: ad000fe2    	stp	q2, q3, [sp]
100003e94: ad0103e1    	stp	q1, q0, [sp, #32]
100003e98: bd4013a0    	ldr	s0, [x29, #16]
100003e9c: 3d801be0    	str	q0, [sp, #96]
; 	return (a * 5.f) * (b * 3.f) * c;
100003ea0: 90000001    	adrp	x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x28>
100003ea4: 913e0021    	add	x1, x1, #3968
100003ea8: d10143a0    	sub	x0, x29, #80
100003eac: 52800802    	mov	w2, #64
100003eb0: 94000031    	bl	0x100003f74 <_memset_pattern16+0x100003f74>
100003eb4: ad7d87a0    	ldp	q0, q1, [x29, #-80]
100003eb8: 3dc00fe2    	ldr	q2, [sp, #48]
100003ebc: 6e20dc40    	fmul.4s	v0, v2, v0
100003ec0: 3d800fe0    	str	q0, [sp, #48]
100003ec4: 3dc00be0    	ldr	q0, [sp, #32]
100003ec8: 6e21dc00    	fmul.4s	v0, v0, v1
100003ecc: 3d800be0    	str	q0, [sp, #32]
; 	return (a * 5.f) * (b * 3.f) * c;
100003ed0: ad7e87a0    	ldp	q0, q1, [x29, #-48]
100003ed4: 3dc003e2    	ldr	q2, [sp]
100003ed8: 6e20dc40    	fmul.4s	v0, v2, v0
100003edc: 3d8003e0    	str	q0, [sp]
100003ee0: 3dc007e0    	ldr	q0, [sp, #16]
100003ee4: 6e21dc00    	fmul.4s	v0, v0, v1
100003ee8: 3d8007e0    	str	q0, [sp, #16]
; 	return (a * 5.f) * (b * 3.f) * c;
100003eec: 90000001    	adrp	x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x74>
100003ef0: 913e4021    	add	x1, x1, #3984
100003ef4: 910243e0    	add	x0, sp, #144
100003ef8: 52800802    	mov	w2, #64
100003efc: 9400001e    	bl	0x100003f74 <_memset_pattern16+0x100003f74>
100003f00: ad4487e0    	ldp	q0, q1, [sp, #144]
100003f04: 3dc013e2    	ldr	q2, [sp, #64]
100003f08: 6e20dc40    	fmul.4s	v0, v2, v0
100003f0c: 3dc017e2    	ldr	q2, [sp, #80]
100003f10: 6e21dc41    	fmul.4s	v1, v2, v1
100003f14: ad458fe2    	ldp	q2, q3, [sp, #176]
100003f18: 3dc01fe4    	ldr	q4, [sp, #112]
; 	return (a * 5.f) * (b * 3.f) * c;
100003f1c: 6e22dc82    	fmul.4s	v2, v4, v2
100003f20: 3dc023e4    	ldr	q4, [sp, #128]
100003f24: 6e23dc83    	fmul.4s	v3, v4, v3
100003f28: 3dc00fe4    	ldr	q4, [sp, #48]
100003f2c: 6e20dc80    	fmul.4s	v0, v4, v0
100003f30: 3dc00be4    	ldr	q4, [sp, #32]
100003f34: 6e21dc81    	fmul.4s	v1, v4, v1
100003f38: 3dc003e4    	ldr	q4, [sp]
100003f3c: 6e22dc82    	fmul.4s	v2, v4, v2
100003f40: 3dc007e4    	ldr	q4, [sp, #16]
100003f44: 6e23dc83    	fmul.4s	v3, v4, v3
100003f48: 3dc01be4    	ldr	q4, [sp, #96]
100003f4c: 4f849000    	fmul.4s	v0, v0, v4[0]
100003f50: 4f849021    	fmul.4s	v1, v1, v4[0]
100003f54: 4f849042    	fmul.4s	v2, v2, v4[0]
100003f58: 4f849063    	fmul.4s	v3, v3, v4[0]
100003f5c: a9527bfd    	ldp	x29, x30, [sp, #288]
100003f60: a9516ffc    	ldp	x28, x27, [sp, #272]
100003f64: 9104c3ff    	add	sp, sp, #304
100003f68: d65f03c0    	ret

0000000100003f6c <_main>:
;    return 1;
100003f6c: 52800020    	mov	w0, #1
100003f70: d65f03c0    	ret

Disassembly of section __TEXT,__stubs:

0000000100003f74 <__stubs>:
100003f74: b0000010    	adrp	x16, 0x100004000 <__stubs+0x4>
100003f78: f9400210    	ldr	x16, [x16]
100003f7c: d61f0200    	br	x16

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants