Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: simd neon #133

Merged
merged 1 commit into from Apr 25, 2023
Merged

perf: simd neon #133

merged 1 commit into from Apr 25, 2023

Conversation

AaronO
Copy link
Contributor

@AaronO AaronO commented Apr 15, 2023

First pass at neon support, building off #132

I shared some promising benches here: #123 (comment)

Overall it's a net-positive but not totally optimal.

Next steps we would want to land #123 and then refactor it into a SWAR SIMD backend and stack them appropriately to avoid duplicated work and cleaner logic.

Additionally, offsetz is very much central to total perf. I believe the vectorized implementation I added underperforms due to inefficiently reloading the offset lookup table instead of keeping it in a register at the top of the header parsing loop.

@AaronO
Copy link
Contributor Author

AaronO commented Apr 15, 2023

ASM of vectorized offsetz

In context of match_uri_vectored

// Compiler output (AArch64, Mach-O / Apple assembler syntax) for
// httparse::simd::neon::match_uri_vectored with the *vectorized* offsetz.
// Each loop iteration classifies 16 URI bytes with NEON, reduces the
// per-lane match mask to a single offset via umaxv, and advances the
// parser position stored at [x0, #16].
// NOTE(review): comments below describe the visible instruction stream;
// the exact byte-class semantics depend on lCPI14_0's contents — confirm
// against src/simd/neon.rs.
.section __TEXT,__text,regular,pure_instructions
        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!       // standard non-leaf prologue: save FP/LR
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        mov x8, x0                      // x8 = &Bytes struct (self)
        ldr x0, [x0, #16]               // x0 = current position (offset field)
        ldp x9, x1, [x8]                // x9 = buffer ptr, x1 = buffer len
        movi.16b v0, #129               // splat constants used for range test
        movi.16b v1, #162
        movi.16b v2, #253
        movi.16b v3, #60                // 60 = '<' — presumably the cmeq target after masking
Lloh22:
        adrp x10, lCPI14_0@PAGE         // load 16-byte lane-weight table once,
Lloh23:
        ldr q4, [x10, lCPI14_0@PAGEOFF] // kept live in q4 across the loop
        mov w10, #16                    // w10 = chunk width, loop-invariant
LBB14_1:                                // main 16-bytes-per-iteration loop
        subs x11, x1, x0                // x11 = remaining = len - pos
        b.lo LBB14_5                    // pos > len: bounds-check failure path
        cmp x11, #15
        b.ls LBB14_4                    // fewer than 16 bytes left: done here
        ldr q5, [x9, x0]                // load 16 input bytes
        add.16b v6, v5, v0              // biased add + unsigned compare:
        cmhi.16b v6, v1, v6             // single-range test per byte
        and.16b v5, v5, v2              // mask then equality test for a second
        cmeq.16b v5, v5, v3             // byte class
        orr.16b v5, v6, v5              // combine: 0xFF lanes = non-URI byte
        and.16b v5, v5, v4              // weight lanes by table in q4
        umaxv.16b b5, v5                // horizontal max → position info in b5
        fmov w11, s5                    // move reduction result to GPR
        sub w11, w10, w11, uxtb         // w11 = 16 - max = bytes consumed this chunk
        add x0, x11, x0                 // advance position
        str x0, [x8, #16]               // persist position back to self
        cmp w11, #16
        b.eq LBB14_1                    // full chunk matched: keep scanning
LBB14_4:
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16         // epilogue: restore FP/LR
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_5:                                // cold path: slice index out of range
        .cfi_restore_state
Lloh24:
        adrp x2, l___unnamed_15@PAGE    // panic location argument
Lloh25:
        add x2, x2, l___unnamed_15@PAGEOFF
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpLdr    Lloh22, Lloh23
        .loh AdrpAdd    Lloh24, Lloh25

ASM of unvectorized offsetz

        // Compiler output (AArch64, Mach-O / Apple assembler syntax) for
        // httparse::simd::neon::match_uri_vectored with the *unvectorized*
        // offsetz. The NEON byte classification is identical to the
        // vectorized variant, but the first-mismatch offset is found by
        // moving each 64-bit half of the mask to a GPR and scanning its 8
        // bytes with a branchy tst chain (two copies: low half at LBB14_2+,
        // high half starting at the first tst block).
        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!       // non-leaf prologue: save FP/LR
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        ldp x1, x8, [x0, #8]            // x1 = buffer len, x8 = current position
        ldr x9, [x0]                    // x9 = buffer ptr
        movi.16b v0, #129               // same classification constants as the
        movi.16b v1, #162               // vectorized listing
        movi.16b v2, #253
        movi.16b v3, #60
        b LBB14_2

        // (unlabeled block) all 16 bytes matched: advance by 16 and re-loop
        mov w10, #16
        add x8, x10, x8
        str x8, [x0, #16]
        cmp x10, #16
        b.ne LBB14_42
LBB14_2:                                // loop head
        subs x10, x1, x8                // x10 = remaining = len - pos
        b.lo LBB14_43                   // pos > len: bounds-check failure
        cmp x10, #15
        b.ls LBB14_42                   // < 16 bytes left: return
        ldr q4, [x9, x8]                // load and classify 16 bytes
        add.16b v5, v4, v0
        cmhi.16b v5, v1, v5
        and.16b v4, v4, v2
        cmeq.16b v4, v4, v3
        orr.16b v4, v5, v4              // v4 lanes: 0xFF = non-URI byte
        fmov x10, d4                    // x10 = low 8 lanes as a u64
        cbnz x10, LBB14_24              // mismatch in low half → low-half scan
        mov.d x10, v4[1]                // x10 = high 8 lanes
        cbz x10, LBB14_1                // no mismatch at all → advance-by-16 block
        // --- byte-by-byte scan of the HIGH half: find lowest 0xFF byte ---
        tst x10, #0xff
        b.eq LBB14_8
        mov x10, #0                     // mismatch at high-half byte 0
        b LBB14_23
LBB14_8:
        tst x10, #0xff00
        b.eq LBB14_10
        mov w10, #1
        b LBB14_23
LBB14_10:
        tst x10, #0xff0000
        b.eq LBB14_12
        mov w10, #2
        b LBB14_23
LBB14_12:
        tst x10, #0xff000000
        b.eq LBB14_14
        mov w10, #3
        b LBB14_23
LBB14_14:
        tst x10, #0xff00000000
        b.eq LBB14_16
        mov w10, #4
        b LBB14_23
LBB14_16:
        tst x10, #0xff0000000000
        b.eq LBB14_18
        mov w10, #5
        b LBB14_23
LBB14_18:
        tst x10, #0xff000000000000
        b.eq LBB14_20
        mov w10, #6
        b LBB14_23
LBB14_20:
        lsr x10, x10, #56               // top byte
        cbnz x10, LBB14_22
        mov w10, #8                     // no byte set (shouldn't happen here)
        b LBB14_23

        mov w10, #7                     // (LBB14_22) mismatch at byte 7
LBB14_23:
        add x10, x10, #8                // high half → overall index = 8 + byte index
        add x8, x10, x8                 // advance position to first mismatch
        str x8, [x0, #16]
        cmp x10, #16
        b.eq LBB14_2
        b LBB14_42

        // --- (LBB14_24) same scan for the LOW half, result in x9 ---
        tst x10, #0xff
        b.eq LBB14_26
        mov x9, #0
        b LBB14_41
LBB14_26:
        tst x10, #0xff00
        b.eq LBB14_28
        mov w9, #1
        b LBB14_41
LBB14_28:
        tst x10, #0xff0000
        b.eq LBB14_30
        mov w9, #2
        b LBB14_41
LBB14_30:
        tst x10, #0xff000000
        b.eq LBB14_32
        mov w9, #3
        b LBB14_41
LBB14_32:
        tst x10, #0xff00000000
        b.eq LBB14_34
        mov w9, #4
        b LBB14_41
LBB14_34:
        tst x10, #0xff0000000000
        b.eq LBB14_36
        mov w9, #5
        b LBB14_41
LBB14_36:
        tst x10, #0xff000000000000
        b.eq LBB14_38
        mov w9, #6
        b LBB14_41
LBB14_38:
        lsr x9, x10, #56
        cbnz x9, LBB14_40
        mov w9, #8
        b LBB14_41

        mov w9, #7                      // (LBB14_40)
LBB14_41:
        add x8, x9, x8                  // low-half mismatch: advance and fall out
        str x8, [x0, #16]
LBB14_42:                               // common return path
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16         // epilogue: restore FP/LR
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_43:                               // cold path: slice index out of range
        .cfi_restore_state
Lloh22:
        adrp x2, l___unnamed_15@PAGE    // panic location argument
Lloh23:
        add x2, x2, l___unnamed_15@PAGEOFF
        mov x0, x8
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpAdd    Lloh22, Lloh23

@AaronO
Copy link
Contributor Author

AaronO commented Apr 20, 2023

@seanmonstar I've resolved the conflicts, so this should be good to land. It's not perfect, but it's a solid first pass, with a few specifics to revisit later.

Copy link
Owner

@seanmonstar seanmonstar left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.

@AaronO
Copy link
Contributor Author

AaronO commented Apr 24, 2023

Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.

I've added a CI job to run aarch64/neon tests with qemu. I added tests in neon.rs that ensure the vectorized versions match their scalar counterparts. (Only some of the SWAR validators have false negatives, with a fallback to a scalar check, so they are correct in aggregate — testing for the common cases can be cheaper than exhaustive checks.)

src/simd/neon.rs Outdated Show resolved Hide resolved
src/simd/neon.rs Show resolved Hide resolved
First pass at neon support, building off seanmonstar#132
Copy link
Owner

@seanmonstar seanmonstar left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right on!

@seanmonstar seanmonstar merged commit cdabb70 into seanmonstar:master Apr 25, 2023
32 checks passed
AaronO added a commit to AaronO/httparse that referenced this pull request Apr 25, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
@AaronO AaronO mentioned this pull request Apr 25, 2023
AaronO added a commit to AaronO/httparse that referenced this pull request May 4, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in seanmonstar#133 because `neon+no_std` isn't exercised
seanmonstar pushed a commit that referenced this pull request May 4, 2023
I accidentally typed `std::mem` out of habit, this wasn't caught in #133 because `neon+no_std` isn't exercised
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

2 participants