Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor AdvSimd version of DecodeFromUTF8 #101620

Merged

Conversation

SwapnilGaikwad
Copy link
Contributor

@dotnet-issue-labeler dotnet-issue-labeler bot added the needs-area-label An area label is needed to ensure this gets routed to the appropriate area owners label Apr 26, 2024
@dotnet-policy-service dotnet-policy-service bot added the community-contribution Indicates that the PR has been added by a community member label Apr 26, 2024
@SwapnilGaikwad
Copy link
Contributor Author

@a74nh @kunalspathak @dotnet/arm64-contrib

@SwapnilGaikwad SwapnilGaikwad marked this pull request as ready for review April 26, 2024 18:33
@SwapnilGaikwad
Copy link
Contributor Author

This patch shows no notable performance difference on V1 and N1 systems.
The assembly sequence is reordered, and the newer version has one fewer instruction.

Assembly sequence for DecodeFromUtf8
// AArch64 (AdvSimd) disassembly fragment. It begins and ends mid-function, so
// the branch targets and the meaning of the frame slots are not visible here.
// One pass over this code reads 64 bytes with LD4 and writes 48 bytes with ST3.
// NOTE(review): the 4-in/3-out shape matches Base64 decoding, consistent with
// the PR title - confirm against the managed source.
//
// Entry guard: taken when the preceding (not shown) compare left carry clear,
// i.e. an unsigned "lower than" result.
b.cc	0xffffa8338394  // b.lo, b.ul, b.last
// Load six 128-bit constants from PC-relative literals into q8-q13 and spill
// each one to the frame at [x29, #96] down to [x29, #16].
ldr	q8, 0xffffa83388c0
str	q8, [x29, #96]
ldr	q9, 0xffffa83388d0
str	q9, [x29, #80]
ldr	q10, 0xffffa83388e0
str	q10, [x29, #64]
ldr	q11, 0xffffa83388f0
str	q11, [x29, #48]
ldr	q12, 0xffffa8338900
str	q12, [x29, #32]
ldr	q13, 0xffffa8338910
str	q13, [x29, #16]
// x6 = x27 is the pointer later used as the LD4 source; x28 is saved at #272,
// the slot later reloaded as the ST3 destination.
mov	x6, x27
str	x28, [x29, #272]
// Seventh constant, spilled at #256; it is reloaded into q24 further down.
ldr	q14, 0xffffa8338920
str	q14, [x29, #256]
str	w4, [x29, #344]
// Set up x0, x1, w2 ahead of the indirect call below. x0 and x1 both hold the
// value of x27.
mov	w2, w4
str	x6, [x29, #280]
mov	x0, x6
mov	x1, x27
// x11 = 0xffffa8e3a918, the address of the cell holding the call target.
adrp	x11, 0xffffa8e3a000
add	x11, x11, #0x918
// AAPCS64 preserves only the low 64 bits of v8-v15 across a call, so the upper
// halves of v8 and v9 are parked in the low halves of v14 and v10. This
// overwrites the constants loaded into v14/v10 above; both were already
// spilled to the frame.
mov	v14.d[0], v8.d[1]
mov	v10.d[0], v9.d[1]
// Indirect call through the pointer stored at [x11].
// NOTE(review): the callee is not identifiable from this listing.
ldr	x13, [x11]
blr	x13
// Reload the source pointer and read 64 bytes de-interleaved by 4:
// v16 receives bytes 0,4,8,..., v17 bytes 1,5,9,..., and so on.
ldr	x6, [x29, #280]
ld4	{v16.16b-v19.16b}, [x6]
// The four vectors are stored to the frame and immediately reloaded; within
// this fragment the round trip has no effect on their values.
stp	q16, q17, [x29, #192]
stp	q18, q19, [x29, #224]
ldp	q16, q17, [x29, #192]
ldp	q18, q19, [x29, #224]
// First lookup. The 64-byte TBL table is two all-ones vectors (MVNI #0)
// followed by v8 and v9, so indices 0-31 yield 0xFF, 32-63 index into v8:v9,
// and indices >= 64 yield 0.
mvni	v20.4s, #0x0
mvni	v21.4s, #0x0
// Reassemble the full 128-bit v8/v9 from the halves parked before the call.
mov	v8.d[1], v14.d[0]
mov	v9.d[1], v10.d[0]
mov	v22.16b, v8.16b
mov	v23.16b, v9.16b
tbl	v20.16b, {v20.16b-v23.16b}, v16.16b
// The same lookup for v17, v18 and v19. TBL needs its table in consecutive
// registers, and the table is rebuilt in a fresh register run each time.
mvni	v21.4s, #0x0
mvni	v22.4s, #0x0
mov	v23.16b, v21.16b
mov	v24.16b, v22.16b
mov	v25.16b, v8.16b
mov	v26.16b, v9.16b
tbl	v21.16b, {v23.16b-v26.16b}, v17.16b
mvni	v22.4s, #0x0
mvni	v23.4s, #0x0
mov	v24.16b, v22.16b
mov	v25.16b, v23.16b
mov	v26.16b, v8.16b
mov	v27.16b, v9.16b
tbl	v22.16b, {v24.16b-v27.16b}, v18.16b
mvni	v23.4s, #0x0
mvni	v24.4s, #0x0
mov	v25.16b, v23.16b
mov	v26.16b, v24.16b
mov	v27.16b, v8.16b
mov	v28.16b, v9.16b
tbl	v23.16b, {v25.16b-v28.16b}, v19.16b
// q24 = the constant spilled at #256. Subtract it from every input byte with
// unsigned saturation, so bytes below the constant clamp to 0.
ldr	q24, [x29, #256]
uqsub	v16.16b, v16.16b, v24.16b
uqsub	v17.16b, v17.16b, v24.16b
uqsub	v18.16b, v18.16b, v24.16b
uqsub	v19.16b, v19.16b, v24.16b
// Rebuild the second 64-byte table in v25-v28 from the spill slots at
// #64, #48, #32, #16 - the constants originally loaded into q10-q13.
ldp	q26, q25, [x29, #48]
ldp	q28, q27, [x29, #16]
// TBX leaves a destination byte unchanged when its index is >= 64, so an
// out-of-range byte keeps its post-subtraction value.
tbx	v16.16b, {v25.16b-v28.16b}, v16.16b
tbx	v17.16b, {v25.16b-v28.16b}, v17.16b
tbx	v18.16b, {v25.16b-v28.16b}, v18.16b
tbx	v19.16b, {v25.16b-v28.16b}, v19.16b
// Merge the two lookup results for each lane.
orr	v16.16b, v20.16b, v16.16b
orr	v17.16b, v21.16b, v17.16b
orr	v18.16b, v22.16b, v18.16b
orr	v19.16b, v23.16b, v19.16b
// Flag every byte that is unsigned-greater than the q24 constant and OR the
// four masks into v20.
cmhi	v20.16b, v16.16b, v24.16b
cmhi	v21.16b, v17.16b, v24.16b
orr	v20.16b, v20.16b, v21.16b
cmhi	v21.16b, v18.16b, v24.16b
orr	v20.16b, v20.16b, v21.16b
cmhi	v21.16b, v19.16b, v24.16b
orr	v20.16b, v20.16b, v21.16b
// The pairwise max folds the 128-bit mask into its low 64 bits; a non-zero
// result means at least one byte was flagged, and the code branches away.
// NOTE(review): presumably the invalid-input exit - the target is not shown.
umaxp	v20.4s, v20.4s, v20.4s
mov	x2, v20.d[0]
cmp	x2, #0x0
b.ne	0xffffa833836c  // b.any
// Pack four vectors into three output vectors:
//   v10 = (v16 << 2) | (v17 >> 4)
//   v11 = (v17 << 4) | (v18 >> 2)
//   v12 = (v18 << 6) |  v19
shl	v16.16b, v16.16b, #2
ushr	v20.16b, v17.16b, #4
orr	v10.16b, v16.16b, v20.16b
shl	v16.16b, v17.16b, #4
ushr	v17.16b, v18.16b, #2
orr	v11.16b, v16.16b, v17.16b
shl	v16.16b, v18.16b, #6
orr	v12.16b, v16.16b, v19.16b
// Set up w2, x0, x1 for a second indirect call; x0 is the destination pointer
// reloaded from #272.
mov	w2, w19
ldr	x0, [x29, #272]
mov	x1, x28
// x11 = 0xffffa8e3a920, the address of the cell holding the call target.
adrp	x11, 0xffffa8e3a000
add	x11, x11, #0x920
// Park the upper halves of the results v10-v12 in the low halves of v13, v8
// and v9 so they survive the call; v8 and v9 are reloaded from the frame below.
mov	v13.d[0], v10.d[1]
mov	v8.d[0], v11.d[1]
mov	v9.d[0], v12.d[1]
// NOTE(review): the callee is not identifiable from this listing.
ldr	x3, [x11]
blr	x3
// Restore the full 128-bit results after the call.
mov	v10.d[1], v13.d[0]
mov	v11.d[1], v8.d[0]
mov	v12.d[1], v9.d[0]
// Write 48 bytes to the destination, interleaving v10/v11/v12 byte by byte.
ldr	x7, [x29, #272]
st3	{v10.16b-v12.16b}, [x7]
// Advance the source pointer by 64 bytes and the destination by 48.
ldr	x6, [x29, #280]
add	x6, x6, #0x40
add	x7, x7, #0x30
// Compare the new source pointer with the bound kept at #288. The stores and
// the LDP that follow do not modify flags, so b.ls still sees this compare.
ldr	x3, [x29, #288]
cmp	x6, x3
str	x7, [x29, #272]
// This writes back the value just loaded from the same slot.
str	x3, [x29, #288]
// Reload the v9/v8 constants from #80/#96, since v8/v9 were used as scratch
// around the call.
ldp	q9, q8, [x29, #80]
// Taken while source <= bound (unsigned).
// NOTE(review): presumably the loop back-edge - the target is not shown.
b.ls	0xffffa8338568  // b.plast
// Fall-through path: x6 is stored and immediately reloaded, then the source
// and destination pointers are copied into x4 and x5.
str	x6, [x29, #280]
ldr	x6, [x29, #280]
mov	x4, x6
ldr	x7, [x29, #272]
mov	x5, x7
// Branch if the source pointer equals the value kept at #312.
// NOTE(review): the meaning of that slot and the target are not shown.
ldr	x6, [x29, #312]
cmp	x4, x6
b.eq	0xffffa833875c

Copy link
Contributor

Tagging subscribers to this area: @dotnet/area-system-buffers
See info in area-owners.md if you want to be subscribed.

Copy link
Member

@kunalspathak kunalspathak left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some questions/comments.

Copy link
Member

@kunalspathak kunalspathak left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM. Thanks!

@kunalspathak kunalspathak merged commit 7037516 into dotnet:main May 9, 2024
144 of 146 checks passed
@SwapnilGaikwad SwapnilGaikwad deleted the github-refactor-Base64Decode branch May 10, 2024 09:02
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
area-System.Buffers community-contribution Indicates that the PR has been added by a community member
Projects
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants