mirror of
https://github.com/golang/go
synced 2024-09-30 03:34:51 -06:00
internal/bytealg: optimize Equal on arm64
Currently the 16-byte loop chunk16_loop is implemented with NEON instructions LD1, VMOV and VCMEQ. Using scalar instructions LDP and CMP to achieve this loop can reduce the number of clock cycles. For cases where the length of strings are between 4 to 15 bytes, loading the last 8 or 4 bytes at a time to reduce the number of comparisons. Benchmarks: name old time/op new time/op delta Equal/0-8 5.51ns ± 0% 5.84ns ±14% ~ (p=0.246 n=7+8) Equal/1-8 10.5ns ± 0% 10.5ns ± 0% ~ (all equal) Equal/6-8 14.0ns ± 0% 12.5ns ± 0% -10.71% (p=0.000 n=8+8) Equal/9-8 13.5ns ± 0% 12.5ns ± 0% -7.41% (p=0.000 n=8+8) Equal/15-8 15.5ns ± 0% 12.5ns ± 0% -19.35% (p=0.000 n=8+8) Equal/16-8 14.0ns ± 0% 13.0ns ± 0% -7.14% (p=0.000 n=8+8) Equal/20-8 16.5ns ± 0% 16.0ns ± 0% -3.03% (p=0.000 n=8+8) Equal/32-8 16.5ns ± 0% 15.3ns ± 0% -7.27% (p=0.000 n=8+8) Equal/4K-8 552ns ± 0% 553ns ± 0% ~ (p=0.315 n=8+8) Equal/4M-8 1.13ms ±23% 1.20ms ±27% ~ (p=0.442 n=8+8) Equal/64M-8 32.9ms ± 0% 32.6ms ± 0% -1.15% (p=0.000 n=8+8) CompareBytesEqual-8 12.0ns ± 0% 12.0ns ± 0% ~ (all equal) Change-Id: If317ecdcc98e31883d37fd7d42b113b548c5bd2a Reviewed-on: https://go-review.googlesource.com/112496 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
parent
21f3d5816d
commit
de28555c0b
@ -67,6 +67,7 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
|
||||
CMP R3, R4
|
||||
BEQ eq
|
||||
MOVD 8(R26), R5 // compiler stores size at offset 8 in the closure
|
||||
CBZ R5, eq
|
||||
MOVD R3, 8(RSP)
|
||||
MOVD R4, 16(RSP)
|
||||
MOVD R5, 24(RSP)
|
||||
@ -119,30 +120,41 @@ chunk16:
|
||||
CBZ R3, tail
|
||||
ADD R3, R0, R6 // end of chunks
|
||||
chunk16_loop:
|
||||
VLD1.P (R0), [V0.D2]
|
||||
VLD1.P (R2), [V1.D2]
|
||||
VCMEQ V0.D2, V1.D2, V2.D2
|
||||
LDP.P 16(R0), (R4, R5)
|
||||
LDP.P 16(R2), (R7, R9)
|
||||
EOR R4, R7
|
||||
CBNZ R7, not_equal
|
||||
EOR R5, R9
|
||||
CBNZ R9, not_equal
|
||||
CMP R0, R6
|
||||
VMOV V2.D[0], R4
|
||||
VMOV V2.D[1], R5
|
||||
CBZ R4, not_equal
|
||||
CBZ R5, not_equal
|
||||
BNE chunk16_loop
|
||||
AND $0xf, R1, R1
|
||||
CBZ R1, equal
|
||||
tail:
|
||||
// special compare of tail with length < 16
|
||||
TBZ $3, R1, lt_8
|
||||
MOVD.P 8(R0), R4
|
||||
MOVD.P 8(R2), R5
|
||||
CMP R4, R5
|
||||
BNE not_equal
|
||||
MOVD (R0), R4
|
||||
MOVD (R2), R5
|
||||
EOR R4, R5
|
||||
CBNZ R5, not_equal
|
||||
SUB $8, R1, R6 // offset of the last 8 bytes
|
||||
MOVD (R0)(R6), R4
|
||||
MOVD (R2)(R6), R5
|
||||
EOR R4, R5
|
||||
CBNZ R5, not_equal
|
||||
B equal
|
||||
lt_8:
|
||||
TBZ $2, R1, lt_4
|
||||
MOVWU.P 4(R0), R4
|
||||
MOVWU.P 4(R2), R5
|
||||
CMP R4, R5
|
||||
BNE not_equal
|
||||
MOVWU (R0), R4
|
||||
MOVWU (R2), R5
|
||||
EOR R4, R5
|
||||
CBNZ R5, not_equal
|
||||
SUB $4, R1, R6 // offset of the last 4 bytes
|
||||
MOVWU (R0)(R6), R4
|
||||
MOVWU (R2)(R6), R5
|
||||
EOR R4, R5
|
||||
CBNZ R5, not_equal
|
||||
B equal
|
||||
lt_4:
|
||||
TBZ $1, R1, lt_2
|
||||
MOVHU.P 2(R0), R4
|
||||
@ -150,7 +162,7 @@ lt_4:
|
||||
CMP R4, R5
|
||||
BNE not_equal
|
||||
lt_2:
|
||||
TBZ $0, R1, equal
|
||||
TBZ $0, R1, equal
|
||||
one:
|
||||
MOVBU (R0), R4
|
||||
MOVBU (R2), R5
|
||||
|
Loading…
Reference in New Issue
Block a user