mirror of
https://github.com/golang/go
synced 2024-11-17 10:54:50 -07:00
internal/bytealg: optimise compare on riscv64
Implement compare using loops that process 32 bytes, 16 bytes, 4 bytes or 1 byte depending on size and alignment. For comparisons that are less than 32 bytes the overhead of checking and adjusting alignment usually exceeds the overhead of reading and processing 4 bytes at a time. Updates #50615 name old time/op new time/op delta BytesCompare/1-4 68.4ns _ 1% 61.0ns _ 0% -10.78% (p=0.001 n=3+3) BytesCompare/2-4 82.9ns _ 0% 71.0ns _ 1% -14.31% (p=0.000 n=3+3) BytesCompare/4-4 107ns _ 0% 70ns _ 0% -34.96% (p=0.000 n=3+3) BytesCompare/8-4 156ns _ 0% 90ns _ 0% -42.36% (p=0.000 n=3+3) BytesCompare/16-4 267ns _11% 130ns _ 0% -51.10% (p=0.011 n=3+3) BytesCompare/32-4 446ns _ 0% 74ns _ 0% -83.31% (p=0.000 n=3+3) BytesCompare/64-4 840ns _ 2% 91ns _ 0% -89.17% (p=0.000 n=3+3) BytesCompare/128-4 1.60_s _ 0% 0.13_s _ 0% -92.18% (p=0.000 n=3+3) BytesCompare/256-4 3.15_s _ 0% 0.19_s _ 0% -93.91% (p=0.000 n=3+3) BytesCompare/512-4 6.25_s _ 0% 0.33_s _ 0% -94.80% (p=0.000 n=3+3) BytesCompare/1024-4 12.5_s _ 0% 0.6_s _ 0% -95.23% (p=0.000 n=3+3) BytesCompare/2048-4 24.8_s _ 0% 1.1_s _ 0% -95.46% (p=0.000 n=3+3) CompareBytesEqual-4 225ns _ 0% 131ns _ 0% -41.69% (p=0.000 n=3+3) CompareBytesToNil-4 45.3ns _ 7% 46.7ns _ 0% ~ (p=0.452 n=3+3) CompareBytesEmpty-4 41.0ns _ 1% 40.6ns _ 0% ~ (p=0.071 n=3+3) CompareBytesIdentical-4 48.9ns _ 0% 41.3ns _ 1% -15.58% (p=0.000 n=3+3) CompareBytesSameLength-4 127ns _ 0% 77ns _ 0% -39.48% (p=0.000 n=3+3) CompareBytesDifferentLength-4 136ns _12% 78ns _ 0% -42.65% (p=0.018 n=3+3) CompareBytesBigUnaligned-4 14.9ms _ 1% 7.3ms _ 1% -50.95% (p=0.000 n=3+3) CompareBytesBig-4 14.9ms _ 1% 2.7ms _ 8% -82.10% (p=0.000 n=3+3) CompareBytesBigIdentical-4 52.5ns _ 0% 44.9ns _ 0% -14.53% (p=0.000 n=3+3) name old speed new speed delta CompareBytesBigUnaligned-4 70.5MB/s _ 1% 143.8MB/s _ 1% +103.87% (p=0.000 n=3+3) CompareBytesBig-4 70.3MB/s _ 1% 393.8MB/s _ 8% +460.43% (p=0.003 n=3+3) CompareBytesBigIdentical-4 20.0TB/s _ 0% 23.4TB/s _ 0% +17.00% (p=0.000 n=3+3) Change-Id: Ie18712a9009d425c75e1ab49d5a673d84e73a1eb Reviewed-on: https://go-review.googlesource.com/c/go/+/380076 Trust: Joel Sing <joel@sing.id.au> Trust: mzh <mzh@golangcn.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
parent
3c42ebf3e2
commit
291bda80e5
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le
|
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
|
||||||
|
|
||||||
package bytealg
|
package bytealg
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le
|
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
|
||||||
|
|
||||||
package bytealg
|
package bytealg
|
||||||
|
|
||||||
|
185
src/internal/bytealg/compare_riscv64.s
Normal file
185
src/internal/bytealg/compare_riscv64.s
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
// Copyright 2022 The Go Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include "go_asm.h"
|
||||||
|
#include "textflag.h"
|
||||||
|
|
||||||
|
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
|
||||||
|
MOV a_base+0(FP), X5
|
||||||
|
MOV a_len+8(FP), X6
|
||||||
|
MOV b_base+24(FP), X7
|
||||||
|
MOV b_len+32(FP), X8
|
||||||
|
MOV $ret+48(FP), X9
|
||||||
|
JMP compare<>(SB)
|
||||||
|
|
||||||
|
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
|
||||||
|
MOV a_base+0(FP), X5
|
||||||
|
MOV a_len+8(FP), X6
|
||||||
|
MOV b_base+16(FP), X7
|
||||||
|
MOV b_len+24(FP), X8
|
||||||
|
MOV $ret+32(FP), X9
|
||||||
|
JMP compare<>(SB)
|
||||||
|
|
||||||
|
// On entry:
|
||||||
|
// X5 points to start of a
|
||||||
|
// X6 length of a
|
||||||
|
// X7 points to start of b
|
||||||
|
// X8 length of b
|
||||||
|
// X9 points to the address to store the return value (-1/0/1)
|
||||||
|
TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
|
||||||
|
BEQ X5, X7, cmp_len
|
||||||
|
|
||||||
|
MOV X6, X10
|
||||||
|
BGE X8, X10, use_a_len // X10 = min(len(a), len(b))
|
||||||
|
MOV X8, X10
|
||||||
|
use_a_len:
|
||||||
|
BEQZ X10, cmp_len
|
||||||
|
|
||||||
|
MOV $32, X11
|
||||||
|
BLT X10, X11, loop4_check
|
||||||
|
|
||||||
|
// Check alignment - if alignment differs we have to do one byte at a time.
|
||||||
|
AND $3, X5, X12
|
||||||
|
AND $3, X7, X13
|
||||||
|
BNE X12, X13, loop4_check
|
||||||
|
BEQZ X12, loop32_check
|
||||||
|
|
||||||
|
// Check one byte at a time until we reach 8 byte alignment.
|
||||||
|
SUB X12, X10, X10
|
||||||
|
align:
|
||||||
|
ADD $-1, X12
|
||||||
|
MOVBU 0(X5), X13
|
||||||
|
MOVBU 0(X7), X14
|
||||||
|
BNE X13, X14, cmp
|
||||||
|
ADD $1, X5
|
||||||
|
ADD $1, X7
|
||||||
|
BNEZ X12, align
|
||||||
|
|
||||||
|
loop32_check:
|
||||||
|
MOV $32, X12
|
||||||
|
BLT X10, X12, loop16_check
|
||||||
|
loop32:
|
||||||
|
MOV 0(X5), X15
|
||||||
|
MOV 0(X7), X16
|
||||||
|
MOV 8(X5), X17
|
||||||
|
MOV 8(X7), X18
|
||||||
|
BEQ X15, X16, loop32a
|
||||||
|
JMP cmp8a
|
||||||
|
loop32a:
|
||||||
|
BEQ X17, X18, loop32b
|
||||||
|
JMP cmp8b
|
||||||
|
loop32b:
|
||||||
|
MOV 16(X5), X15
|
||||||
|
MOV 16(X7), X16
|
||||||
|
MOV 24(X5), X17
|
||||||
|
MOV 24(X7), X18
|
||||||
|
BEQ X15, X16, loop32c
|
||||||
|
JMP cmp8a
|
||||||
|
loop32c:
|
||||||
|
BEQ X17, X18, loop32d
|
||||||
|
JMP cmp8b
|
||||||
|
loop32d:
|
||||||
|
ADD $32, X5
|
||||||
|
ADD $32, X7
|
||||||
|
ADD $-32, X10
|
||||||
|
BGE X10, X12, loop32
|
||||||
|
BEQZ X10, cmp_len
|
||||||
|
|
||||||
|
loop16_check:
|
||||||
|
MOV $16, X11
|
||||||
|
BLT X10, X11, loop4_check
|
||||||
|
loop16:
|
||||||
|
MOV 0(X5), X15
|
||||||
|
MOV 0(X7), X16
|
||||||
|
MOV 8(X5), X17
|
||||||
|
MOV 8(X7), X18
|
||||||
|
BEQ X15, X16, loop16a
|
||||||
|
JMP cmp8a
|
||||||
|
loop16a:
|
||||||
|
BEQ X17, X18, loop16b
|
||||||
|
JMP cmp8b
|
||||||
|
loop16b:
|
||||||
|
ADD $16, X5
|
||||||
|
ADD $16, X7
|
||||||
|
ADD $-16, X10
|
||||||
|
BGE X10, X11, loop16
|
||||||
|
BEQZ X10, cmp_len
|
||||||
|
|
||||||
|
loop4_check:
|
||||||
|
MOV $4, X11
|
||||||
|
BLT X10, X11, loop1
|
||||||
|
loop4:
|
||||||
|
MOVBU 0(X5), X13
|
||||||
|
MOVBU 0(X7), X14
|
||||||
|
MOVBU 1(X5), X15
|
||||||
|
MOVBU 1(X7), X16
|
||||||
|
BEQ X13, X14, loop4a
|
||||||
|
SLTU X14, X13, X10
|
||||||
|
SLTU X13, X14, X11
|
||||||
|
JMP cmp_ret
|
||||||
|
loop4a:
|
||||||
|
BEQ X15, X16, loop4b
|
||||||
|
SLTU X16, X15, X10
|
||||||
|
SLTU X15, X16, X11
|
||||||
|
JMP cmp_ret
|
||||||
|
loop4b:
|
||||||
|
MOVBU 2(X5), X21
|
||||||
|
MOVBU 2(X7), X22
|
||||||
|
MOVBU 3(X5), X23
|
||||||
|
MOVBU 3(X7), X24
|
||||||
|
BEQ X21, X22, loop4c
|
||||||
|
SLTU X22, X21, X10
|
||||||
|
SLTU X21, X22, X11
|
||||||
|
JMP cmp_ret
|
||||||
|
loop4c:
|
||||||
|
BEQ X23, X24, loop4d
|
||||||
|
SLTU X24, X23, X10
|
||||||
|
SLTU X23, X24, X11
|
||||||
|
JMP cmp_ret
|
||||||
|
loop4d:
|
||||||
|
ADD $4, X5
|
||||||
|
ADD $4, X7
|
||||||
|
ADD $-4, X10
|
||||||
|
BGE X10, X11, loop4
|
||||||
|
|
||||||
|
loop1:
|
||||||
|
BEQZ X10, cmp_len
|
||||||
|
MOVBU 0(X5), X13
|
||||||
|
MOVBU 0(X7), X14
|
||||||
|
BNE X13, X14, cmp
|
||||||
|
ADD $1, X5
|
||||||
|
ADD $1, X7
|
||||||
|
ADD $-1, X10
|
||||||
|
JMP loop1
|
||||||
|
|
||||||
|
// Compare 8 bytes of memory in X15/X16 that are known to differ.
|
||||||
|
cmp8a:
|
||||||
|
MOV $0xff, X19
|
||||||
|
cmp8a_loop:
|
||||||
|
AND X15, X19, X13
|
||||||
|
AND X16, X19, X14
|
||||||
|
BNE X13, X14, cmp
|
||||||
|
SLLI $8, X19
|
||||||
|
JMP cmp8a_loop
|
||||||
|
|
||||||
|
// Compare 8 bytes of memory in X17/X18 that are known to differ.
|
||||||
|
cmp8b:
|
||||||
|
MOV $0xff, X19
|
||||||
|
cmp8b_loop:
|
||||||
|
AND X17, X19, X13
|
||||||
|
AND X18, X19, X14
|
||||||
|
BNE X13, X14, cmp
|
||||||
|
SLLI $8, X19
|
||||||
|
JMP cmp8b_loop
|
||||||
|
|
||||||
|
cmp_len:
|
||||||
|
MOV X6, X13
|
||||||
|
MOV X8, X14
|
||||||
|
cmp:
|
||||||
|
SLTU X14, X13, X10
|
||||||
|
SLTU X13, X14, X11
|
||||||
|
cmp_ret:
|
||||||
|
SUB X10, X11, X12
|
||||||
|
MOV X12, (X9)
|
||||||
|
RET
|
Loading…
Reference in New Issue
Block a user