From 291bda80e551289e0b8ed3209782ccb2a98a124b Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Fri, 21 Jan 2022 19:48:26 +1100 Subject: [PATCH] internal/bytealg: optimise compare on riscv64 Implement compare using loops that process 32 bytes, 16 bytes, 4 bytes or 1 byte depending on size and alignment. For comparisons that are less than 32 bytes the overhead of checking and adjusting alignment usually exceeds the overhead of reading and processing 4 bytes at a time. Updates #50615 name old time/op new time/op delta BytesCompare/1-4 68.4ns _ 1% 61.0ns _ 0% -10.78% (p=0.001 n=3+3) BytesCompare/2-4 82.9ns _ 0% 71.0ns _ 1% -14.31% (p=0.000 n=3+3) BytesCompare/4-4 107ns _ 0% 70ns _ 0% -34.96% (p=0.000 n=3+3) BytesCompare/8-4 156ns _ 0% 90ns _ 0% -42.36% (p=0.000 n=3+3) BytesCompare/16-4 267ns _11% 130ns _ 0% -51.10% (p=0.011 n=3+3) BytesCompare/32-4 446ns _ 0% 74ns _ 0% -83.31% (p=0.000 n=3+3) BytesCompare/64-4 840ns _ 2% 91ns _ 0% -89.17% (p=0.000 n=3+3) BytesCompare/128-4 1.60_s _ 0% 0.13_s _ 0% -92.18% (p=0.000 n=3+3) BytesCompare/256-4 3.15_s _ 0% 0.19_s _ 0% -93.91% (p=0.000 n=3+3) BytesCompare/512-4 6.25_s _ 0% 0.33_s _ 0% -94.80% (p=0.000 n=3+3) BytesCompare/1024-4 12.5_s _ 0% 0.6_s _ 0% -95.23% (p=0.000 n=3+3) BytesCompare/2048-4 24.8_s _ 0% 1.1_s _ 0% -95.46% (p=0.000 n=3+3) CompareBytesEqual-4 225ns _ 0% 131ns _ 0% -41.69% (p=0.000 n=3+3) CompareBytesToNil-4 45.3ns _ 7% 46.7ns _ 0% ~ (p=0.452 n=3+3) CompareBytesEmpty-4 41.0ns _ 1% 40.6ns _ 0% ~ (p=0.071 n=3+3) CompareBytesIdentical-4 48.9ns _ 0% 41.3ns _ 1% -15.58% (p=0.000 n=3+3) CompareBytesSameLength-4 127ns _ 0% 77ns _ 0% -39.48% (p=0.000 n=3+3) CompareBytesDifferentLength-4 136ns _12% 78ns _ 0% -42.65% (p=0.018 n=3+3) CompareBytesBigUnaligned-4 14.9ms _ 1% 7.3ms _ 1% -50.95% (p=0.000 n=3+3) CompareBytesBig-4 14.9ms _ 1% 2.7ms _ 8% -82.10% (p=0.000 n=3+3) CompareBytesBigIdentical-4 52.5ns _ 0% 44.9ns _ 0% -14.53% (p=0.000 n=3+3) name old speed new speed delta CompareBytesBigUnaligned-4 70.5MB/s _ 1% 143.8MB/s _ 1% +103.87% (p=0.000 n=3+3) CompareBytesBig-4 70.3MB/s _ 1% 393.8MB/s _ 8% +460.43% (p=0.003 n=3+3) CompareBytesBigIdentical-4 20.0TB/s _ 0% 23.4TB/s _ 0% +17.00% (p=0.000 n=3+3) Change-Id: Ie18712a9009d425c75e1ab49d5a673d84e73a1eb Reviewed-on: https://go-review.googlesource.com/c/go/+/380076 Trust: Joel Sing Trust: mzh Reviewed-by: Cherry Mui --- src/internal/bytealg/compare_generic.go | 2 +- src/internal/bytealg/compare_native.go | 2 +- src/internal/bytealg/compare_riscv64.s | 185 ++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 src/internal/bytealg/compare_riscv64.s diff --git a/src/internal/bytealg/compare_generic.go b/src/internal/bytealg/compare_generic.go index eaea168f2d..c5853f503f 100644 --- a/src/internal/bytealg/compare_generic.go +++ b/src/internal/bytealg/compare_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le +//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64 package bytealg diff --git a/src/internal/bytealg/compare_native.go b/src/internal/bytealg/compare_native.go index 21ff8fe786..ad0fcd7660 100644 --- a/src/internal/bytealg/compare_native.go +++ b/src/internal/bytealg/compare_native.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le +//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64 package bytealg diff --git a/src/internal/bytealg/compare_riscv64.s b/src/internal/bytealg/compare_riscv64.s new file mode 100644 index 0000000000..0dc62515a1 --- /dev/null +++ b/src/internal/bytealg/compare_riscv64.s @@ -0,0 +1,185 @@ +// Copyright 2022 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56 + MOV a_base+0(FP), X5 + MOV a_len+8(FP), X6 + MOV b_base+24(FP), X7 + MOV b_len+32(FP), X8 + MOV $ret+48(FP), X9 + JMP compare<>(SB) + +TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40 + MOV a_base+0(FP), X5 + MOV a_len+8(FP), X6 + MOV b_base+16(FP), X7 + MOV b_len+24(FP), X8 + MOV $ret+32(FP), X9 + JMP compare<>(SB) + +// On entry: +// X5 points to start of a +// X6 length of a +// X7 points to start of b +// X8 length of b +// X9 points to the address to store the return value (-1/0/1) +TEXT compare<>(SB),NOSPLIT|NOFRAME,$0 + BEQ X5, X7, cmp_len + + MOV X6, X10 + BGE X8, X10, use_a_len // X10 = min(len(a), len(b)) + MOV X8, X10 +use_a_len: + BEQZ X10, cmp_len + + MOV $32, X11 + BLT X10, X11, loop4_check + + // Check alignment - if alignment differs we have to do one byte at a time. + AND $3, X5, X12 + AND $3, X7, X13 + BNE X12, X13, loop4_check + BEQZ X12, loop32_check + + // Check one byte at a time until we reach 8 byte alignment. + SUB X12, X10, X10 +align: + ADD $-1, X12 + MOVBU 0(X5), X13 + MOVBU 0(X7), X14 + BNE X13, X14, cmp + ADD $1, X5 + ADD $1, X7 + BNEZ X12, align + +loop32_check: + MOV $32, X12 + BLT X10, X12, loop16_check +loop32: + MOV 0(X5), X15 + MOV 0(X7), X16 + MOV 8(X5), X17 + MOV 8(X7), X18 + BEQ X15, X16, loop32a + JMP cmp8a +loop32a: + BEQ X17, X18, loop32b + JMP cmp8b +loop32b: + MOV 16(X5), X15 + MOV 16(X7), X16 + MOV 24(X5), X17 + MOV 24(X7), X18 + BEQ X15, X16, loop32c + JMP cmp8a +loop32c: + BEQ X17, X18, loop32d + JMP cmp8b +loop32d: + ADD $32, X5 + ADD $32, X7 + ADD $-32, X10 + BGE X10, X12, loop32 + BEQZ X10, cmp_len + +loop16_check: + MOV $16, X11 + BLT X10, X11, loop4_check +loop16: + MOV 0(X5), X15 + MOV 0(X7), X16 + MOV 8(X5), X17 + MOV 8(X7), X18 + BEQ X15, X16, loop16a + JMP cmp8a +loop16a: + BEQ X17, X18, loop16b + JMP cmp8b +loop16b: + ADD $16, X5 + ADD $16, X7 + ADD $-16, X10 + BGE X10, X11, loop16 + BEQZ X10, cmp_len + +loop4_check: + MOV $4, X11 + BLT X10, X11, loop1 +loop4: + MOVBU 0(X5), X13 + MOVBU 0(X7), X14 + MOVBU 1(X5), X15 + MOVBU 1(X7), X16 + BEQ X13, X14, loop4a + SLTU X14, X13, X10 + SLTU X13, X14, X11 + JMP cmp_ret +loop4a: + BEQ X15, X16, loop4b + SLTU X16, X15, X10 + SLTU X15, X16, X11 + JMP cmp_ret +loop4b: + MOVBU 2(X5), X21 + MOVBU 2(X7), X22 + MOVBU 3(X5), X23 + MOVBU 3(X7), X24 + BEQ X21, X22, loop4c + SLTU X22, X21, X10 + SLTU X21, X22, X11 + JMP cmp_ret +loop4c: + BEQ X23, X24, loop4d + SLTU X24, X23, X10 + SLTU X23, X24, X11 + JMP cmp_ret +loop4d: + ADD $4, X5 + ADD $4, X7 + ADD $-4, X10 + BGE X10, X11, loop4 + +loop1: + BEQZ X10, cmp_len + MOVBU 0(X5), X13 + MOVBU 0(X7), X14 + BNE X13, X14, cmp + ADD $1, X5 + ADD $1, X7 + ADD $-1, X10 + JMP loop1 + + // Compare 8 bytes of memory in X15/X16 that are known to differ. +cmp8a: + MOV $0xff, X19 +cmp8a_loop: + AND X15, X19, X13 + AND X16, X19, X14 + BNE X13, X14, cmp + SLLI $8, X19 + JMP cmp8a_loop + + // Compare 8 bytes of memory in X17/X18 that are known to differ. +cmp8b: + MOV $0xff, X19 +cmp8b_loop: + AND X17, X19, X13 + AND X18, X19, X14 + BNE X13, X14, cmp + SLLI $8, X19 + JMP cmp8b_loop + +cmp_len: + MOV X6, X13 + MOV X8, X14 +cmp: + SLTU X14, X13, X10 + SLTU X13, X14, X11 +cmp_ret: + SUB X10, X11, X12 + MOV X12, (X9) + RET