1
0
mirror of https://github.com/golang/go synced 2024-09-29 15:34:30 -06:00

internal/bytealg: optimise compare on riscv64

Implement compare using loops that process 32 bytes, 16 bytes, 4 bytes
or 1 byte depending on size and alignment. For comparisons that are less
than 32 bytes the overhead of checking and adjusting alignment usually
exceeds the overhead of reading and processing 4 bytes at a time.

Updates #50615

name                           old time/op    new time/op     delta
BytesCompare/1-4                 68.4ns _ 1%     61.0ns _ 0%   -10.78%  (p=0.001 n=3+3)
BytesCompare/2-4                 82.9ns _ 0%     71.0ns _ 1%   -14.31%  (p=0.000 n=3+3)
BytesCompare/4-4                  107ns _ 0%       70ns _ 0%   -34.96%  (p=0.000 n=3+3)
BytesCompare/8-4                  156ns _ 0%       90ns _ 0%   -42.36%  (p=0.000 n=3+3)
BytesCompare/16-4                 267ns _11%      130ns _ 0%   -51.10%  (p=0.011 n=3+3)
BytesCompare/32-4                 446ns _ 0%       74ns _ 0%   -83.31%  (p=0.000 n=3+3)
BytesCompare/64-4                 840ns _ 2%       91ns _ 0%   -89.17%  (p=0.000 n=3+3)
BytesCompare/128-4               1.60_s _ 0%     0.13_s _ 0%   -92.18%  (p=0.000 n=3+3)
BytesCompare/256-4               3.15_s _ 0%     0.19_s _ 0%   -93.91%  (p=0.000 n=3+3)
BytesCompare/512-4               6.25_s _ 0%     0.33_s _ 0%   -94.80%  (p=0.000 n=3+3)
BytesCompare/1024-4              12.5_s _ 0%      0.6_s _ 0%   -95.23%  (p=0.000 n=3+3)
BytesCompare/2048-4              24.8_s _ 0%      1.1_s _ 0%   -95.46%  (p=0.000 n=3+3)
CompareBytesEqual-4               225ns _ 0%      131ns _ 0%   -41.69%  (p=0.000 n=3+3)
CompareBytesToNil-4              45.3ns _ 7%     46.7ns _ 0%      ~     (p=0.452 n=3+3)
CompareBytesEmpty-4              41.0ns _ 1%     40.6ns _ 0%      ~     (p=0.071 n=3+3)
CompareBytesIdentical-4          48.9ns _ 0%     41.3ns _ 1%   -15.58%  (p=0.000 n=3+3)
CompareBytesSameLength-4          127ns _ 0%       77ns _ 0%   -39.48%  (p=0.000 n=3+3)
CompareBytesDifferentLength-4     136ns _12%       78ns _ 0%   -42.65%  (p=0.018 n=3+3)
CompareBytesBigUnaligned-4       14.9ms _ 1%      7.3ms _ 1%   -50.95%  (p=0.000 n=3+3)
CompareBytesBig-4                14.9ms _ 1%      2.7ms _ 8%   -82.10%  (p=0.000 n=3+3)
CompareBytesBigIdentical-4       52.5ns _ 0%     44.9ns _ 0%   -14.53%  (p=0.000 n=3+3)

name                           old speed      new speed       delta
CompareBytesBigUnaligned-4     70.5MB/s _ 1%  143.8MB/s _ 1%  +103.87%  (p=0.000 n=3+3)
CompareBytesBig-4              70.3MB/s _ 1%  393.8MB/s _ 8%  +460.43%  (p=0.003 n=3+3)
CompareBytesBigIdentical-4     20.0TB/s _ 0%   23.4TB/s _ 0%   +17.00%  (p=0.000 n=3+3)

Change-Id: Ie18712a9009d425c75e1ab49d5a673d84e73a1eb
Reviewed-on: https://go-review.googlesource.com/c/go/+/380076
Trust: Joel Sing <joel@sing.id.au>
Trust: mzh <mzh@golangcn.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Joel Sing 2022-01-21 19:48:26 +11:00
parent 3c42ebf3e2
commit 291bda80e5
3 changed files with 187 additions and 2 deletions

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le
//go:build !386 && !amd64 && !s390x && !arm && !arm64 && !ppc64 && !ppc64le && !mips && !mipsle && !wasm && !mips64 && !mips64le && !riscv64
package bytealg

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le
//go:build 386 || amd64 || s390x || arm || arm64 || ppc64 || ppc64le || mips || mipsle || wasm || mips64 || mips64le || riscv64
package bytealg

View File

@ -0,0 +1,185 @@
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "go_asm.h"
#include "textflag.h"
TEXT ·Compare(SB),NOSPLIT|NOFRAME,$0-56
MOV a_base+0(FP), X5
MOV a_len+8(FP), X6
MOV b_base+24(FP), X7
MOV b_len+32(FP), X8
MOV $ret+48(FP), X9
JMP compare<>(SB)
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOV a_base+0(FP), X5
MOV a_len+8(FP), X6
MOV b_base+16(FP), X7
MOV b_len+24(FP), X8
MOV $ret+32(FP), X9
JMP compare<>(SB)
// On entry:
// X5 points to start of a
// X6 length of a
// X7 points to start of b
// X8 length of b
// X9 points to the address to store the return value (-1/0/1)
TEXT compare<>(SB),NOSPLIT|NOFRAME,$0
BEQ X5, X7, cmp_len
MOV X6, X10
BGE X8, X10, use_a_len // X10 = min(len(a), len(b))
MOV X8, X10
use_a_len:
BEQZ X10, cmp_len
MOV $32, X11
BLT X10, X11, loop4_check
// Check alignment - if alignment differs we have to do one byte at a time.
AND $3, X5, X12
AND $3, X7, X13
BNE X12, X13, loop4_check
BEQZ X12, loop32_check
// Check one byte at a time until we reach 8 byte alignment.
SUB X12, X10, X10
align:
ADD $-1, X12
MOVBU 0(X5), X13
MOVBU 0(X7), X14
BNE X13, X14, cmp
ADD $1, X5
ADD $1, X7
BNEZ X12, align
loop32_check:
MOV $32, X12
BLT X10, X12, loop16_check
loop32:
MOV 0(X5), X15
MOV 0(X7), X16
MOV 8(X5), X17
MOV 8(X7), X18
BEQ X15, X16, loop32a
JMP cmp8a
loop32a:
BEQ X17, X18, loop32b
JMP cmp8b
loop32b:
MOV 16(X5), X15
MOV 16(X7), X16
MOV 24(X5), X17
MOV 24(X7), X18
BEQ X15, X16, loop32c
JMP cmp8a
loop32c:
BEQ X17, X18, loop32d
JMP cmp8b
loop32d:
ADD $32, X5
ADD $32, X7
ADD $-32, X10
BGE X10, X12, loop32
BEQZ X10, cmp_len
loop16_check:
MOV $16, X11
BLT X10, X11, loop4_check
loop16:
MOV 0(X5), X15
MOV 0(X7), X16
MOV 8(X5), X17
MOV 8(X7), X18
BEQ X15, X16, loop16a
JMP cmp8a
loop16a:
BEQ X17, X18, loop16b
JMP cmp8b
loop16b:
ADD $16, X5
ADD $16, X7
ADD $-16, X10
BGE X10, X11, loop16
BEQZ X10, cmp_len
loop4_check:
MOV $4, X11
BLT X10, X11, loop1
loop4:
MOVBU 0(X5), X13
MOVBU 0(X7), X14
MOVBU 1(X5), X15
MOVBU 1(X7), X16
BEQ X13, X14, loop4a
SLTU X14, X13, X10
SLTU X13, X14, X11
JMP cmp_ret
loop4a:
BEQ X15, X16, loop4b
SLTU X16, X15, X10
SLTU X15, X16, X11
JMP cmp_ret
loop4b:
MOVBU 2(X5), X21
MOVBU 2(X7), X22
MOVBU 3(X5), X23
MOVBU 3(X7), X24
BEQ X21, X22, loop4c
SLTU X22, X21, X10
SLTU X21, X22, X11
JMP cmp_ret
loop4c:
BEQ X23, X24, loop4d
SLTU X24, X23, X10
SLTU X23, X24, X11
JMP cmp_ret
loop4d:
ADD $4, X5
ADD $4, X7
ADD $-4, X10
BGE X10, X11, loop4
loop1:
BEQZ X10, cmp_len
MOVBU 0(X5), X13
MOVBU 0(X7), X14
BNE X13, X14, cmp
ADD $1, X5
ADD $1, X7
ADD $-1, X10
JMP loop1
// Compare 8 bytes of memory in X15/X16 that are known to differ.
cmp8a:
MOV $0xff, X19
cmp8a_loop:
AND X15, X19, X13
AND X16, X19, X14
BNE X13, X14, cmp
SLLI $8, X19
JMP cmp8a_loop
// Compare 8 bytes of memory in X17/X18 that are known to differ.
cmp8b:
MOV $0xff, X19
cmp8b_loop:
AND X17, X19, X13
AND X18, X19, X14
BNE X13, X14, cmp
SLLI $8, X19
JMP cmp8b_loop
cmp_len:
MOV X6, X13
MOV X8, X14
cmp:
SLTU X14, X13, X10
SLTU X13, X14, X11
cmp_ret:
SUB X10, X11, X12
MOV X12, (X9)
RET