1
0
mirror of https://github.com/golang/go synced 2024-11-23 13:30:08 -07:00

runtime: improve equal on ppc64x/power10

Rewrite equal asm function to use the new power10 instruction lxvl
and stxvl- load and store with variable length which can simplify
the tail end bytes comparison process. Cleaned up code on CR
register usage.

On power9 and power8 the code remains unchanged. The performance
for multiple sizes<=16 improve on power10 with the change.

name      old time/op    new time/op    delta
Equal/1     5.28ns ± 0%    4.19ns ± 9%  -20.80%
Equal/2     5.30ns ± 0%    4.29ns ± 6%  -19.06%
Equal/3     5.10ns ± 5%    4.20ns ± 6%  -17.73%
Equal/4     5.05ns ± 0%    4.42ns ± 4%  -12.50%
Equal/5     5.27ns ± 1%    4.44ns ± 4%  -15.69%
Equal/6     5.30ns ± 0%    4.38ns ±12%  -17.44%
Equal/7     5.02ns ± 6%    4.48ns ± 2%  -10.64%
Equal/9     4.53ns ± 0%    4.34ns ± 7%   -4.21%
Equal/16    4.52ns ± 0%    4.29ns ± 6%   -5.16%

Change-Id: Ie124906e3a5012dfe634bfe09af06be42f1b178b
Reviewed-on: https://go-review.googlesource.com/c/go/+/473536
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Paul Murphy <murp@ibm.com>
This commit is contained in:
Archana R 2023-03-06 03:17:47 -06:00 committed by Lynn Boger
parent b4ac4b4b42
commit 8377f2014d

View File

@ -10,13 +10,6 @@
// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095
// TODO: At writing, ISEL and BC do not support CR bit type arguments,
// define them here for readability.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0
// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC 4, CR6LT, (LR)
@ -90,7 +83,7 @@ loop64:
ADD $64,R4
BDNZ loop64
ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
ISEL CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
BEQLR // return if no tail.
ADD $-64, R9, R8
@ -110,7 +103,7 @@ loop64:
LXVD2X (R8+R16), V0
LXVD2X (R4+R16), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R11, R0, R3
ISEL CR6LT, R11, R0, R3
RET
check33_64:
@ -138,7 +131,7 @@ check17_32:
LXVD2X (R8+R0), V0
LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R11, R0, R5
ISEL CR6LT, R11, R0, R5
// Load sX[len(sX)-16:len(sX)] and compare.
ADD $-16, R9
@ -146,22 +139,30 @@ check17_32:
LXVD2X (R9+R0), V0
LXVD2X (R10+R0), V1
VCMPEQUBCC V0, V1, V2
ISEL $CR6LT, R5, R0, R3
ISEL CR6LT, R5, R0, R3
RET
check0_16:
#ifdef GOPPC64_power10
SLD $56, R5, R7
LXVL R8, R7, V0
LXVL R4, R7, V1
VCMPEQUDCC V0, V1, V2
ISEL CR6LT, R11, R0, R3
RET
#else
CMP R5, $8
BLT check0_7
// Load sX[0:7] and compare.
MOVD (R8), R6
MOVD (R4), R7
CMP R6, R7
ISEL $CR0EQ, R11, R0, R5
ISEL CR0EQ, R11, R0, R5
// Load sX[len(sX)-8:len(sX)] and compare.
MOVD -8(R9), R6
MOVD -8(R10), R7
CMP R6, R7
ISEL $CR0EQ, R5, R0, R3
ISEL CR0EQ, R5, R0, R3
RET
check0_7:
@ -183,8 +184,8 @@ check0_7:
CMPU R9, R12, CR0
SUB R12, R8, R6 // compute lower load address
SUB R12, R4, R9
ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL $CR0LT, R4, R9, R4 // Similar for s2
ISEL CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
ISEL CR0LT, R4, R9, R4 // Similar for s2
MOVD (R8), R15
MOVD (R4), R16
SLD R14, R15, R7
@ -194,12 +195,13 @@ check0_7:
SRD R14, R15, R6 // Clear the lower (8-len) bytes
SRD R14, R16, R9
#ifdef GOARCH_ppc64le
ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL $CR0LT, R17, R9, R4
ISEL CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
ISEL CR0LT, R17, R9, R4
#else
ISEL $CR1LT, R6, R7, R8
ISEL $CR0LT, R9, R17, R4
ISEL CR1LT, R6, R7, R8
ISEL CR0LT, R9, R17, R4
#endif
CMP R4, R8
ISEL $CR0EQ, R11, R0, R3
ISEL CR0EQ, R11, R0, R3
RET
#endif // tail processing if !defined(GOPPC64_power10)