1
0
mirror of https://github.com/golang/go synced 2024-11-24 10:50:13 -07:00

math/big: Implement shlVU and shrVU in ASM for PPC64

Currently the shift-left and shift-right functions (shlVU and shrVU) are
implemented in Go on PPC64. Implementing them in assembly, as is already done
for the AMD64 and ARM ports, results in an overall speedup of the shift
benchmarks on POWER8/9/10.

name                        old time/op  new time/op  delta
NonZeroShifts/1/shrVU       8.50ns ± 0%  5.21ns ± 0%  -38.66%
NonZeroShifts/1/shlVU       8.85ns ± 1%  5.24ns ± 0%  -40.78%
NonZeroShifts/2/shrVU       9.16ns ± 0%  5.51ns ± 0%  -39.80%
NonZeroShifts/2/shlVU       9.24ns ± 2%  5.61ns ± 0%  -39.28%
NonZeroShifts/3/shrVU       10.6ns ± 0%   6.8ns ± 0%  -35.78%
NonZeroShifts/3/shlVU       10.7ns ± 2%   6.4ns ± 0%  -40.82%
NonZeroShifts/4/shrVU       12.4ns ± 0%   7.7ns ± 0%  -38.12%
NonZeroShifts/4/shlVU       12.3ns ± 1%   7.5ns ± 0%  -38.67%
NonZeroShifts/5/shrVU       13.2ns ± 0%   8.5ns ± 0%  -35.51%
NonZeroShifts/5/shlVU       13.3ns ± 2%   9.3ns ± 0%  -30.05%
NonZeroShifts/10/shrVU      16.5ns ± 0%  13.1ns ± 0%  -20.12%
NonZeroShifts/10/shlVU      16.8ns ± 1%  14.1ns ± 0%  -16.02%
NonZeroShifts/100/shrVU      122ns ± 0%    94ns ± 0%  -22.87%
NonZeroShifts/100/shlVU      115ns ± 0%   103ns ± 0%  -10.50%
NonZeroShifts/1000/shrVU    1.10µs ± 0%  0.91µs ± 0%  -17.03%
NonZeroShifts/1000/shlVU    1.02µs ± 0%  0.93µs ± 0%   -8.74%
NonZeroShifts/10000/shrVU   10.9µs ± 0%   9.1µs ± 0%  -16.66%
NonZeroShifts/10000/shlVU   10.1µs ± 0%   9.3µs ± 0%   -8.19%
NonZeroShifts/100000/shrVU   109µs ± 0%    91µs ± 0%  -16.01%
NonZeroShifts/100000/shlVU   101µs ± 0%    94µs ± 0%   -7.16%

Change-Id: Ia31951cc29a4169beb494d2951427cbe1e963b11
Reviewed-on: https://go-review.googlesource.com/c/go/+/384474
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
This commit is contained in:
Archana R 2022-02-09 07:37:12 -06:00 committed by Gopher Robot
parent d85694ab4f
commit 6183920a33

View File

@ -346,11 +346,161 @@ done:
MOVD R4, c+56(FP)
RET
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, walking from the most
// significant word (index len(z)-1) down to word 0, and returns in c the
// bits shifted out of the top word: x[len(z)-1] >> (64-s).
//
// Special cases:
//   s == 0       copies min(len(x), len(z)) words from x to z, c = 0
//   len(z) == 0  nothing is read or written, c = 0
//
// The shift path indexes x using len(z) only; it presumably relies on the
// caller passing len(x) >= len(z) -- this is not checked here.
//
// Register use on the shift path (R0 is used as the constant zero):
//   R3  = &z[0]    R6  = &x[0]    R9  = s    R4 = ŝ = 64-s
//   R15 = &x[i]    R16 = &z[i]    R5  = len(z)-1
//   R10 = x[i]     R14 = x[i-1]   R11 = scratch
//   R7  = carry-out returned in c
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+24(FP), R6
	MOVD	s+48(FP), R9
	MOVD	z_len+8(FP), R4
	MOVD	x_len+32(FP), R7
	CMP	R9, R0		// s==0 copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0		// len(z)==0 return
	BEQ	done

	ADD	$-1, R4, R5	// len(z)-1
	SUBC	R9, $64, R4	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD	$3, R5, R7	// byte offset of the last word
	ADD	R6, R7, R15	// save starting address &x[len(z)-1]
	ADD	R3, R7, R16	// save starting address &z[len(z)-1]
	MOVD	(R6)(R7), R14
	SRD	R4, R14, R7	// compute x[len(z)-1]>>ŝ into R7
	CMP	R5, R0		// iterate from i=len(z)-1 to 0
	BEQ	loopexit	// Already at end?
	MOVD	0(R15), R10	// x[i]
shloop:
	SLD	R9, R10, R10	// x[i]<<s
	MOVDU	-8(R15), R14	// load x[i-1] and step R15 down to &x[i-1]
	SRD	R4, R14, R11	// x[i-1]>>ŝ
	OR	R11, R10, R10
	MOVD	R10, 0(R16)	// z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD	R14, R10	// reuse x[i-1] for next iteration
	ADD	$-8, R16	// i--
	CMP	R15, R6		// &x[i]>&x[0]? word 0 is handled after the loop
	BGT	shloop
loopexit:
	MOVD	0(R6), R4
	SLD	R9, R4, R4
	MOVD	R4, 0(R3)	// z[0]=x[0]<<s
	MOVD	R7, c+56(FP)	// store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP	R6, R0		// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3		// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7	// Take the lower bound of lengths of x,z
	SLD	$3, R7, R7	// number of bytes to copy
	CMP	R7, R0		// empty slice: both copy loops below store a word
	BEQ	done		// before testing the count, so bail out first
	SUB	R6, R3, R11	// dest - src
	CMPU	R11, R7, CR2	// < len?
	BLT	CR2, backward	// there is overlap, copy backwards
	MOVD	$0, R14
	// shlVU processes backwards, but a forward copy option was added
	// since it's faster on POWER
repeat:
	MOVD	(R6)(R14), R15	// Copy 8 bytes at a time
	MOVD	R15, (R3)(R14)
	ADD	$8, R14
	CMP	R14, R7		// More 8 bytes left?
	BLT	repeat
	BR	done
backward:
	ADD	$-8, R7, R14	// byte offset of the last word to copy
repeatback:
	MOVD	(R6)(R14), R15	// copy x into z backwards
	MOVD	R15, (R3)(R14)	// copy 8 bytes at a time
	SUB	$8, R14
	CMP	R14, $-8	// More 8 bytes left?
	BGT	repeatback

done:
	MOVD	R0, c+56(FP)	// c=0
	RET
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, walking from word 0 upwards,
// and returns in c the bits shifted out of the bottom word: x[0] << (64-s).
//
// Special cases:
//   s == 0       copies min(len(x), len(z)) words from x to z, c = 0
//   len(z) == 0  nothing is read or written, c = 0
//
// For len(z) >= 3 a VSX loop produces two result words per iteration; at
// most one remaining pair is then handled by the scalar code, and the top
// word z[len(z)-1] is always written at loopexit.
//
// The shift path indexes x using len(z) only; it presumably relies on the
// caller passing len(x) >= len(z) -- this is not checked here.
//
// Register use on the shift path (R0 is used as the constant zero):
//   R3 = &z[0]    R6 = &x[0]    R9 = s    R5 = ŝ = 64-s
//   R4 = len(z)   R8 = i        R16 = len(z)-2
//   R10, R12 = byte offsets of word i-1 and word i
//   R7 = carry-out returned in c
//   V4 = s splatted to every byte, V2 = ŝ splatted to every byte
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+24(FP), R6
	MOVD	s+48(FP), R9
	MOVD	z_len+8(FP), R4
	MOVD	x_len+32(FP), R7

	CMP	R9, R0		// s==0, copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0		// len(z)==0 return
	BEQ	done
	SUBC	R9, $64, R5	// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD	0(R6), R7
	SLD	R5, R7, R7	// compute x[0]<<ŝ
	MOVD	$1, R8		// iterate from i=1 to i<len(z)
	CMP	R8, R4
	BGE	loopexit	// Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP	R4, $3
	BLT	scalar
	MTVSRD	R9, VS38	// s (VS38 is the VSX name of V6)
	VSPLTB	$7, V6, V4	// replicate the shift count into every byte of V4
	MTVSRD	R5, VS39	// ŝ (VS39 is the VSX name of V7)
	VSPLTB	$7, V7, V2	// replicate ŝ into every byte of V2
	ADD	$-2, R4, R16	// last i for which words i and i+1 both exist
	PCALIGN	$16
loopback:
	ADD	$-1, R8, R10
	SLD	$3, R10		// byte offset of word i-1
	LXVD2X	(R6)(R10), VS32	// load x[i-1], x[i]
	SLD	$3, R8, R12	// byte offset of word i
	LXVD2X	(R6)(R12), VS33	// load x[i], x[i+1]

	VSRD	V0, V4, V3	// x[i-1]>>s, x[i]>>s
	VSLD	V1, V2, V5	// x[i]<<ŝ, x[i+1]<<ŝ
	VOR	V3, V5, V5	// Or(|) the two registers together
	STXVD2X	VS37, (R3)(R10)	// store into z[i-1] and z[i]
	ADD	$2, R8		// Done processing 2 entries, i and i+1
	CMP	R8, R16		// Are there at least a couple of more entries left?
	BLE	loopback
	CMP	R8, R4		// Are we at the last element?
	BEQ	loopexit
scalar:
	// Only reached with i == len(z)-1, so exactly one word is produced here.
	ADD	$-1, R8, R10
	SLD	$3, R10
	MOVD	(R6)(R10), R11
	SRD	R9, R11, R11	// x[len(z)-2] >> s
	SLD	$3, R8, R12
	MOVD	(R6)(R12), R12
	SLD	R5, R12, R12	// x[len(z)-1]<<ŝ
	OR	R12, R11, R11	// x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD	R11, (R3)(R10)	// z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD	$-1, R4
	SLD	$3, R4		// byte offset of the top word
	MOVD	(R6)(R4), R5
	SRD	R9, R5, R5	// x[len(z)-1]>>s
	MOVD	R5, (R3)(R4)	// z[len(z)-1]=x[len(z)-1]>>s
	MOVD	R7, c+56(FP)	// store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP	R6, R0		// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3		// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7	// Take the lower bounds of lengths of x, z
	SLD	$3, R7, R7	// number of bytes to copy
	CMP	R7, R0		// empty slice: the copy loop below stores a word
	BEQ	done		// before testing the count, so bail out first
	MOVD	$0, R14
repeat:
	MOVD	(R6)(R14), R15	// copy 8 bytes at a time
	MOVD	R15, (R3)(R14)	// shrVU processes bytes only forwards
	ADD	$8, R14
	CMP	R14, R7		// More 8 bytes left?
	BLT	repeat
done:
	MOVD	R0, c+56(FP)	// c=0
	RET
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0