mirror of
https://github.com/golang/go
synced 2024-11-24 10:50:13 -07:00
math/big: Implement shlVU and shrVU in ASM for PPC64
Currently the shift left and shift right functions are coded in .go
on PPC64. Implementing them in ASM just like AMD and ARM results in
overall speedup of shift benchmarks on POWER8/9/10.

name                        old time/op  new time/op  delta
NonZeroShifts/1/shrVU       8.50ns ± 0%  5.21ns ± 0%  -38.66%
NonZeroShifts/1/shlVU       8.85ns ± 1%  5.24ns ± 0%  -40.78%
NonZeroShifts/2/shrVU       9.16ns ± 0%  5.51ns ± 0%  -39.80%
NonZeroShifts/2/shlVU       9.24ns ± 2%  5.61ns ± 0%  -39.28%
NonZeroShifts/3/shrVU       10.6ns ± 0%  6.8ns ± 0%   -35.78%
NonZeroShifts/3/shlVU       10.7ns ± 2%  6.4ns ± 0%   -40.82%
NonZeroShifts/4/shrVU       12.4ns ± 0%  7.7ns ± 0%   -38.12%
NonZeroShifts/4/shlVU       12.3ns ± 1%  7.5ns ± 0%   -38.67%
NonZeroShifts/5/shrVU       13.2ns ± 0%  8.5ns ± 0%   -35.51%
NonZeroShifts/5/shlVU       13.3ns ± 2%  9.3ns ± 0%   -30.05%
NonZeroShifts/10/shrVU      16.5ns ± 0%  13.1ns ± 0%  -20.12%
NonZeroShifts/10/shlVU      16.8ns ± 1%  14.1ns ± 0%  -16.02%
NonZeroShifts/100/shrVU     122ns ± 0%   94ns ± 0%    -22.87%
NonZeroShifts/100/shlVU     115ns ± 0%   103ns ± 0%   -10.50%
NonZeroShifts/1000/shrVU    1.10µs ± 0%  0.91µs ± 0%  -17.03%
NonZeroShifts/1000/shlVU    1.02µs ± 0%  0.93µs ± 0%  -8.74%
NonZeroShifts/10000/shrVU   10.9µs ± 0%  9.1µs ± 0%   -16.66%
NonZeroShifts/10000/shlVU   10.1µs ± 0%  9.3µs ± 0%   -8.19%
NonZeroShifts/100000/shrVU  109µs ± 0%   91µs ± 0%    -16.01%
NonZeroShifts/100000/shlVU  101µs ± 0%   94µs ± 0%    -7.16%

Change-Id: Ia31951cc29a4169beb494d2951427cbe1e963b11
Reviewed-on: https://go-review.googlesource.com/c/go/+/384474
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Russ Cox <rsc@golang.org>
Auto-Submit: Russ Cox <rsc@golang.org>
Reviewed-by: Ian Lance Taylor <iant@google.com>
This commit is contained in:
parent
d85694ab4f
commit
6183920a33
@ -346,11 +346,161 @@ done:
|
||||
MOVD R4, c+56(FP)
|
||||
RET
|
||||
|
||||
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the words of x left by s bits into z, working from the most
// significant word (index len(z)-1) down to index 0, and returns in c the
// bits shifted out of the top word: x[len(z)-1]>>(64-s).
// When s == 0 the function instead copies min(len(x), len(z)) words from
// x to z (choosing a copy direction that is safe for overlap) and returns 0.
//
// Frame layout: z+0, z_len+8, x+24, x_len+32, s+48, result c+56.
//
// Register use on the shift path:
//   R3  = &z[0]             R6  = &x[0]
//   R9  = s                 R4  = len(z), then ŝ = 64-s
//   R5  = len(z)-1          R7  = byte offset of last word, then the carry-out
//   R15 = running pointer into x (pre-decremented by MOVDU)
//   R16 = running pointer into z
//   R10 = current word x[i], R14 = next lower word x[i-1], R11 = scratch
// R0 is used as the constant zero throughout.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+24(FP), R6
	MOVD	s+48(FP), R9
	MOVD	z_len+8(FP), R4
	MOVD	x_len+32(FP), R7
	CMP	R9, R0			// s==0 copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0			// len(z)==0 return
	BEQ	done

	ADD	$-1, R4, R5		// len(z)-1
	SUBC	R9, $64, R4		// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD	$3, R5, R7		// byte offset of the last word
	ADD	R6, R7, R15		// save starting address &x[len(z)-1]
	ADD	R3, R7, R16		// save starting address &z[len(z)-1]
	MOVD	(R6)(R7), R14
	SRD	R4, R14, R7		// compute x[len(z)-1]>>ŝ into R7
	CMP	R5, R0			// iterate from i=len(z)-1 to 0
	BEQ	loopexit		// Already at end?
	MOVD	0(R15), R10		// x[i]
shloop:
	SLD	R9, R10, R10		// x[i]<<s
	MOVDU	-8(R15), R14		// load x[i-1] and step R15 down one word
	SRD	R4, R14, R11		// x[i-1]>>ŝ
	OR	R11, R10, R10
	MOVD	R10, 0(R16)		// z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD	R14, R10		// reuse x[i-1] for next iteration
	ADD	$-8, R16		// i--
	CMP	R15, R6			// &x[i-1]>&x[0]?
	BGT	shloop
loopexit:
	MOVD	0(R6), R4
	SLD	R9, R4, R4
	MOVD	R4, 0(R3)		// z[0]=x[0]<<s
	MOVD	R7, c+56(FP)		// store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP	R6, R0			// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3			// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7		// Take the lower bound of lengths of x,z
	CMP	R7, R0			// zero words to copy: the loops below always move
	BEQ	done			// at least one word, so bail out before touching memory
	SLD	$3, R7, R7		// word count -> byte count
	SUB	R6, R3, R11		// dest - src
	CMPU	R11, R7, CR2		// < len?
	BLT	CR2, backward		// there is overlap, copy backwards
	MOVD	$0, R14
	// shlVU processes backwards, but added a forward copy option
	// since its faster on POWER
repeat:
	MOVD	(R6)(R14), R15		// Copy 8 bytes at a time
	MOVD	R15, (R3)(R14)
	ADD	$8, R14
	CMP	R14, R7			// More 8 bytes left?
	BLT	repeat
	BR	done
backward:
	ADD	$-8, R7, R14		// byte offset of the last word to copy
repeatback:
	MOVD	(R6)(R14), R15		// copy x into z backwards
	MOVD	R15, (R3)(R14)		// copy 8 bytes at a time
	SUB	$8, R14
	CMP	R14, $-8		// More 8 bytes left?
	BGT	repeatback

done:
	MOVD	R0, c+56(FP)		// c=0
	RET
|
||||
|
||||
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the words of x right by s bits into z, working from index 0 up to
// index len(z)-1, and returns in c the bits shifted out of the bottom word:
// x[0]<<(64-s).
// When s == 0 the function instead copies min(len(x), len(z)) words from
// x to z in ascending order and returns 0.
//
// Frame layout: z+0, z_len+8, x+24, x_len+32, s+48, result c+56.
//
// Register use on the shift path:
//   R3  = &z[0]             R6  = &x[0]
//   R9  = s                 R5  = ŝ = 64-s
//   R4  = len(z)            R7  = carry-out x[0]<<ŝ
//   R8  = word index i      R16 = len(z)-2 (vector loop bound)
//   R10 = byte offset of word i-1, R12 = byte offset of word i
//   V4 / V2 hold s / ŝ splatted for the vector shifts; VS32..VS39 are the
//   VSX names of V0..V7, so VS32/VS33 are V0/V1 and VS37 is V5.
// For len(z) >= 3 two result words are produced per iteration with VSX
// loads/stores; a scalar step handles the one pair that may remain.
// R0 is used as the constant zero throughout.
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3
	MOVD	x+24(FP), R6
	MOVD	s+48(FP), R9
	MOVD	z_len+8(FP), R4
	MOVD	x_len+32(FP), R7

	CMP	R9, R0			// s==0, copy(z,x)
	BEQ	zeroshift
	CMP	R4, R0			// len(z)==0 return
	BEQ	done
	SUBC	R9, $64, R5		// ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD	0(R6), R7
	SLD	R5, R7, R7		// compute x[0]<<ŝ
	MOVD	$1, R8			// iterate from i=1 to i<len(z)
	CMP	R8, R4
	BGE	loopexit		// Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP	R4, $3
	BLT	scalar
	MTVSRD	R9, VS38		// s
	VSPLTB	$7, V6, V4
	MTVSRD	R5, VS39		// ŝ
	VSPLTB	$7, V7, V2
	ADD	$-2, R4, R16
	PCALIGN	$16
loopback:
	ADD	$-1, R8, R10
	SLD	$3, R10
	LXVD2X	(R6)(R10), VS32		// load x[i-1], x[i]
	SLD	$3, R8, R12
	LXVD2X	(R6)(R12), VS33		// load x[i], x[i+1]

	VSRD	V0, V4, V3		// x[i-1]>>s, x[i]>>s
	VSLD	V1, V2, V5		// x[i]<<ŝ, x[i+1]<<ŝ
	VOR	V3, V5, V5		// Or(|) the two registers together
	STXVD2X	VS37, (R3)(R10)		// store into z[i-1] and z[i]
	ADD	$2, R8			// Done processing 2 entries, i and i+1
	CMP	R8, R16			// Are there at least a couple of more entries left?
	BLE	loopback
	CMP	R8, R4			// Are we at the last element?
	BEQ	loopexit
scalar:
	ADD	$-1, R8, R10
	SLD	$3, R10
	MOVD	(R6)(R10), R11
	SRD	R9, R11, R11		// x[len(z)-2] >> s
	SLD	$3, R8, R12
	MOVD	(R6)(R12), R12
	SLD	R5, R12, R12		// x[len(z)-1]<<ŝ
	OR	R12, R11, R11		// x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD	R11, (R3)(R10)		// z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD	$-1, R4
	SLD	$3, R4			// byte offset of the last word
	MOVD	(R6)(R4), R5
	SRD	R9, R5, R5		// x[len(z)-1]>>s
	MOVD	R5, (R3)(R4)		// z[len(z)-1]=x[len(z)-1]>>s
	MOVD	R7, c+56(FP)		// store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP	R6, R0			// x is null, nothing to copy
	BEQ	done
	CMP	R6, R3			// if x is same as z, nothing to copy
	BEQ	done
	CMP	R7, R4
	ISEL	$0, R7, R4, R7		// Take the lower bounds of lengths of x, z
	CMP	R7, R0			// zero words to copy: the loop below always moves
	BEQ	done			// at least one word, so bail out before touching memory
	SLD	$3, R7, R7		// word count -> byte count
	MOVD	$0, R14
repeat:
	MOVD	(R6)(R14), R15		// copy 8 bytes at a time
	MOVD	R15, (R3)(R14)		// shrVU processes bytes only forwards
	ADD	$8, R14
	CMP	R14, R7			// More 8 bytes left?
	BLT	repeat
done:
	MOVD	R0, c+56(FP)		// c=0
	RET
|
||||
|
||||
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
|
||||
TEXT ·mulAddVWW(SB), NOSPLIT, $0
|
||||
|
Loading…
Reference in New Issue
Block a user