mirror of
https://github.com/golang/go
synced 2024-11-18 09:54:57 -07:00
crypto/subtle: improve xorBytes assembler on PPC64
This makes some improvements to the xorBytes assembler implementation for PPC64 targets. The loop that processes large streams of bytes has been changed to handle 64 bytes per iteration. Other changes were made to prevent degradations for some of the common sizes, such as 8 and 16 bytes. The case for < 8 bytes on power10 has been modified to use the LXVL and STXVL instructions. Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c Reviewed-on: https://go-review.googlesource.com/c/go/+/530877 Reviewed-by: Paul Murphy <murp@ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Benny Siegert <bsiegert@gmail.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
parent
be3d5fb6e6
commit
9b61b0054b
@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
|
|||||||
MOVD b+16(FP), R5 // R5 = b
|
MOVD b+16(FP), R5 // R5 = b
|
||||||
MOVD n+24(FP), R6 // R6 = n
|
MOVD n+24(FP), R6 // R6 = n
|
||||||
|
|
||||||
CMPU R6, $32, CR7 // Check if n ≥ 32 bytes
|
CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
|
||||||
MOVD R0, R8 // R8 = index
|
MOVD R0, R8 // R8 = index
|
||||||
CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes
|
CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
|
||||||
BLT CR6, small // Smaller than 8
|
BLE CR6, small // <= 8
|
||||||
BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes
|
BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
|
||||||
|
|
||||||
// Case for n ≥ 32 bytes
|
// Case for n ≥ 64 bytes
|
||||||
preloop32:
|
preloop64:
|
||||||
SRD $5, R6, R7 // Setup loop counter
|
SRD $6, R6, R7 // Set up loop counter
|
||||||
MOVD R7, CTR
|
MOVD R7, CTR
|
||||||
MOVD $16, R10
|
MOVD $16, R10
|
||||||
ANDCC $31, R6, R9 // Check for tailing bytes for later
|
MOVD $32, R14
|
||||||
loop32:
|
MOVD $48, R15
|
||||||
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
|
ANDCC $63, R6, R9 // Check for tailing bytes for later
|
||||||
LXVD2X (R4)(R10), VS34
|
PCALIGN $16
|
||||||
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
|
// Case for >= 64 bytes
|
||||||
LXVD2X (R5)(R10), VS35
|
// Process 64 bytes per iteration
|
||||||
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
|
// Load 4 vectors of a and b
|
||||||
XXLXOR VS34, VS35, VS34
|
// XOR the corresponding vectors
|
||||||
STXVD2X VS32, (R3)(R8) // Store to dst
|
// from a and b and store the result
|
||||||
STXVD2X VS34, (R3)(R10)
|
loop64:
|
||||||
ADD $32, R8 // Update index
|
LXVD2X (R4)(R8), VS32
|
||||||
ADD $32, R10
|
LXVD2X (R4)(R10), VS34
|
||||||
BC 16, 0, loop32 // bdnz loop16
|
LXVD2X (R4)(R14), VS36
|
||||||
|
LXVD2X (R4)(R15), VS38
|
||||||
BEQ CR0, done
|
LXVD2X (R5)(R8), VS33
|
||||||
|
LXVD2X (R5)(R10), VS35
|
||||||
MOVD R9, R6
|
LXVD2X (R5)(R14), VS37
|
||||||
CMP R6, $8
|
LXVD2X (R5)(R15), VS39
|
||||||
BLT small
|
XXLXOR VS32, VS33, VS32
|
||||||
|
XXLXOR VS34, VS35, VS34
|
||||||
|
XXLXOR VS36, VS37, VS36
|
||||||
|
XXLXOR VS38, VS39, VS38
|
||||||
|
STXVD2X VS32, (R3)(R8)
|
||||||
|
STXVD2X VS34, (R3)(R10)
|
||||||
|
STXVD2X VS36, (R3)(R14)
|
||||||
|
STXVD2X VS38, (R3)(R15)
|
||||||
|
ADD $64, R8
|
||||||
|
ADD $64, R10
|
||||||
|
ADD $64, R14
|
||||||
|
ADD $64, R15
|
||||||
|
BDNZ loop64
|
||||||
|
BC 12,2,LR // BEQLR
|
||||||
|
MOVD R9, R6
|
||||||
|
CMP R6, $8
|
||||||
|
BLE small
|
||||||
|
// Case for 8 <= n < 64 bytes
|
||||||
|
// Process 32 bytes if available
|
||||||
|
xor32:
|
||||||
|
CMP R6, $32
|
||||||
|
BLT xor16
|
||||||
|
ADD $16, R8, R9
|
||||||
|
LXVD2X (R4)(R8), VS32
|
||||||
|
LXVD2X (R4)(R9), VS33
|
||||||
|
LXVD2X (R5)(R8), VS34
|
||||||
|
LXVD2X (R5)(R9), VS35
|
||||||
|
XXLXOR VS32, VS34, VS32
|
||||||
|
XXLXOR VS33, VS35, VS33
|
||||||
|
STXVD2X VS32, (R3)(R8)
|
||||||
|
STXVD2X VS33, (R3)(R9)
|
||||||
|
ADD $32, R8
|
||||||
|
ADD $-32, R6
|
||||||
|
CMP R6, $8
|
||||||
|
BLE small
|
||||||
|
// Case for 8 <= n < 32 bytes
|
||||||
|
// Process 16 bytes if available
|
||||||
xor16:
|
xor16:
|
||||||
CMP R6, $16
|
CMP R6, $16
|
||||||
BLT xor8
|
BLT xor8
|
||||||
LXVD2X (R4)(R8), VS32
|
LXVD2X (R4)(R8), VS32
|
||||||
LXVD2X (R5)(R8), VS33
|
LXVD2X (R5)(R8), VS33
|
||||||
XXLXOR VS32, VS33, VS32
|
XXLXOR VS32, VS33, VS32
|
||||||
STXVD2X VS32, (R3)(R8)
|
STXVD2X VS32, (R3)(R8)
|
||||||
ADD $16, R8
|
ADD $16, R8
|
||||||
ADD $-16, R6
|
ADD $-16, R6
|
||||||
CMP R6, $8
|
|
||||||
BLT small
|
|
||||||
xor8:
|
|
||||||
// Case for 8 ≤ n < 16 bytes
|
|
||||||
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
|
|
||||||
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
|
|
||||||
XOR R14, R15, R16 // R16 = a[] ^ b[]
|
|
||||||
SUB $8, R6 // n = n - 8
|
|
||||||
MOVD R16, (R3)(R8) // Store to dst
|
|
||||||
ADD $8, R8
|
|
||||||
|
|
||||||
// Check if we're finished
|
|
||||||
CMP R6, R0
|
|
||||||
BGT small
|
|
||||||
RET
|
|
||||||
|
|
||||||
// Case for n < 8 bytes and tailing bytes from the
|
|
||||||
// previous cases.
|
|
||||||
small:
|
small:
|
||||||
CMP R6, R0
|
CMP R6, R0
|
||||||
BEQ done
|
BC 12,2,LR // BEQLR
|
||||||
MOVD R6, CTR // Setup loop counter
|
xor8:
|
||||||
|
#ifdef GOPPC64_power10
|
||||||
loop:
|
SLD $56,R6,R17
|
||||||
|
ADD R4,R8,R18
|
||||||
|
ADD R5,R8,R19
|
||||||
|
ADD R3,R8,R20
|
||||||
|
LXVL R18,R17,V0
|
||||||
|
LXVL R19,R17,V1
|
||||||
|
VXOR V0,V1,V1
|
||||||
|
STXVL V1,R20,R17
|
||||||
|
RET
|
||||||
|
#else
|
||||||
|
CMP R6, $8
|
||||||
|
BLT xor4
|
||||||
|
// Case for 8 ≤ n < 16 bytes
|
||||||
|
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
|
||||||
|
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
|
||||||
|
XOR R14, R15, R16 // R16 = a[] ^ b[]
|
||||||
|
SUB $8, R6 // n = n - 8
|
||||||
|
MOVD R16, (R3)(R8) // Store to dst
|
||||||
|
ADD $8, R8
|
||||||
|
xor4:
|
||||||
|
CMP R6, $4
|
||||||
|
BLT xor2
|
||||||
|
MOVWZ (R4)(R8), R14
|
||||||
|
MOVWZ (R5)(R8), R15
|
||||||
|
XOR R14, R15, R16
|
||||||
|
MOVW R16, (R3)(R8)
|
||||||
|
ADD $4,R8
|
||||||
|
ADD $-4,R6
|
||||||
|
xor2:
|
||||||
|
CMP R6, $2
|
||||||
|
BLT xor1
|
||||||
|
MOVHZ (R4)(R8), R14
|
||||||
|
MOVHZ (R5)(R8), R15
|
||||||
|
XOR R14, R15, R16
|
||||||
|
MOVH R16, (R3)(R8)
|
||||||
|
ADD $2,R8
|
||||||
|
ADD $-2,R6
|
||||||
|
xor1:
|
||||||
|
CMP R6, R0
|
||||||
|
BC 12,2,LR // BEQLR
|
||||||
MOVBZ (R4)(R8), R14 // R14 = a[i]
|
MOVBZ (R4)(R8), R14 // R14 = a[i]
|
||||||
MOVBZ (R5)(R8), R15 // R15 = b[i]
|
MOVBZ (R5)(R8), R15 // R15 = b[i]
|
||||||
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
|
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
|
||||||
MOVB R16, (R3)(R8) // Store to dst
|
MOVB R16, (R3)(R8) // Store to dst
|
||||||
ADD $1, R8
|
#endif
|
||||||
BC 16, 0, loop // bdnz loop
|
|
||||||
|
|
||||||
done:
|
done:
|
||||||
RET
|
RET
|
||||||
|
Loading…
Reference in New Issue
Block a user