1
0
mirror of https://github.com/golang/go synced 2024-11-18 09:54:57 -07:00

crypto/subtle: improve xorBytes assembler on PPC64

This makes some improvements to the xorBytes assembler
implementation for PPC64 targets.

The loop that processes large streams of bytes has been changed to
handle 64 bytes per iteration. Other changes were made to prevent
performance degradations for some of the common sizes, such as 8 and 16 bytes.

The case for < 8 bytes on power10 has been modified to use
the LXVL and STXVL instructions.

Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/530877
Reviewed-by: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Jayanth Krishnamurthy <jayanth.krishnamurthy@ibm.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Benny Siegert <bsiegert@gmail.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
This commit is contained in:
Lynn Boger 2023-09-25 10:49:11 -05:00
parent be3d5fb6e6
commit 9b61b0054b

View File

@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
MOVD b+16(FP), R5 // R5 = b MOVD b+16(FP), R5 // R5 = b
MOVD n+24(FP), R6 // R6 = n MOVD n+24(FP), R6 // R6 = n
CMPU R6, $32, CR7 // Check if n ≥ 32 bytes CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
MOVD R0, R8 // R8 = index MOVD R0, R8 // R8 = index
CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
BLT CR6, small // Smaller than 8 BLE CR6, small // <= 8
BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
// Case for n ≥ 32 bytes // Case for n ≥ 64 bytes
preloop32: preloop64:
SRD $5, R6, R7 // Setup loop counter SRD $6, R6, R7 // Set up loop counter
MOVD R7, CTR MOVD R7, CTR
MOVD $16, R10 MOVD $16, R10
ANDCC $31, R6, R9 // Check for tailing bytes for later MOVD $32, R14
loop32: MOVD $48, R15
LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15] ANDCC $63, R6, R9 // Check for tailing bytes for later
LXVD2X (R4)(R10), VS34 PCALIGN $16
LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15] // Case for >= 64 bytes
LXVD2X (R5)(R10), VS35 // Process 64 bytes per iteration
XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[] // Load 4 vectors of a and b
XXLXOR VS34, VS35, VS34 // XOR the corresponding vectors
STXVD2X VS32, (R3)(R8) // Store to dst // from a and b and store the result
STXVD2X VS34, (R3)(R10) loop64:
ADD $32, R8 // Update index LXVD2X (R4)(R8), VS32
ADD $32, R10 LXVD2X (R4)(R10), VS34
BC 16, 0, loop32 // bdnz loop16 LXVD2X (R4)(R14), VS36
LXVD2X (R4)(R15), VS38
BEQ CR0, done LXVD2X (R5)(R8), VS33
LXVD2X (R5)(R10), VS35
MOVD R9, R6 LXVD2X (R5)(R14), VS37
CMP R6, $8 LXVD2X (R5)(R15), VS39
BLT small XXLXOR VS32, VS33, VS32
XXLXOR VS34, VS35, VS34
XXLXOR VS36, VS37, VS36
XXLXOR VS38, VS39, VS38
STXVD2X VS32, (R3)(R8)
STXVD2X VS34, (R3)(R10)
STXVD2X VS36, (R3)(R14)
STXVD2X VS38, (R3)(R15)
ADD $64, R8
ADD $64, R10
ADD $64, R14
ADD $64, R15
BDNZ loop64
BC 12,2,LR // BEQLR
MOVD R9, R6
CMP R6, $8
BLE small
// Case for 8 <= n < 64 bytes
// Process 32 bytes if available
xor32:
CMP R6, $32
BLT xor16
ADD $16, R8, R9
LXVD2X (R4)(R8), VS32
LXVD2X (R4)(R9), VS33
LXVD2X (R5)(R8), VS34
LXVD2X (R5)(R9), VS35
XXLXOR VS32, VS34, VS32
XXLXOR VS33, VS35, VS33
STXVD2X VS32, (R3)(R8)
STXVD2X VS33, (R3)(R9)
ADD $32, R8
ADD $-32, R6
CMP R6, $8
BLE small
// Case for 8 <= n < 32 bytes
// Process 16 bytes if available
xor16: xor16:
CMP R6, $16 CMP R6, $16
BLT xor8 BLT xor8
LXVD2X (R4)(R8), VS32 LXVD2X (R4)(R8), VS32
LXVD2X (R5)(R8), VS33 LXVD2X (R5)(R8), VS33
XXLXOR VS32, VS33, VS32 XXLXOR VS32, VS33, VS32
STXVD2X VS32, (R3)(R8) STXVD2X VS32, (R3)(R8)
ADD $16, R8 ADD $16, R8
ADD $-16, R6 ADD $-16, R6
CMP R6, $8
BLT small
xor8:
// Case for 8 ≤ n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
// Check if we're finished
CMP R6, R0
BGT small
RET
// Case for n < 8 bytes and tailing bytes from the
// previous cases.
small: small:
CMP R6, R0 CMP R6, R0
BEQ done BC 12,2,LR // BEQLR
MOVD R6, CTR // Setup loop counter xor8:
#ifdef GOPPC64_power10
loop: SLD $56,R6,R17
ADD R4,R8,R18
ADD R5,R8,R19
ADD R3,R8,R20
LXVL R18,R17,V0
LXVL R19,R17,V1
VXOR V0,V1,V1
STXVL V1,R20,R17
RET
#else
CMP R6, $8
BLT xor4
// Case for 8 ≤ n < 16 bytes
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
xor4:
CMP R6, $4
BLT xor2
MOVWZ (R4)(R8), R14
MOVWZ (R5)(R8), R15
XOR R14, R15, R16
MOVW R16, (R3)(R8)
ADD $4,R8
ADD $-4,R6
xor2:
CMP R6, $2
BLT xor1
MOVHZ (R4)(R8), R14
MOVHZ (R5)(R8), R15
XOR R14, R15, R16
MOVH R16, (R3)(R8)
ADD $2,R8
ADD $-2,R6
xor1:
CMP R6, R0
BC 12,2,LR // BEQLR
MOVBZ (R4)(R8), R14 // R14 = a[i] MOVBZ (R4)(R8), R14 // R14 = a[i]
MOVBZ (R5)(R8), R15 // R15 = b[i] MOVBZ (R5)(R8), R15 // R15 = b[i]
XOR R14, R15, R16 // R16 = a[i] ^ b[i] XOR R14, R15, R16 // R16 = a[i] ^ b[i]
MOVB R16, (R3)(R8) // Store to dst MOVB R16, (R3)(R8) // Store to dst
ADD $1, R8 #endif
BC 16, 0, loop // bdnz loop
done: done:
RET RET