From 9b61b0054b34760506efe5a7763594d0188defce Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Mon, 25 Sep 2023 10:49:11 -0500
Subject: [PATCH] crypto/subtle: improve xorBytes assembler on PPC64

This makes some improvements to the xorBytes assembler implementation
for PPC64 targets. The loops that process large streams of bytes have
been changed to do 64 bytes at a time. Other changes were made to
prevent degradations at some of the common sizes like 8 and 16 bytes.

The case for < 8 bytes on power10 has been modified to use the LXVL
and STXVL instructions.

Change-Id: I7477d12d5375d484af8c274443d595ccdafbda7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/530877
Reviewed-by: Paul Murphy
TryBot-Result: Gopher Robot
Reviewed-by: Jayanth Krishnamurthy
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Pratt
Reviewed-by: Benny Siegert
Run-TryBot: Lynn Boger
---
 src/crypto/subtle/xor_ppc64x.s | 173 ++++++++++++++++++++++-----------
 1 file changed, 114 insertions(+), 59 deletions(-)

diff --git a/src/crypto/subtle/xor_ppc64x.s b/src/crypto/subtle/xor_ppc64x.s
index 72bb80d246..0de4350cb2 100644
--- a/src/crypto/subtle/xor_ppc64x.s
+++ b/src/crypto/subtle/xor_ppc64x.s
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 
-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
-	BLT	CR6, small	// Smaller than 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
 
-	// Case for n ≥ 32 bytes
-preloop32:
-	SRD	$5, R6, R7	// Setup loop counter
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
-loop32:
-	LXVD2X	(R4)(R8), VS32	// VS32 = a[i,...,i+15]
-	LXVD2X	(R4)(R10), VS34
-	LXVD2X	(R5)(R8), VS33	// VS33 = b[i,...,i+15]
-	LXVD2X	(R5)(R10), VS35
-	XXLXOR	VS32, VS33, VS32 // VS34 = a[] ^ b[]
-	XXLXOR	VS34, VS35, VS34
-	STXVD2X	VS32, (R3)(R8)	// Store to dst
-	STXVD2X	VS34, (R3)(R10)
-	ADD	$32, R8		// Update index
-	ADD	$32, R10
-	BC	16, 0, loop32	// bdnz loop16
-
-	BEQ	CR0, done
-
-	MOVD	R9, R6
-	CMP	R6, $8
-	BLT	small
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN	$16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12,2,LR		// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
 xor16:
-	CMP	R6, $16
-	BLT	xor8
-	LXVD2X	(R4)(R8), VS32
-	LXVD2X	(R5)(R8), VS33
-	XXLXOR	VS32, VS33, VS32
-	STXVD2X	VS32, (R3)(R8)
-	ADD	$16, R8
-	ADD	$-16, R6
-	CMP	R6, $8
-	BLT	small
-xor8:
-	// Case for 8 ≤ n < 16 bytes
-	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
-	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
-	XOR	R14, R15, R16	// R16 = a[] ^ b[]
-	SUB	$8, R6		// n = n - 8
-	MOVD	R16, (R3)(R8)	// Store to dst
-	ADD	$8, R8
-
-	// Check if we're finished
-	CMP	R6, R0
-	BGT	small
-	RET
-
-	// Case for n < 8 bytes and tailing bytes from the
-	// previous cases.
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
 small:
 	CMP	R6, R0
-	BEQ	done
-	MOVD	R6, CTR		// Setup loop counter
-
-loop:
+	BC	12,2,LR		// BEQLR
+xor8:
+#ifdef GOPPC64_power10
+	SLD	$56,R6,R17
+	ADD	R4,R8,R18
+	ADD	R5,R8,R19
+	ADD	R3,R8,R20
+	LXVL	R18,R17,V0
+	LXVL	R19,R17,V1
+	VXOR	V0,V1,V1
+	STXVL	V1,R20,R17
+	RET
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
+	XOR	R14, R15, R16	// R16 = a[] ^ b[]
+	SUB	$8, R6		// n = n - 8
+	MOVD	R16, (R3)(R8)	// Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4,R8
+	ADD	$-4,R6
+xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2,R8
+	ADD	$-2,R6
+xor1:
+	CMP	R6, R0
+	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
-	BC	16, 0, loop	// bdnz loop
-
+#endif
 done:
 	RET
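
For context (not part of the CL): on ppc64/ppc64le builds without the purego
tag, the xorBytes routine patched above backs the exported
crypto/subtle.XORBytes. The sketch below is a hypothetical benchmark, written
to illustrate how the paths touched by this change could be exercised from Go;
the file name, function name, and size list are assumptions chosen for this
note, not part of the CL, and the package's own benchmarks in crypto/subtle
remain the authoritative ones.

    // xorbytes_sizes_test.go - illustrative sketch only; assumes Go 1.20+
    // for subtle.XORBytes.
    package subtle_test

    import (
            "crypto/subtle"
            "fmt"
            "testing"
    )

    // Sizes picked to hit each path in the assembler: the sub-8-byte tail
    // (LXVL/STXVL on power10), the 8- and 16-byte cases, the 32-byte case,
    // and the 64-bytes-per-iteration main loop.
    var xorSizes = []int{5, 8, 16, 32, 64, 1024}

    func BenchmarkXORBytesSizes(b *testing.B) {
            for _, n := range xorSizes {
                    b.Run(fmt.Sprintf("%dB", n), func(b *testing.B) {
                            x := make([]byte, n)
                            y := make([]byte, n)
                            dst := make([]byte, n)
                            b.SetBytes(int64(n))
                            for i := 0; i < b.N; i++ {
                                    // dst[i] = x[i] ^ y[i]; dispatches to the
                                    // assembler xorBytes on ppc64/ppc64le.
                                    subtle.XORBytes(dst, x, y)
                            }
                    })
            }
    }

Running something like "go test crypto/subtle -bench=XORBytes" before and
after the patch on POWER8/9/10 hardware (and comparing with benchstat) would
show the effect of the 64-byte loop and, on power10, of the LXVL/STXVL tail
handling.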