diff --git a/src/crypto/subtle/xor_ppc64x.s b/src/crypto/subtle/xor_ppc64x.s
index 72bb80d246..0de4350cb2 100644
--- a/src/crypto/subtle/xor_ppc64x.s
+++ b/src/crypto/subtle/xor_ppc64x.s
@@ -13,75 +13,130 @@ TEXT ·xorBytes(SB), NOSPLIT, $0
 	MOVD	b+16(FP), R5	// R5 = b
 	MOVD	n+24(FP), R6	// R6 = n
 
-	CMPU	R6, $32, CR7	// Check if n ≥ 32 bytes
+	CMPU	R6, $64, CR7	// Check if n ≥ 64 bytes
 	MOVD	R0, R8		// R8 = index
-	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 32 bytes
-	BLT	CR6, small	// Smaller than 8
-	BLT	CR7, xor16	// Case for 16 ≤ n < 32 bytes
+	CMPU	R6, $8, CR6	// Check if 8 ≤ n < 64 bytes
+	BLE	CR6, small	// <= 8
+	BLT	CR7, xor32	// Case for 32 ≤ n < 64 bytes
 
-	// Case for n ≥ 32 bytes
-preloop32:
-	SRD	$5, R6, R7	// Setup loop counter
+	// Case for n ≥ 64 bytes
+preloop64:
+	SRD	$6, R6, R7	// Set up loop counter
 	MOVD	R7, CTR
 	MOVD	$16, R10
-	ANDCC	$31, R6, R9	// Check for tailing bytes for later
-loop32:
-	LXVD2X	(R4)(R8), VS32		// VS32 = a[i,...,i+15]
-	LXVD2X	(R4)(R10), VS34
-	LXVD2X	(R5)(R8), VS33		// VS33 = b[i,...,i+15]
-	LXVD2X	(R5)(R10), VS35
-	XXLXOR	VS32, VS33, VS32	// VS34 = a[] ^ b[]
-	XXLXOR	VS34, VS35, VS34
-	STXVD2X	VS32, (R3)(R8)		// Store to dst
-	STXVD2X	VS34, (R3)(R10)
-	ADD	$32, R8			// Update index
-	ADD	$32, R10
-	BC	16, 0, loop32		// bdnz loop16
-
-	BEQ	CR0, done
-
-	MOVD	R9, R6
-	CMP	R6, $8
-	BLT	small
+	MOVD	$32, R14
+	MOVD	$48, R15
+	ANDCC	$63, R6, R9	// Check for tailing bytes for later
+	PCALIGN	$16
+	// Case for >= 64 bytes
+	// Process 64 bytes per iteration
+	// Load 4 vectors of a and b
+	// XOR the corresponding vectors
+	// from a and b and store the result
+loop64:
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R10), VS34
+	LXVD2X	(R4)(R14), VS36
+	LXVD2X	(R4)(R15), VS38
+	LXVD2X	(R5)(R8), VS33
+	LXVD2X	(R5)(R10), VS35
+	LXVD2X	(R5)(R14), VS37
+	LXVD2X	(R5)(R15), VS39
+	XXLXOR	VS32, VS33, VS32
+	XXLXOR	VS34, VS35, VS34
+	XXLXOR	VS36, VS37, VS36
+	XXLXOR	VS38, VS39, VS38
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS34, (R3)(R10)
+	STXVD2X	VS36, (R3)(R14)
+	STXVD2X	VS38, (R3)(R15)
+	ADD	$64, R8
+	ADD	$64, R10
+	ADD	$64, R14
+	ADD	$64, R15
+	BDNZ	loop64
+	BC	12,2,LR		// BEQLR
+	MOVD	R9, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 64 bytes
+	// Process 32 bytes if available
+xor32:
+	CMP	R6, $32
+	BLT	xor16
+	ADD	$16, R8, R9
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R4)(R9), VS33
+	LXVD2X	(R5)(R8), VS34
+	LXVD2X	(R5)(R9), VS35
+	XXLXOR	VS32, VS34, VS32
+	XXLXOR	VS33, VS35, VS33
+	STXVD2X	VS32, (R3)(R8)
+	STXVD2X	VS33, (R3)(R9)
+	ADD	$32, R8
+	ADD	$-32, R6
+	CMP	R6, $8
+	BLE	small
+	// Case for 8 <= n < 32 bytes
+	// Process 16 bytes if available
 xor16:
-	CMP	R6, $16
-	BLT	xor8
-	LXVD2X	(R4)(R8), VS32
-	LXVD2X	(R5)(R8), VS33
-	XXLXOR	VS32, VS33, VS32
-	STXVD2X	VS32, (R3)(R8)
-	ADD	$16, R8
-	ADD	$-16, R6
-	CMP	R6, $8
-	BLT	small
-xor8:
-	// Case for 8 ≤ n < 16 bytes
-	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
-	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
-	XOR	R14, R15, R16	// R16 = a[] ^ b[]
-	SUB	$8, R6		// n = n - 8
-	MOVD	R16, (R3)(R8)	// Store to dst
-	ADD	$8, R8
-
-	// Check if we're finished
-	CMP	R6, R0
-	BGT	small
-	RET
-
-	// Case for n < 8 bytes and tailing bytes from the
-	// previous cases.
+	CMP	R6, $16
+	BLT	xor8
+	LXVD2X	(R4)(R8), VS32
+	LXVD2X	(R5)(R8), VS33
+	XXLXOR	VS32, VS33, VS32
+	STXVD2X	VS32, (R3)(R8)
+	ADD	$16, R8
+	ADD	$-16, R6
 small:
 	CMP	R6, R0
-	BEQ	done
-	MOVD	R6, CTR	// Setup loop counter
-
-loop:
+	BC	12,2,LR		// BEQLR
+xor8:
+#ifdef GOPPC64_power10
+	SLD	$56,R6,R17
+	ADD	R4,R8,R18
+	ADD	R5,R8,R19
+	ADD	R3,R8,R20
+	LXVL	R18,R17,V0
+	LXVL	R19,R17,V1
+	VXOR	V0,V1,V1
+	STXVL	V1,R20,R17
+	RET
+#else
+	CMP	R6, $8
+	BLT	xor4
+	// Case for 8 ≤ n < 16 bytes
+	MOVD	(R4)(R8), R14	// R14 = a[i,...,i+7]
+	MOVD	(R5)(R8), R15	// R15 = b[i,...,i+7]
+	XOR	R14, R15, R16	// R16 = a[] ^ b[]
+	SUB	$8, R6		// n = n - 8
+	MOVD	R16, (R3)(R8)	// Store to dst
+	ADD	$8, R8
+xor4:
+	CMP	R6, $4
+	BLT	xor2
+	MOVWZ	(R4)(R8), R14
+	MOVWZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVW	R16, (R3)(R8)
+	ADD	$4,R8
+	ADD	$-4,R6
xor2:
+	CMP	R6, $2
+	BLT	xor1
+	MOVHZ	(R4)(R8), R14
+	MOVHZ	(R5)(R8), R15
+	XOR	R14, R15, R16
+	MOVH	R16, (R3)(R8)
+	ADD	$2,R8
+	ADD	$-2,R6
+xor1:
+	CMP	R6, R0
+	BC	12,2,LR		// BEQLR
 	MOVBZ	(R4)(R8), R14	// R14 = a[i]
 	MOVBZ	(R5)(R8), R15	// R15 = b[i]
 	XOR	R14, R15, R16	// R16 = a[i] ^ b[i]
 	MOVB	R16, (R3)(R8)	// Store to dst
-	ADD	$1, R8
-	BC	16, 0, loop	// bdnz loop
-
+#endif
 done:
 	RET
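
For reference, below is a portable Go sketch of the control flow the revised assembly implements: a 64-byte main loop (loop64), then at most one 32-, 16- and 8-byte step, then the 4/2/1-byte tails. This is an illustration only; the name xorBytesRef is hypothetical, it is not the fallback crypto/subtle ships, and the 8-byte word XORs stand in for the four 16-byte vector XORs the assembly performs per iteration.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// xorBytesRef mirrors the structure of the assembly: loop64, then
// xor32/xor16/xor8 (each runs at most once, since the remainder is
// < 64 after the main loop), then the xor4/xor2/xor1 tails.
func xorBytesRef(dst, a, b []byte) {
	n := len(dst)
	i := 0
	// loop64: 64 bytes per iteration (four vector XORs in the
	// assembly; eight 8-byte word XORs stand in here).
	for ; n-i >= 64; i += 64 {
		for j := i; j < i+64; j += 8 {
			binary.LittleEndian.PutUint64(dst[j:],
				binary.LittleEndian.Uint64(a[j:])^binary.LittleEndian.Uint64(b[j:]))
		}
	}
	// xor32, xor16, xor8: at most one chunk of each size remains.
	for _, w := range []int{32, 16, 8} {
		if n-i >= w {
			for j := i; j < i+w; j += 8 {
				binary.LittleEndian.PutUint64(dst[j:],
					binary.LittleEndian.Uint64(a[j:])^binary.LittleEndian.Uint64(b[j:]))
			}
			i += w
		}
	}
	// xor4
	if n-i >= 4 {
		binary.LittleEndian.PutUint32(dst[i:],
			binary.LittleEndian.Uint32(a[i:])^binary.LittleEndian.Uint32(b[i:]))
		i += 4
	}
	// xor2
	if n-i >= 2 {
		binary.LittleEndian.PutUint16(dst[i:],
			binary.LittleEndian.Uint16(a[i:])^binary.LittleEndian.Uint16(b[i:]))
		i += 2
	}
	// xor1
	if n-i >= 1 {
		dst[i] = a[i] ^ b[i]
	}
}

func main() {
	a := bytes.Repeat([]byte{0xAA}, 100)
	b := bytes.Repeat([]byte{0x0F}, 100)
	dst := make([]byte, 100)
	xorBytesRef(dst, a, b)
	fmt.Printf("%x\n", dst[:4]) // a5a5a5a5
}

On POWER10 builds (the GOPPC64_power10 path above), the remaining sub-16-byte tail collapses into a single LXVL/VXOR/STXVL sequence with no further branching: the SLD $56 places the residual byte count in the high-order byte of R17, which is where the variable-length vector load/store instructions expect the transfer length.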