1
0
mirror of https://github.com/golang/go synced 2024-11-23 21:30:08 -07:00

crypto/aes: rewrite ppc64le block crypting

This unrolls the counter loop and simplifies the load/storing
of text/ciphertext and keys by using unaligned VSX memory
operations.

Performance delta on POWER9:

name     old time/op   new time/op    delta
Encrypt   19.9ns ± 0%    14.9ns ± 0%  -24.95%
Decrypt   19.8ns ± 0%    14.6ns ± 0%  -26.12%

Change-Id: Iba98d5c1d88c6bead45bc04c97ae64bcb6fc9f21
Reviewed-on: https://go-review.googlesource.com/c/go/+/404354
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Filippo Valsorda <valsorda@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Paul E. Murphy 2022-03-24 11:44:42 -05:00 committed by Paul Murphy
parent 9c9090eb1d
commit f495b36ccd

View File

@ -296,131 +296,189 @@ done:
// func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·encryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
// Load the arguments inside the registers
MOVD nr+0(FP), BLK_ROUNDS
MOVD xk+8(FP), BLK_KEY
MOVD dst+16(FP), BLK_OUT
MOVD src+24(FP), BLK_INP
MOVD nr+0(FP), R6 // Round count/Key size
MOVD xk+8(FP), R5 // Key pointer
MOVD dst+16(FP), R3 // Dest pointer
MOVD src+24(FP), R4 // Src pointer
MOVD $·rcon(SB), R7
LVX (R7), ESPERM // Permute value for P8_ macros.
MOVD $15, BLK_IDX // li 7,15
// Set CR{1,2,3}EQ to hold the key size information.
CMPU R6, $10, CR1
CMPU R6, $12, CR2
CMPU R6, $14, CR3
LVX (BLK_INP)(R0), ZERO // lvx 0,0,3
NEG BLK_OUT, R11 // neg 11,4
LVX (BLK_INP)(BLK_IDX), IN0 // lvx 1,7,3
LVSL (BLK_INP)(R0), IN1 // lvsl 2,0,3
VSPLTISB $0x0f, RCON // vspltisb 4,0x0f
LVSR (R11)(R0), KEY // lvsr 3,0,11
VXOR IN1, RCON, IN1 // vxor 2,2,4
MOVD $16, BLK_IDX // li 7,16
VPERM ZERO, IN0, IN1, ZERO // vperm 0,0,1,2
LVX (BLK_KEY)(R0), IN0 // lvx 1,0,5
LVSR (BLK_KEY)(R0), MASK // lvsr 5,0,5
SRW $1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
SUB $1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
MOVD $16, R6
MOVD $32, R7
MOVD $48, R8
MOVD $64, R9
MOVD $80, R10
MOVD $96, R11
MOVD $112, R12
VXOR ZERO, IN0, ZERO // vxor 0,0,1
LVX (BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
MOVD BLK_ROUNDS, CTR // mtctr 6
// Load text in BE order
P8_LXVB16X(R4, R0, V0)
loop_enc:
VPERM IN0, IN1, MASK, IN1 // vperm 2,1,2,5
VCIPHER ZERO, IN1, ZERO // vcipher 0,0,2
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
VCIPHER ZERO, IN0, ZERO // vcipher 0,0,1
LVX (BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
BC 0x10, 0, loop_enc // bdnz .Loop_enc
// V1, V2 will hold keys, V0 is a temp.
// At completion, V2 will hold the ciphertext.
// Load xk[0:3] and xor with text
P8_LXV(R0, R5, V1)
VXOR V0, V1, V0
VPERM IN0, IN1, MASK, IN1 // vperm 2,1,2,5
VCIPHER ZERO, IN1, ZERO // vcipher 0,0,2
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
VCIPHERLAST ZERO, IN0, ZERO // vcipherlast 0,0,1
// Load xk[4:11] and cipher
P8_LXV(R6, R5, V1)
P8_LXV(R7, R5, V2)
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
VSPLTISB $-1, IN1 // vspltisb 2,-1
VXOR IN0, IN0, IN0 // vxor 1,1,1
MOVD $15, BLK_IDX // li 7,15
VPERM IN1, IN0, KEY, IN1 // vperm 2,2,1,3
VXOR KEY, RCON, KEY // vxor 3,3,4
LVX (BLK_OUT)(R0), IN0 // lvx 1,0,4
VPERM ZERO, ZERO, KEY, ZERO // vperm 0,0,0,3
VSEL IN0, ZERO, IN1, IN0 // vsel 1,1,0,2
LVX (BLK_OUT)(BLK_IDX), RCON // lvx 4,7,4
STVX IN0, (BLK_OUT+R0) // stvx 1,0,4
VSEL ZERO, RCON, IN1, ZERO // vsel 0,0,4,2
STVX ZERO, (BLK_OUT+BLK_IDX) // stvx 0,7,4
// Load xk[12:19] and cipher
P8_LXV(R8, R5, V1)
P8_LXV(R9, R5, V2)
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
RET // blr
// Load xk[20:27] and cipher
P8_LXV(R10, R5, V1)
P8_LXV(R11, R5, V2)
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Increment xk pointer to reuse constant offsets in R6-R12.
ADD $112, R5
// Load xk[28:35] and cipher
P8_LXV(R0, R5, V1)
P8_LXV(R6, R5, V2)
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[36:43] and cipher
P8_LXV(R7, R5, V1)
P8_LXV(R8, R5, V2)
BEQ CR1, Ldec_tail // Key size 10?
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[44:51] and cipher
P8_LXV(R9, R5, V1)
P8_LXV(R10, R5, V2)
BEQ CR2, Ldec_tail // Key size 12?
VCIPHER V0, V1, V0
VCIPHER V0, V2, V0
// Load xk[52:59] and cipher
P8_LXV(R11, R5, V1)
P8_LXV(R12, R5, V2)
BNE CR3, Linvalid_key_len // Not key size 14?
// Fallthrough to final cipher
Ldec_tail:
// Cipher last two keys such that key information is
// cleared from V1 and V2.
VCIPHER V0, V1, V1
VCIPHERLAST V1, V2, V2
// Store the result in BE order.
P8_STXVB16X(V2, R3, R0)
RET
Linvalid_key_len:
// Segfault, this should never happen. Only 3 keys sizes are created/used.
MOVD R0, 0(R0)
RET
// func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
TEXT ·decryptBlockAsm(SB), NOSPLIT|NOFRAME, $0
// Load the arguments inside the registers
MOVD nr+0(FP), BLK_ROUNDS
MOVD xk+8(FP), BLK_KEY
MOVD dst+16(FP), BLK_OUT
MOVD src+24(FP), BLK_INP
MOVD nr+0(FP), R6 // Round count/Key size
MOVD xk+8(FP), R5 // Key pointer
MOVD dst+16(FP), R3 // Dest pointer
MOVD src+24(FP), R4 // Src pointer
MOVD $·rcon(SB), R7
LVX (R7), ESPERM // Permute value for P8_ macros.
MOVD $15, BLK_IDX // li 7,15
// Set CR{1,2,3}EQ to hold the key size information.
CMPU R6, $10, CR1
CMPU R6, $12, CR2
CMPU R6, $14, CR3
LVX (BLK_INP)(R0), ZERO // lvx 0,0,3
NEG BLK_OUT, R11 // neg 11,4
LVX (BLK_INP)(BLK_IDX), IN0 // lvx 1,7,3
LVSL (BLK_INP)(R0), IN1 // lvsl 2,0,3
VSPLTISB $0x0f, RCON // vspltisb 4,0x0f
LVSR (R11)(R0), KEY // lvsr 3,0,11
VXOR IN1, RCON, IN1 // vxor 2,2,4
MOVD $16, BLK_IDX // li 7,16
VPERM ZERO, IN0, IN1, ZERO // vperm 0,0,1,2
LVX (BLK_KEY)(R0), IN0 // lvx 1,0,5
LVSR (BLK_KEY)(R0), MASK // lvsr 5,0,5
SRW $1, BLK_ROUNDS, BLK_ROUNDS // srwi 6,6,1
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
SUB $1, BLK_ROUNDS, BLK_ROUNDS // subi 6,6,1
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
MOVD $16, R6
MOVD $32, R7
MOVD $48, R8
MOVD $64, R9
MOVD $80, R10
MOVD $96, R11
MOVD $112, R12
VXOR ZERO, IN0, ZERO // vxor 0,0,1
LVX (BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
MOVD BLK_ROUNDS, CTR // mtctr 6
// Load text in BE order
P8_LXVB16X(R4, R0, V0)
loop_dec:
VPERM IN0, IN1, MASK, IN1 // vperm 2,1,2,5
VNCIPHER ZERO, IN1, ZERO // vncipher 0,0,2
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
VNCIPHER ZERO, IN0, ZERO // vncipher 0,0,1
LVX (BLK_KEY)(BLK_IDX), IN0 // lvx 1,7,5
ADD $16, BLK_IDX, BLK_IDX // addi 7,7,16
BC 0x10, 0, loop_dec // bdnz .Loop_dec
// V1, V2 will hold keys, V0 is a temp.
// At completion, V2 will hold the text.
// Load xk[0:3] and xor with ciphertext
P8_LXV(R0, R5, V1)
VXOR V0, V1, V0
VPERM IN0, IN1, MASK, IN1 // vperm 2,1,2,5
VNCIPHER ZERO, IN1, ZERO // vncipher 0,0,2
LVX (BLK_KEY)(BLK_IDX), IN1 // lvx 2,7,5
VPERM IN1, IN0, MASK, IN0 // vperm 1,2,1,5
VNCIPHERLAST ZERO, IN0, ZERO // vncipherlast 0,0,1
// Load xk[4:11] and cipher
P8_LXV(R6, R5, V1)
P8_LXV(R7, R5, V2)
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
VSPLTISB $-1, IN1 // vspltisb 2,-1
VXOR IN0, IN0, IN0 // vxor 1,1,1
MOVD $15, BLK_IDX // li 7,15
VPERM IN1, IN0, KEY, IN1 // vperm 2,2,1,3
VXOR KEY, RCON, KEY // vxor 3,3,4
LVX (BLK_OUT)(R0), IN0 // lvx 1,0,4
VPERM ZERO, ZERO, KEY, ZERO // vperm 0,0,0,3
VSEL IN0, ZERO, IN1, IN0 // vsel 1,1,0,2
LVX (BLK_OUT)(BLK_IDX), RCON // lvx 4,7,4
STVX IN0, (BLK_OUT+R0) // stvx 1,0,4
VSEL ZERO, RCON, IN1, ZERO // vsel 0,0,4,2
STVX ZERO, (BLK_OUT+BLK_IDX) // stvx 0,7,4
// Load xk[12:19] and cipher
P8_LXV(R8, R5, V1)
P8_LXV(R9, R5, V2)
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
RET // blr
// Load xk[20:27] and cipher
P8_LXV(R10, R5, V1)
P8_LXV(R11, R5, V2)
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Increment xk pointer to reuse constant offsets in R6-R12.
ADD $112, R5
// Load xk[28:35] and cipher
P8_LXV(R0, R5, V1)
P8_LXV(R6, R5, V2)
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[36:43] and cipher
P8_LXV(R7, R5, V1)
P8_LXV(R8, R5, V2)
BEQ CR1, Ldec_tail // Key size 10?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[44:51] and cipher
P8_LXV(R9, R5, V1)
P8_LXV(R10, R5, V2)
BEQ CR2, Ldec_tail // Key size 12?
VNCIPHER V0, V1, V0
VNCIPHER V0, V2, V0
// Load xk[52:59] and cipher
P8_LXV(R11, R5, V1)
P8_LXV(R12, R5, V2)
BNE CR3, Linvalid_key_len // Not key size 14?
// Fallthrough to final cipher
Ldec_tail:
// Cipher last two keys such that key information is
// cleared from V1 and V2.
VNCIPHER V0, V1, V1
VNCIPHERLAST V1, V2, V2
// Store the result in BE order.
P8_STXVB16X(V2, R3, R0)
RET
Linvalid_key_len:
// Segfault, this should never happen. Only 3 keys sizes are created/used.
MOVD R0, 0(R0)
RET
// Remove defines from above so they can be defined here
#undef INP