mirror of
https://github.com/golang/go
synced 2024-11-23 14:50:07 -07:00
crypto/aes,crypto/cipher: improve gcm performance on ppc64x
This improves performance for AES-GCM. The function counterCrypt is written in assembler so the loop can be unrolled and the stitched approach used for the encryption. This implementation works on ppc64le and ppc64. The use of GOPPC64=power9 generates the best performance, goos: linux goarch: ppc64le pkg: crypto/cipher cpu: POWER10 │ gcmx8.cpu1.out │ gcmx8.new.cpu1.out │ │ sec/op │ sec/op vs base │ AESGCM/Open-128-64 180.5n ± 0% 152.7n ± 1% -15.43% (p=0.000 n=8) AESGCM/Seal-128-64 166.8n ± 0% 147.2n ± 0% -11.72% (p=0.000 n=8) AESGCM/Open-256-64 194.9n ± 0% 156.6n ± 1% -19.65% (p=0.000 n=8) AESGCM/Seal-256-64 183.7n ± 0% 157.0n ± 0% -14.51% (p=0.000 n=8) AESGCM/Open-128-1350 1769.5n ± 0% 454.5n ± 0% -74.31% (p=0.000 n=8) AESGCM/Seal-128-1350 1759.0n ± 0% 453.7n ± 0% -74.21% (p=0.000 n=8) AESGCM/Open-256-1350 2104.0n ± 0% 504.4n ± 1% -76.03% (p=0.000 n=8) AESGCM/Seal-256-1350 2092.0n ± 0% 503.0n ± 0% -75.96% (p=0.000 n=8) AESGCM/Open-128-8192 10.054µ ± 0% 1.961µ ± 0% -80.50% (p=0.000 n=8) AESGCM/Seal-128-8192 10.050µ ± 0% 1.965µ ± 0% -80.45% (p=0.000 n=8) AESGCM/Open-256-8192 12.080µ ± 0% 2.232µ ± 0% -81.52% (p=0.000 n=8) AESGCM/Seal-256-8192 12.069µ ± 0% 2.238µ ± 0% -81.46% (p=0.000 n=8) geomean 1.566µ 535.5n -65.80% │ gcmx8.cpu1.out │ gcmx8.new.cpu1.out │ │ B/s │ B/s vs base │ AESGCM/Open-128-64 338.1Mi ± 0% 399.8Mi ± 1% +18.27% (p=0.000 n=8) AESGCM/Seal-128-64 366.1Mi ± 0% 414.6Mi ± 0% +13.28% (p=0.000 n=8) AESGCM/Open-256-64 313.1Mi ± 0% 389.7Mi ± 0% +24.47% (p=0.000 n=8) AESGCM/Seal-256-64 332.3Mi ± 0% 388.5Mi ± 0% +16.93% (p=0.000 n=8) AESGCM/Open-128-1350 727.6Mi ± 0% 2832.8Mi ± 0% +289.33% (p=0.000 n=8) AESGCM/Seal-128-1350 732.0Mi ± 0% 2837.8Mi ± 0% +287.70% (p=0.000 n=8) AESGCM/Open-256-1350 611.9Mi ± 0% 2552.6Mi ± 0% +317.18% (p=0.000 n=8) AESGCM/Seal-256-1350 615.3Mi ± 0% 2559.6Mi ± 0% +315.97% (p=0.000 n=8) AESGCM/Open-128-8192 777.1Mi ± 0% 3983.5Mi ± 0% +412.63% (p=0.000 n=8) AESGCM/Seal-128-8192 777.3Mi ± 0% 3975.9Mi ± 0% +411.47% (p=0.000 n=8) AESGCM/Open-256-8192 646.7Mi ± 0% 3500.6Mi ± 0% +441.27% (p=0.000 n=8) AESGCM/Seal-256-8192 647.3Mi ± 0% 3491.1Mi ± 0% +439.30% (p=0.000 n=8) geomean 542.7Mi 1.550Gi +192.42% Change-Id: I3600831a263ec8a99b5e3bdd495eb36e966d8075 Reviewed-on: https://go-review.googlesource.com/c/go/+/484575 Reviewed-by: Roland Shoemaker <roland@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Paul Murphy <murp@ibm.com> Reviewed-by: Than McIntosh <thanm@google.com>
This commit is contained in:
parent
e0948d825d
commit
a8ca649bbe
@ -51,6 +51,8 @@ type gcmAsm struct {
|
||||
tagSize int
|
||||
}
|
||||
|
||||
func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
|
||||
|
||||
// NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
|
||||
// called by crypto/cipher.NewGCM via the gcmAble interface.
|
||||
func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
|
||||
@ -114,34 +116,10 @@ func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
|
||||
// into out. counter is the initial count value and will be updated with the next
|
||||
// count value. The length of out must be greater than or equal to the length
|
||||
// of in.
|
||||
// counterCryptASM implements counterCrypt which then allows the loop to
|
||||
// be unrolled and optimized.
|
||||
func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
|
||||
var mask [gcmBlockSize]byte
|
||||
|
||||
for len(in) >= gcmBlockSize {
|
||||
// Hint to avoid bounds check
|
||||
_, _ = in[15], out[15]
|
||||
g.cipher.Encrypt(mask[:], counter[:])
|
||||
gcmInc32(counter)
|
||||
|
||||
// XOR 16 bytes each loop iteration in 8 byte chunks
|
||||
in0 := binary.LittleEndian.Uint64(in[0:])
|
||||
in1 := binary.LittleEndian.Uint64(in[8:])
|
||||
m0 := binary.LittleEndian.Uint64(mask[:8])
|
||||
m1 := binary.LittleEndian.Uint64(mask[8:])
|
||||
binary.LittleEndian.PutUint64(out[:8], in0^m0)
|
||||
binary.LittleEndian.PutUint64(out[8:], in1^m1)
|
||||
out = out[16:]
|
||||
in = in[16:]
|
||||
}
|
||||
|
||||
if len(in) > 0 {
|
||||
g.cipher.Encrypt(mask[:], counter[:])
|
||||
gcmInc32(counter)
|
||||
// XOR leftover bytes
|
||||
for i, inb := range in {
|
||||
out[i] = inb ^ mask[i]
|
||||
}
|
||||
}
|
||||
counterCryptASM(len(g.cipher.enc)/4-1, out, in, counter, &g.cipher.enc[0])
|
||||
}
|
||||
|
||||
// increments the rightmost 32-bits of the count value by 1.
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
//go:build ppc64 || ppc64le
|
||||
|
||||
// Based on CRYPTOGAMS code with the following comment:
|
||||
// Portions based on CRYPTOGAMS code with the following comment:
|
||||
// # ====================================================================
|
||||
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
// # project. The module is, however, dual licensed under OpenSSL and
|
||||
@ -12,13 +12,17 @@
|
||||
// # details see http://www.openssl.org/~appro/cryptogams/.
|
||||
// # ====================================================================
|
||||
|
||||
// This implementation is based on the ppc64 asm generated by the
|
||||
// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
|
||||
// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
|
||||
// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
|
||||
// from commit d47afb3c.
|
||||
|
||||
// Changes were made due to differences in the ABI and some register usage.
|
||||
// Some arguments were changed due to the way the Go code passes them.
|
||||
|
||||
// Portions that use the stitched AES-GCM approach in counterCryptASM
|
||||
// are based on code found in
|
||||
// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
#define XIP R3
|
||||
@ -87,6 +91,292 @@
|
||||
|
||||
#define VIN0 VIN
|
||||
|
||||
#define ESPERM V10
|
||||
#define TMP2 V11
|
||||
|
||||
// The following macros provide appropriate
|
||||
// implementations for endianness as well as
|
||||
// ISA specific for power8 and power9.
|
||||
#ifdef GOARCH_ppc64le
|
||||
# ifdef GOPPC64_power9
|
||||
#define P8_LXVB16X(RA,RB,VT) LXVB16X (RA)(RB), VT
|
||||
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
|
||||
# else
|
||||
#define NEEDS_ESPERM
|
||||
#define P8_LXVB16X(RA,RB,VT) \
|
||||
LXVD2X (RA+RB), VT \
|
||||
VPERM VT, VT, ESPERM, VT
|
||||
|
||||
#define P8_STXVB16X(VS,RA,RB) \
|
||||
VPERM VS, VS, ESPERM, TMP2; \
|
||||
STXVD2X TMP2, (RA+RB)
|
||||
|
||||
# endif
|
||||
#else
|
||||
#define P8_LXVB16X(RA,RB,VT) \
|
||||
LXVD2X (RA+RB), VT
|
||||
|
||||
#define P8_STXVB16X(VS,RA,RB) \
|
||||
STXVD2X VS, (RA+RB)
|
||||
|
||||
#endif
|
||||
|
||||
#define MASK_PTR R8
|
||||
|
||||
#define MASKV V0
|
||||
#define INV V1
|
||||
|
||||
// The following macros are used for
|
||||
// the stitched implementation within
|
||||
// counterCryptASM.
|
||||
|
||||
// Load the initial GCM counter value
|
||||
// in V30 and set up the counter increment
|
||||
// in V31
|
||||
#define SETUP_COUNTER \
|
||||
P8_LXVB16X(COUNTER, R0, V30); \
|
||||
VSPLTISB $1, V28; \
|
||||
VXOR V31, V31, V31; \
|
||||
VSLDOI $1, V31, V28, V31
|
||||
|
||||
// These macros set up the initial value
|
||||
// for a single encryption, or 4 or 8
|
||||
// stitched encryptions implemented
|
||||
// with interleaving vciphers.
|
||||
//
|
||||
// The input value for each encryption
|
||||
// is generated by XORing the counter
|
||||
// from V30 with the first key in VS0
|
||||
// and incrementing the counter.
|
||||
//
|
||||
// Single encryption in V15
|
||||
#define GEN_VCIPHER_INPUT \
|
||||
XXLOR VS0, VS0, V29 \
|
||||
VXOR V30, V29, V15; \
|
||||
VADDUWM V30, V31, V30
|
||||
|
||||
// 4 encryptions in V15 - V18
|
||||
#define GEN_VCIPHER_4_INPUTS \
|
||||
XXLOR VS0, VS0, V29; \
|
||||
VXOR V30, V29, V15; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V16; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V17; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V18; \
|
||||
VADDUWM V30, V31, V30
|
||||
|
||||
// 8 encryptions in V15 - V22
|
||||
#define GEN_VCIPHER_8_INPUTS \
|
||||
XXLOR VS0, VS0, V29; \
|
||||
VXOR V30, V29, V15; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V16; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V17; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V18; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V19; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V20; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V21; \
|
||||
VADDUWM V30, V31, V30; \
|
||||
VXOR V30, V29, V22; \
|
||||
VADDUWM V30, V31, V30
|
||||
|
||||
// Load the keys to be used for
|
||||
// encryption based on key_len.
|
||||
// Keys are in VS0 - VS14
|
||||
// depending on key_len.
|
||||
// Valid keys sizes are verified
|
||||
// here. CR2 is set and used
|
||||
// throughout to check key_len.
|
||||
#define LOAD_KEYS(blk_key, key_len) \
|
||||
MOVD $16, R16; \
|
||||
MOVD $32, R17; \
|
||||
MOVD $48, R18; \
|
||||
MOVD $64, R19; \
|
||||
LXVD2X (blk_key)(R0), VS0; \
|
||||
LXVD2X (blk_key)(R16), VS1; \
|
||||
LXVD2X (blk_key)(R17), VS2; \
|
||||
LXVD2X (blk_key)(R18), VS3; \
|
||||
LXVD2X (blk_key)(R19), VS4; \
|
||||
ADD $64, R16; \
|
||||
ADD $64, R17; \
|
||||
ADD $64, R18; \
|
||||
ADD $64, R19; \
|
||||
LXVD2X (blk_key)(R16), VS5; \
|
||||
LXVD2X (blk_key)(R17), VS6; \
|
||||
LXVD2X (blk_key)(R18), VS7; \
|
||||
LXVD2X (blk_key)(R19), VS8; \
|
||||
ADD $64, R16; \
|
||||
ADD $64, R17; \
|
||||
ADD $64, R18; \
|
||||
ADD $64, R19; \
|
||||
LXVD2X (blk_key)(R16), VS9; \
|
||||
LXVD2X (blk_key)(R17), VS10; \
|
||||
CMP key_len, $12, CR2; \
|
||||
CMP key_len, $10; \
|
||||
BEQ keysLoaded; \
|
||||
LXVD2X (blk_key)(R18), VS11; \
|
||||
LXVD2X (blk_key)(R19), VS12; \
|
||||
BEQ CR2, keysLoaded; \
|
||||
ADD $64, R16; \
|
||||
ADD $64, R17; \
|
||||
LXVD2X (blk_key)(R16), VS13; \
|
||||
LXVD2X (blk_key)(R17), VS14; \
|
||||
CMP key_len, $14; \
|
||||
BEQ keysLoaded; \
|
||||
MOVD R0,0(R0); \
|
||||
keysLoaded:
|
||||
|
||||
// Encrypt 1 (vin) with first 9
|
||||
// keys from VS1 - VS9.
|
||||
#define VCIPHER_1X9_KEYS(vin) \
|
||||
XXLOR VS1, VS1, V23; \
|
||||
XXLOR VS2, VS2, V24; \
|
||||
XXLOR VS3, VS3, V25; \
|
||||
XXLOR VS4, VS4, V26; \
|
||||
XXLOR VS5, VS5, V27; \
|
||||
VCIPHER vin, V23, vin; \
|
||||
VCIPHER vin, V24, vin; \
|
||||
VCIPHER vin, V25, vin; \
|
||||
VCIPHER vin, V26, vin; \
|
||||
VCIPHER vin, V27, vin; \
|
||||
XXLOR VS6, VS6, V23; \
|
||||
XXLOR VS7, VS7, V24; \
|
||||
XXLOR VS8, VS8, V25; \
|
||||
XXLOR VS9, VS9, V26; \
|
||||
VCIPHER vin, V23, vin; \
|
||||
VCIPHER vin, V24, vin; \
|
||||
VCIPHER vin, V25, vin; \
|
||||
VCIPHER vin, V26, vin
|
||||
|
||||
// Encrypt 1 value (vin) with
|
||||
// 2 specified keys
|
||||
#define VCIPHER_1X2_KEYS(vin, key1, key2) \
|
||||
XXLOR key1, key1, V25; \
|
||||
XXLOR key2, key2, V26; \
|
||||
VCIPHER vin, V25, vin; \
|
||||
VCIPHER vin, V26, vin
|
||||
|
||||
// Encrypt 4 values in V15 - V18
|
||||
// with the specified key from
|
||||
// VS1 - VS9.
|
||||
#define VCIPHER_4X1_KEY(key) \
|
||||
XXLOR key, key, V23; \
|
||||
VCIPHER V15, V23, V15; \
|
||||
VCIPHER V16, V23, V16; \
|
||||
VCIPHER V17, V23, V17; \
|
||||
VCIPHER V18, V23, V18
|
||||
|
||||
// Encrypt 8 values in V15 - V22
|
||||
// with the specified key,
|
||||
// assuming it is a VSreg
|
||||
#define VCIPHER_8X1_KEY(key) \
|
||||
XXLOR key, key, V23; \
|
||||
VCIPHER V15, V23, V15; \
|
||||
VCIPHER V16, V23, V16; \
|
||||
VCIPHER V17, V23, V17; \
|
||||
VCIPHER V18, V23, V18; \
|
||||
VCIPHER V19, V23, V19; \
|
||||
VCIPHER V20, V23, V20; \
|
||||
VCIPHER V21, V23, V21; \
|
||||
VCIPHER V22, V23, V22
|
||||
|
||||
// Load input block into V1-V4
|
||||
// in big endian order and
|
||||
// update blk_inp by 64.
|
||||
#define LOAD_INPUT_BLOCK64(blk_inp) \
|
||||
MOVD $16, R16; \
|
||||
MOVD $32, R17; \
|
||||
MOVD $48, R18; \
|
||||
P8_LXVB16X(blk_inp,R0,V1); \
|
||||
P8_LXVB16X(blk_inp,R16,V2); \
|
||||
P8_LXVB16X(blk_inp,R17,V3); \
|
||||
P8_LXVB16X(blk_inp,R18,V4); \
|
||||
ADD $64, blk_inp
|
||||
|
||||
// Load input block into V1-V8
|
||||
// in big endian order and
|
||||
// Update blk_inp by 128
|
||||
#define LOAD_INPUT_BLOCK128(blk_inp) \
|
||||
MOVD $16, R16; \
|
||||
MOVD $32, R17; \
|
||||
MOVD $48, R18; \
|
||||
MOVD $64, R19; \
|
||||
MOVD $80, R20; \
|
||||
MOVD $96, R21; \
|
||||
MOVD $112, R22; \
|
||||
P8_LXVB16X(blk_inp,R0,V1); \
|
||||
P8_LXVB16X(blk_inp,R16,V2); \
|
||||
P8_LXVB16X(blk_inp,R17,V3); \
|
||||
P8_LXVB16X(blk_inp,R18,V4); \
|
||||
P8_LXVB16X(blk_inp,R19,V5); \
|
||||
P8_LXVB16X(blk_inp,R20,V6); \
|
||||
P8_LXVB16X(blk_inp,R21,V7); \
|
||||
P8_LXVB16X(blk_inp,R22,V8); \
|
||||
ADD $128, blk_inp
|
||||
|
||||
// Finish encryption on 8 streams and
|
||||
// XOR with input block
|
||||
#define VCIPHERLAST8_XOR_INPUT \
|
||||
VCIPHERLAST V15, V23, V15; \
|
||||
VCIPHERLAST V16, V23, V16; \
|
||||
VCIPHERLAST V17, V23, V17; \
|
||||
VCIPHERLAST V18, V23, V18; \
|
||||
VCIPHERLAST V19, V23, V19; \
|
||||
VCIPHERLAST V20, V23, V20; \
|
||||
VCIPHERLAST V21, V23, V21; \
|
||||
VCIPHERLAST V22, V23, V22; \
|
||||
XXLXOR V1, V15, V1; \
|
||||
XXLXOR V2, V16, V2; \
|
||||
XXLXOR V3, V17, V3; \
|
||||
XXLXOR V4, V18, V4; \
|
||||
XXLXOR V5, V19, V5; \
|
||||
XXLXOR V6, V20, V6; \
|
||||
XXLXOR V7, V21, V7; \
|
||||
XXLXOR V8, V22, V8
|
||||
|
||||
// Finish encryption on 4 streams and
|
||||
// XOR with input block
|
||||
#define VCIPHERLAST4_XOR_INPUT \
|
||||
VCIPHERLAST V15, V23, V15; \
|
||||
VCIPHERLAST V16, V23, V16; \
|
||||
VCIPHERLAST V17, V23, V17; \
|
||||
VCIPHERLAST V18, V23, V18; \
|
||||
XXLXOR V1, V15, V1; \
|
||||
XXLXOR V2, V16, V2; \
|
||||
XXLXOR V3, V17, V3; \
|
||||
XXLXOR V4, V18, V4
|
||||
|
||||
// Store output block from V1-V8
|
||||
// in big endian order and
|
||||
// Update blk_out by 128
|
||||
#define STORE_OUTPUT_BLOCK128(blk_out) \
|
||||
P8_STXVB16X(V1,blk_out,R0); \
|
||||
P8_STXVB16X(V2,blk_out,R16); \
|
||||
P8_STXVB16X(V3,blk_out,R17); \
|
||||
P8_STXVB16X(V4,blk_out,R18); \
|
||||
P8_STXVB16X(V5,blk_out,R19); \
|
||||
P8_STXVB16X(V6,blk_out,R20); \
|
||||
P8_STXVB16X(V7,blk_out,R21); \
|
||||
P8_STXVB16X(V8,blk_out,R22); \
|
||||
ADD $128, blk_out
|
||||
|
||||
// Store output block from V1-V4
|
||||
// in big endian order and
|
||||
// Update blk_out by 64
|
||||
#define STORE_OUTPUT_BLOCK64(blk_out) \
|
||||
P8_STXVB16X(V1,blk_out,R0); \
|
||||
P8_STXVB16X(V2,blk_out,R16); \
|
||||
P8_STXVB16X(V3,blk_out,R17); \
|
||||
P8_STXVB16X(V4,blk_out,R18); \
|
||||
ADD $64, blk_out
|
||||
|
||||
// func gcmInit(productTable *[256]byte, h []byte)
|
||||
TEXT ·gcmInit(SB), NOSPLIT, $0-32
|
||||
MOVD productTable+0(FP), XIP
|
||||
@ -588,3 +878,226 @@ TEXT ·gcmMul(SB), NOSPLIT, $0-32
|
||||
#endif
|
||||
STXVD2X VXL, (XIP+R0) // write out Xi
|
||||
RET
|
||||
|
||||
#define BLK_INP R3
|
||||
#define BLK_OUT R4
|
||||
#define BLK_KEY R5
|
||||
#define KEY_LEN R6
|
||||
#define BLK_IDX R7
|
||||
#define IDX R8
|
||||
#define IN_LEN R9
|
||||
#define COUNTER R10
|
||||
#define CONPTR R14
|
||||
#define MASK V5
|
||||
|
||||
// Implementation of the counterCrypt function in assembler.
|
||||
// Original loop is unrolled to allow for multiple encryption
|
||||
// streams to be done in parallel, which is achieved by interleaving
|
||||
// vcipher instructions from each stream. This is also referred to as
|
||||
// stitching, and provides significant performance improvements.
|
||||
// Some macros are defined which enable execution for big or little
|
||||
// endian as well as different ISA targets.
|
||||
//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
|
||||
//func counterCryptASM(xr, out, in, counter, key)
|
||||
TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
|
||||
MOVD xr(FP), KEY_LEN
|
||||
MOVD out+8(FP), BLK_OUT
|
||||
MOVD out_len+16(FP), R8
|
||||
MOVD in+32(FP), BLK_INP
|
||||
MOVD in_len+40(FP), IN_LEN
|
||||
MOVD counter+56(FP), COUNTER
|
||||
MOVD key+64(FP), BLK_KEY
|
||||
|
||||
// Set up permute string when needed.
|
||||
#ifdef NEEDS_ESPERM
|
||||
MOVD $·rcon(SB), R14
|
||||
LVX (R14), ESPERM // Permute value for P8_ macros.
|
||||
#endif
|
||||
SETUP_COUNTER // V30 Counter V31 BE {0, 0, 0, 1}
|
||||
LOAD_KEYS(BLK_KEY, KEY_LEN) // VS1 - VS10/12/14 based on keysize
|
||||
CMP IN_LEN, $128
|
||||
BLT block64
|
||||
block128_loop:
|
||||
// Do 8 encryptions in parallel by setting
|
||||
// input values in V15-V22 and executing
|
||||
// vcipher on the updated value and the keys.
|
||||
GEN_VCIPHER_8_INPUTS
|
||||
VCIPHER_8X1_KEY(VS1)
|
||||
VCIPHER_8X1_KEY(VS2)
|
||||
VCIPHER_8X1_KEY(VS3)
|
||||
VCIPHER_8X1_KEY(VS4)
|
||||
VCIPHER_8X1_KEY(VS5)
|
||||
VCIPHER_8X1_KEY(VS6)
|
||||
VCIPHER_8X1_KEY(VS7)
|
||||
VCIPHER_8X1_KEY(VS8)
|
||||
VCIPHER_8X1_KEY(VS9)
|
||||
// Additional encryptions are done based on
|
||||
// the key length, with the last key moved
|
||||
// to V23 for use with VCIPHERLAST.
|
||||
// CR2 = CMP key_len, $12
|
||||
XXLOR VS10, VS10, V23
|
||||
BLT CR2, block128_last // key_len = 10
|
||||
VCIPHER_8X1_KEY(VS10)
|
||||
VCIPHER_8X1_KEY(VS11)
|
||||
XXLOR VS12,VS12,V23
|
||||
BEQ CR2, block128_last // ken_len = 12
|
||||
VCIPHER_8X1_KEY(VS12)
|
||||
VCIPHER_8X1_KEY(VS13)
|
||||
XXLOR VS14,VS14,V23 // key_len = 14
|
||||
block128_last:
|
||||
// vcipher encryptions are in V15-V22 at this
|
||||
// point with vcipherlast remaining to be done.
|
||||
// Load input block into V1-V8, setting index offsets
|
||||
// in R16-R22 to use with the STORE.
|
||||
LOAD_INPUT_BLOCK128(BLK_INP)
|
||||
// Do VCIPHERLAST on the last key for each encryption
|
||||
// stream and XOR the result with the corresponding
|
||||
// value from the input block.
|
||||
VCIPHERLAST8_XOR_INPUT
|
||||
// Store the results (8*16) and update BLK_OUT by 128.
|
||||
STORE_OUTPUT_BLOCK128(BLK_OUT)
|
||||
ADD $-128, IN_LEN // input size
|
||||
CMP IN_LEN, $128 // check if >= blocksize
|
||||
BGE block128_loop // next input block
|
||||
CMP IN_LEN, $0
|
||||
BEQ done
|
||||
block64:
|
||||
CMP IN_LEN, $64 // Check if >= 64
|
||||
BLT block16_loop
|
||||
// Do 4 encryptions in parallel by setting
|
||||
// input values in V15-V18 and executing
|
||||
// vcipher on the updated value and the keys.
|
||||
GEN_VCIPHER_4_INPUTS
|
||||
VCIPHER_4X1_KEY(VS1)
|
||||
VCIPHER_4X1_KEY(VS2)
|
||||
VCIPHER_4X1_KEY(VS3)
|
||||
VCIPHER_4X1_KEY(VS4)
|
||||
VCIPHER_4X1_KEY(VS5)
|
||||
VCIPHER_4X1_KEY(VS6)
|
||||
VCIPHER_4X1_KEY(VS7)
|
||||
VCIPHER_4X1_KEY(VS8)
|
||||
VCIPHER_4X1_KEY(VS9)
|
||||
// Check key length based on CR2
|
||||
// Move last key to V23 for use with later vcipherlast
|
||||
XXLOR VS10, VS10, V23
|
||||
BLT CR2, block64_last // size = 10
|
||||
VCIPHER_4X1_KEY(VS10) // Encrypt next 2 keys
|
||||
VCIPHER_4X1_KEY(VS11)
|
||||
XXLOR VS12, VS12, V23
|
||||
BEQ CR2, block64_last // size = 12
|
||||
VCIPHER_4X1_KEY(VS12) // Encrypt last 2 keys
|
||||
VCIPHER_4X1_KEY(VS13)
|
||||
XXLOR VS14, VS14, V23 // size = 14
|
||||
block64_last:
|
||||
LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
|
||||
// Do VCIPHERLAST on the last for each encryption
|
||||
// stream and XOR the result with the corresponding
|
||||
// value from the input block.
|
||||
VCIPHERLAST4_XOR_INPUT
|
||||
// Store the results (4*16) and update BLK_OUT by 64.
|
||||
STORE_OUTPUT_BLOCK64(BLK_OUT)
|
||||
ADD $-64, IN_LEN // decrement input block length
|
||||
CMP IN_LEN, $0 // check for remaining length
|
||||
BEQ done
|
||||
block16_loop:
|
||||
CMP IN_LEN, $16 // More input
|
||||
BLT final_block // If not, then handle partial block
|
||||
// Single encryption, no stitching
|
||||
GEN_VCIPHER_INPUT // Generate input value for single encryption
|
||||
VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys
|
||||
XXLOR VS10, VS10, V23 // Last key -> V23 for later vcipiherlast
|
||||
// Key length based on CR2. (LT=10, EQ=12, GT=14)
|
||||
BLT CR2, block16_last // Finish for key size 10
|
||||
VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
|
||||
XXLOR VS12, VS12, V23 // Last key -> V23 for later vcipherlast
|
||||
BEQ CR2, block16_last // Finish for key size 12
|
||||
VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
|
||||
XXLOR VS14, VS14, V23 // Last key -> V23 for vcipherlast with key size 14
|
||||
block16_last:
|
||||
P8_LXVB16X(BLK_INP, R0, V1) // Load input
|
||||
VCIPHERLAST V15, V23, V15 // Encrypt last value in V23
|
||||
XXLXOR V15, V1, V1 // XOR with input
|
||||
P8_STXVB16X(V1,R0,BLK_OUT) // Store final encryption value to output
|
||||
ADD $16, BLK_INP // Increment input pointer
|
||||
ADD $16, BLK_OUT // Increment output pointer
|
||||
ADD $-16, IN_LEN // Decrement input length
|
||||
BR block16_loop // Check for next
|
||||
final_block:
|
||||
CMP IN_LEN, $0
|
||||
BEQ done
|
||||
GEN_VCIPHER_INPUT // Generate input value for partial encryption
|
||||
VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys
|
||||
XXLOR VS10, VS10, V23 // Save possible last key
|
||||
BLT CR2, final_block_last
|
||||
VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with next 2 keys
|
||||
XXLOR VS12, VS12, V23 // Save possible last key
|
||||
BEQ CR2, final_block_last
|
||||
VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
|
||||
XXLOR VS14, VS14, V23 // Save last key
|
||||
final_block_last:
|
||||
VCIPHERLAST V15, V23, V15 // Finish encryption
|
||||
#ifdef GOPPC64_power10
|
||||
// set up length
|
||||
SLD $56, IN_LEN, R17
|
||||
LXVLL BLK_INP, R17, V25
|
||||
VXOR V25, V15, V25
|
||||
STXVLL V25, BLK_OUT, R17
|
||||
#else
|
||||
ADD $32, R1, MASK_PTR
|
||||
MOVD $0, R16
|
||||
P8_STXVB16X(V15, MASK_PTR, R0)
|
||||
CMP IN_LEN, $8
|
||||
BLT next4
|
||||
MOVD 0(MASK_PTR), R14
|
||||
MOVD 0(BLK_INP), R15
|
||||
XOR R14, R15, R14
|
||||
MOVD R14, 0(BLK_OUT)
|
||||
ADD $8, R16
|
||||
ADD $-8, IN_LEN
|
||||
next4:
|
||||
CMP IN_LEN, $4
|
||||
BLT next2
|
||||
MOVWZ (BLK_INP)(R16), R15
|
||||
MOVWZ (MASK_PTR)(R16), R14
|
||||
XOR R14, R15, R14
|
||||
MOVW R14, (R16)(BLK_OUT)
|
||||
ADD $4, R16
|
||||
ADD $-4, IN_LEN
|
||||
next2:
|
||||
CMP IN_LEN, $2
|
||||
BLT next1
|
||||
MOVHZ (BLK_INP)(R16), R15
|
||||
MOVHZ (MASK_PTR)(R16), R14
|
||||
XOR R14, R15, R14
|
||||
MOVH R14, (R16)(BLK_OUT)
|
||||
ADD $2, R16
|
||||
ADD $-2, IN_LEN
|
||||
next1:
|
||||
CMP IN_LEN, $1
|
||||
BLT done
|
||||
MOVBZ (MASK_PTR)(R16), R14
|
||||
MOVBZ (BLK_INP)(R16), R15
|
||||
XOR R14, R15, R14
|
||||
MOVB R14, (R16)(BLK_OUT)
|
||||
#endif
|
||||
done:
|
||||
// Save the updated counter value
|
||||
P8_STXVB16X(V30, COUNTER, R0)
|
||||
// Clear the keys
|
||||
XXLXOR VS0, VS0, VS0
|
||||
XXLXOR VS1, VS1, VS1
|
||||
XXLXOR VS2, VS2, VS2
|
||||
XXLXOR VS3, VS3, VS3
|
||||
XXLXOR VS4, VS4, VS4
|
||||
XXLXOR VS5, VS5, VS5
|
||||
XXLXOR VS6, VS6, VS6
|
||||
XXLXOR VS7, VS7, VS7
|
||||
XXLXOR VS8, VS8, VS8
|
||||
XXLXOR VS9, VS9, VS9
|
||||
XXLXOR VS10, VS10, VS10
|
||||
XXLXOR VS11, VS11, VS11
|
||||
XXLXOR VS12, VS12, VS12
|
||||
XXLXOR VS13, VS13, VS13
|
||||
XXLXOR VS14, VS14, VS14
|
||||
RET
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user