crypto/aes,crypto/cipher: improve gcm performance on ppc64x

This improves performance for AES-GCM. The function counterCrypt is written in assembler so the loop can be unrolled and the stitched approach used for the encryption. This implementation works on ppc64le and ppc64. The use of GOPPC64=power9 generates the best performance, goos: linux goarch: ppc64le pkg: crypto/cipher cpu: POWER10 │ gcmx8.cpu1.out │ gcmx8.new.cpu1.out │ │ sec/op │ sec/op vs base │ AESGCM/Open-128-64 180.5n ± 0% 152.7n ± 1% -15.43% (p=0.000 n=8) AESGCM/Seal-128-64 166.8n ± 0% 147.2n ± 0% -11.72% (p=0.000 n=8) AESGCM/Open-256-64 194.9n ± 0% 156.6n ± 1% -19.65% (p=0.000 n=8) AESGCM/Seal-256-64 183.7n ± 0% 157.0n ± 0% -14.51% (p=0.000 n=8) AESGCM/Open-128-1350 1769.5n ± 0% 454.5n ± 0% -74.31% (p=0.000 n=8) AESGCM/Seal-128-1350 1759.0n ± 0% 453.7n ± 0% -74.21% (p=0.000 n=8) AESGCM/Open-256-1350 2104.0n ± 0% 504.4n ± 1% -76.03% (p=0.000 n=8) AESGCM/Seal-256-1350 2092.0n ± 0% 503.0n ± 0% -75.96% (p=0.000 n=8) AESGCM/Open-128-8192 10.054µ ± 0% 1.961µ ± 0% -80.50% (p=0.000 n=8) AESGCM/Seal-128-8192 10.050µ ± 0% 1.965µ ± 0% -80.45% (p=0.000 n=8) AESGCM/Open-256-8192 12.080µ ± 0% 2.232µ ± 0% -81.52% (p=0.000 n=8) AESGCM/Seal-256-8192 12.069µ ± 0% 2.238µ ± 0% -81.46% (p=0.000 n=8) geomean 1.566µ 535.5n -65.80% │ gcmx8.cpu1.out │ gcmx8.new.cpu1.out │ │ B/s │ B/s vs base │ AESGCM/Open-128-64 338.1Mi ± 0% 399.8Mi ± 1% +18.27% (p=0.000 n=8) AESGCM/Seal-128-64 366.1Mi ± 0% 414.6Mi ± 0% +13.28% (p=0.000 n=8) AESGCM/Open-256-64 313.1Mi ± 0% 389.7Mi ± 0% +24.47% (p=0.000 n=8) AESGCM/Seal-256-64 332.3Mi ± 0% 388.5Mi ± 0% +16.93% (p=0.000 n=8) AESGCM/Open-128-1350 727.6Mi ± 0% 2832.8Mi ± 0% +289.33% (p=0.000 n=8) AESGCM/Seal-128-1350 732.0Mi ± 0% 2837.8Mi ± 0% +287.70% (p=0.000 n=8) AESGCM/Open-256-1350 611.9Mi ± 0% 2552.6Mi ± 0% +317.18% (p=0.000 n=8) AESGCM/Seal-256-1350 615.3Mi ± 0% 2559.6Mi ± 0% +315.97% (p=0.000 n=8) AESGCM/Open-128-8192 777.1Mi ± 0% 3983.5Mi ± 0% +412.63% (p=0.000 n=8) AESGCM/Seal-128-8192 777.3Mi ± 0% 3975.9Mi ± 0% +411.47% (p=0.000 n=8) AESGCM/Open-256-8192 646.7Mi ± 0% 3500.6Mi ± 0% +441.27% (p=0.000 n=8) AESGCM/Seal-256-8192 647.3Mi ± 0% 3491.1Mi ± 0% +439.30% (p=0.000 n=8) geomean 542.7Mi 1.550Gi +192.42% Change-Id: I3600831a263ec8a99b5e3bdd495eb36e966d8075 Reviewed-on: https://go-review.googlesource.com/c/go/+/484575 Reviewed-by: Roland Shoemaker <roland@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Paul Murphy <murp@ibm.com> Reviewed-by: Than McIntosh <thanm@google.com>
2024-11-23 14:50:07 -07:00 · 2023-04-11 14:49:29 -05:00 · 2023-04-11 14:49:29 -05:00 · a8ca649bbe
commit a8ca649bbe
parent e0948d825d
2 changed files with 521 additions and 30 deletions
--- a/src/crypto/aes/gcm_ppc64x.go
+++ b/src/crypto/aes/gcm_ppc64x.go
@ -51,6 +51,8 @@ type gcmAsm struct {
 	tagSize int
 }

+func counterCryptASM(nr int, out, in []byte, counter *[gcmBlockSize]byte, key *uint32)
+
 // NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
 // called by crypto/cipher.NewGCM via the gcmAble interface.
 func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
@ -114,34 +116,10 @@ func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
 // into out. counter is the initial count value and will be updated with the next
 // count value. The length of out must be greater than or equal to the length
 // of in.
+// counterCryptASM implements counterCrypt which then allows the loop to
+// be unrolled and optimized.
 func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
-	var mask [gcmBlockSize]byte
-
-	for len(in) >= gcmBlockSize {
-		// Hint to avoid bounds check
-		_, _ = in[15], out[15]
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-
-		// XOR 16 bytes each loop iteration in 8 byte chunks
-		in0 := binary.LittleEndian.Uint64(in[0:])
-		in1 := binary.LittleEndian.Uint64(in[8:])
-		m0 := binary.LittleEndian.Uint64(mask[:8])
-		m1 := binary.LittleEndian.Uint64(mask[8:])
-		binary.LittleEndian.PutUint64(out[:8], in0^m0)
-		binary.LittleEndian.PutUint64(out[8:], in1^m1)
-		out = out[16:]
-		in = in[16:]
-	}
-
-	if len(in) > 0 {
-		g.cipher.Encrypt(mask[:], counter[:])
-		gcmInc32(counter)
-		// XOR leftover bytes
-		for i, inb := range in {
-			out[i] = inb ^ mask[i]
-		}
-	}
+	counterCryptASM(len(g.cipher.enc)/4-1, out, in, counter, &g.cipher.enc[0])
 }

 // increments the rightmost 32-bits of the count value by 1.
--- a/src/crypto/aes/gcm_ppc64x.s
+++ b/src/crypto/aes/gcm_ppc64x.s
@ -4,7 +4,7 @@

 //go:build ppc64 || ppc64le

-// Based on CRYPTOGAMS code with the following comment:
+// Portions based on CRYPTOGAMS code with the following comment:
 // # ====================================================================
 // # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 // # project. The module is, however, dual licensed under OpenSSL and
@ -12,13 +12,17 @@
 // # details see http://www.openssl.org/~appro/cryptogams/.
 // # ====================================================================

-// This implementation is based on the ppc64 asm generated by the
-// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
+// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
 // from commit d47afb3c.

 // Changes were made due to differences in the ABI and some register usage.
 // Some arguments were changed due to the way the Go code passes them.

+// Portions that use the stitched AES-GCM approach in counterCryptASM
+// are based on code found in
+// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
+
 #include "textflag.h"

 #define XIP    R3
@ -87,6 +91,292 @@

 #define VIN0   VIN

+#define ESPERM V10
+#define TMP2 V11
+
+// The following macros provide appropriate
+// implementations for endianness as well as
+// ISA specific for power8 and power9.
+#ifdef GOARCH_ppc64le
+#  ifdef GOPPC64_power9
+#define P8_LXVB16X(RA,RB,VT)   LXVB16X (RA)(RB), VT
+#define P8_STXVB16X(VS,RA,RB)  STXVB16X VS, (RA)(RB)
+#  else
+#define NEEDS_ESPERM
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X  (RA+RB), VT \
+	VPERM	VT, VT, ESPERM, VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	VPERM	VS, VS, ESPERM, TMP2; \
+	STXVD2X TMP2, (RA+RB)
+
+#  endif
+#else
+#define P8_LXVB16X(RA,RB,VT) \
+	LXVD2X  (RA+RB), VT
+
+#define P8_STXVB16X(VS,RA,RB) \
+	STXVD2X VS, (RA+RB)
+
+#endif
+
+#define MASK_PTR   R8
+
+#define MASKV   V0
+#define INV     V1
+
+// The following macros are used for
+// the stitched implementation within
+// counterCryptASM.
+
+// Load the initial GCM counter value
+// in V30 and set up the counter increment
+// in V31
+#define SETUP_COUNTER \
+	P8_LXVB16X(COUNTER, R0, V30); \
+	VSPLTISB $1, V28; \
+	VXOR V31, V31, V31; \
+	VSLDOI $1, V31, V28, V31
+
+// These macros set up the initial value
+// for a single encryption, or 4 or 8
+// stitched encryptions implemented
+// with interleaving vciphers.
+//
+// The input value for each encryption
+// is generated by XORing the counter
+// from V30 with the first key in VS0
+// and incrementing the counter.
+//
+// Single encryption in V15
+#define GEN_VCIPHER_INPUT \
+	XXLOR VS0, VS0, V29 \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30
+
+// 4 encryptions in V15 - V18
+#define GEN_VCIPHER_4_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30
+
+// 8 encryptions in V15 - V22
+#define GEN_VCIPHER_8_INPUTS \
+	XXLOR VS0, VS0, V29; \
+	VXOR V30, V29, V15; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V16; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V17; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V18; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V19; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V20; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V21; \
+	VADDUWM V30, V31, V30; \
+	VXOR V30, V29, V22; \
+	VADDUWM V30, V31, V30
+
+// Load the keys to be used for
+// encryption based on key_len.
+// Keys are in VS0 - VS14
+// depending on key_len.
+// Valid keys sizes are verified
+// here. CR2 is set and used
+// throughout to check key_len.
+#define LOAD_KEYS(blk_key, key_len) \
+	MOVD	$16, R16; \
+	MOVD	$32, R17; \
+	MOVD	$48, R18; \
+	MOVD	$64, R19; \
+	LXVD2X (blk_key)(R0), VS0; \
+	LXVD2X (blk_key)(R16), VS1; \
+	LXVD2X (blk_key)(R17), VS2; \
+	LXVD2X (blk_key)(R18), VS3; \
+	LXVD2X (blk_key)(R19), VS4; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS5; \
+	LXVD2X (blk_key)(R17), VS6; \
+	LXVD2X (blk_key)(R18), VS7; \
+	LXVD2X (blk_key)(R19), VS8; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	ADD $64, R18; \
+	ADD $64, R19; \
+	LXVD2X (blk_key)(R16), VS9; \
+	LXVD2X (blk_key)(R17), VS10; \
+	CMP key_len, $12, CR2; \
+	CMP key_len, $10; \
+	BEQ keysLoaded; \
+	LXVD2X (blk_key)(R18), VS11; \
+	LXVD2X (blk_key)(R19), VS12; \
+	BEQ CR2, keysLoaded; \
+	ADD $64, R16; \
+	ADD $64, R17; \
+	LXVD2X (blk_key)(R16), VS13; \
+	LXVD2X (blk_key)(R17), VS14; \
+	CMP key_len, $14; \
+	BEQ keysLoaded; \
+	MOVD R0,0(R0); \
+keysLoaded:
+
+// Encrypt 1 (vin) with first 9
+// keys from VS1 - VS9.
+#define VCIPHER_1X9_KEYS(vin) \
+	XXLOR VS1, VS1, V23; \
+	XXLOR VS2, VS2, V24; \
+	XXLOR VS3, VS3, V25; \
+	XXLOR VS4, VS4, V26; \
+	XXLOR VS5, VS5, V27; \
+	VCIPHER vin, V23, vin; \
+	VCIPHER vin, V24, vin; \
+	VCIPHER vin, V25, vin; \
+	VCIPHER vin, V26, vin; \
+	VCIPHER vin, V27, vin; \
+	XXLOR VS6, VS6, V23; \
+	XXLOR VS7, VS7, V24; \
+	XXLOR VS8, VS8, V25; \
+	XXLOR VS9, VS9, V26; \
+	VCIPHER vin, V23, vin; \
+	VCIPHER vin, V24, vin; \
+	VCIPHER vin, V25, vin; \
+	VCIPHER	vin, V26, vin
+
+// Encrypt 1 value (vin) with
+// 2 specified keys
+#define VCIPHER_1X2_KEYS(vin, key1, key2) \
+	XXLOR key1, key1, V25; \
+	XXLOR key2, key2, V26; \
+	VCIPHER vin, V25, vin; \
+	VCIPHER vin, V26, vin
+
+// Encrypt 4 values in V15 - V18
+// with the specified key from
+// VS1 - VS9.
+#define VCIPHER_4X1_KEY(key) \
+	XXLOR key, key, V23; \
+	VCIPHER V15, V23, V15; \
+	VCIPHER V16, V23, V16; \
+	VCIPHER V17, V23, V17; \
+	VCIPHER V18, V23, V18
+
+// Encrypt 8 values in V15 - V22
+// with the specified key,
+// assuming it is a VSreg
+#define VCIPHER_8X1_KEY(key) \
+	XXLOR key, key, V23; \
+	VCIPHER V15, V23, V15; \
+	VCIPHER V16, V23, V16; \
+	VCIPHER V17, V23, V17; \
+	VCIPHER V18, V23, V18; \
+	VCIPHER V19, V23, V19; \
+	VCIPHER V20, V23, V20; \
+	VCIPHER V21, V23, V21; \
+	VCIPHER V22, V23, V22
+
+// Load input block into V1-V4
+// in big endian order and
+// update blk_inp by 64.
+#define LOAD_INPUT_BLOCK64(blk_inp) \
+	MOVD $16, R16; \
+	MOVD $32, R17; \
+	MOVD $48, R18; \
+	P8_LXVB16X(blk_inp,R0,V1); \
+	P8_LXVB16X(blk_inp,R16,V2); \
+	P8_LXVB16X(blk_inp,R17,V3); \
+	P8_LXVB16X(blk_inp,R18,V4); \
+	ADD $64, blk_inp
+
+// Load input block into V1-V8
+// in big endian order and
+// Update blk_inp by 128
+#define LOAD_INPUT_BLOCK128(blk_inp) \
+	MOVD $16, R16; \
+	MOVD $32, R17; \
+	MOVD $48, R18; \
+	MOVD $64, R19; \
+	MOVD $80, R20; \
+	MOVD $96, R21; \
+	MOVD $112, R22; \
+	P8_LXVB16X(blk_inp,R0,V1); \
+	P8_LXVB16X(blk_inp,R16,V2); \
+	P8_LXVB16X(blk_inp,R17,V3); \
+	P8_LXVB16X(blk_inp,R18,V4); \
+	P8_LXVB16X(blk_inp,R19,V5); \
+	P8_LXVB16X(blk_inp,R20,V6); \
+	P8_LXVB16X(blk_inp,R21,V7); \
+	P8_LXVB16X(blk_inp,R22,V8); \
+	ADD $128, blk_inp
+
+// Finish encryption on 8 streams and
+// XOR with input block
+#define VCIPHERLAST8_XOR_INPUT \
+	VCIPHERLAST     V15, V23, V15; \
+	VCIPHERLAST     V16, V23, V16; \
+	VCIPHERLAST     V17, V23, V17; \
+	VCIPHERLAST     V18, V23, V18; \
+	VCIPHERLAST     V19, V23, V19; \
+	VCIPHERLAST     V20, V23, V20; \
+	VCIPHERLAST     V21, V23, V21; \
+	VCIPHERLAST     V22, V23, V22; \
+	XXLXOR          V1, V15, V1; \
+	XXLXOR          V2, V16, V2; \
+	XXLXOR          V3, V17, V3; \
+	XXLXOR          V4, V18, V4; \
+	XXLXOR          V5, V19, V5; \
+	XXLXOR          V6, V20, V6; \
+	XXLXOR          V7, V21, V7; \
+	XXLXOR          V8, V22, V8
+
+// Finish encryption on 4 streams and
+// XOR with input block
+#define VCIPHERLAST4_XOR_INPUT \
+	VCIPHERLAST     V15, V23, V15; \
+	VCIPHERLAST     V16, V23, V16; \
+	VCIPHERLAST     V17, V23, V17; \
+	VCIPHERLAST     V18, V23, V18; \
+	XXLXOR          V1, V15, V1; \
+	XXLXOR          V2, V16, V2; \
+	XXLXOR          V3, V17, V3; \
+	XXLXOR          V4, V18, V4
+
+// Store output block from V1-V8
+// in big endian order and
+// Update blk_out by 128
+#define STORE_OUTPUT_BLOCK128(blk_out) \
+	P8_STXVB16X(V1,blk_out,R0); \
+	P8_STXVB16X(V2,blk_out,R16); \
+	P8_STXVB16X(V3,blk_out,R17); \
+	P8_STXVB16X(V4,blk_out,R18); \
+	P8_STXVB16X(V5,blk_out,R19); \
+	P8_STXVB16X(V6,blk_out,R20); \
+	P8_STXVB16X(V7,blk_out,R21); \
+	P8_STXVB16X(V8,blk_out,R22); \
+	ADD $128, blk_out
+
+// Store output block from V1-V4
+// in big endian order and
+// Update blk_out by 64
+#define STORE_OUTPUT_BLOCK64(blk_out) \
+	P8_STXVB16X(V1,blk_out,R0); \
+	P8_STXVB16X(V2,blk_out,R16); \
+	P8_STXVB16X(V3,blk_out,R17); \
+	P8_STXVB16X(V4,blk_out,R18); \
+	ADD $64, blk_out
+
 // func gcmInit(productTable *[256]byte, h []byte)
 TEXT ·gcmInit(SB), NOSPLIT, $0-32
 	MOVD productTable+0(FP), XIP
@ -588,3 +878,226 @@ TEXT ·gcmMul(SB), NOSPLIT, $0-32
 #endif
 	STXVD2X VXL, (XIP+R0)      // write out Xi
 	RET
+
+#define BLK_INP    R3
+#define BLK_OUT    R4
+#define BLK_KEY    R5
+#define KEY_LEN    R6
+#define BLK_IDX    R7
+#define IDX        R8
+#define IN_LEN     R9
+#define COUNTER    R10
+#define CONPTR     R14
+#define MASK       V5
+
+// Implementation of the counterCrypt function in assembler.
+// Original loop is unrolled to allow for multiple encryption
+// streams to be done in parallel, which is achieved by interleaving
+// vcipher instructions from each stream. This is also referred to as
+// stitching, and provides significant performance improvements.
+// Some macros are defined which enable execution for big or little
+// endian as well as different ISA targets.
+//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
+//func counterCryptASM(xr, out, in, counter, key)
+TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
+	MOVD	xr(FP), KEY_LEN
+	MOVD    out+8(FP), BLK_OUT
+	MOVD    out_len+16(FP), R8
+	MOVD    in+32(FP), BLK_INP
+	MOVD    in_len+40(FP), IN_LEN
+	MOVD    counter+56(FP), COUNTER
+	MOVD    key+64(FP), BLK_KEY
+
+// Set up permute string when needed.
+#ifdef NEEDS_ESPERM
+	MOVD    $·rcon(SB), R14
+	LVX     (R14), ESPERM   // Permute value for P8_ macros.
+#endif
+	SETUP_COUNTER		// V30 Counter V31 BE {0, 0, 0, 1}
+	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS1 - VS10/12/14 based on keysize
+	CMP     IN_LEN, $128
+	BLT	block64
+block128_loop:
+	// Do 8 encryptions in parallel by setting
+	// input values in V15-V22 and executing
+	// vcipher on the updated value and the keys.
+	GEN_VCIPHER_8_INPUTS
+	VCIPHER_8X1_KEY(VS1)
+	VCIPHER_8X1_KEY(VS2)
+	VCIPHER_8X1_KEY(VS3)
+	VCIPHER_8X1_KEY(VS4)
+	VCIPHER_8X1_KEY(VS5)
+	VCIPHER_8X1_KEY(VS6)
+	VCIPHER_8X1_KEY(VS7)
+	VCIPHER_8X1_KEY(VS8)
+	VCIPHER_8X1_KEY(VS9)
+	// Additional encryptions are done based on
+	// the key length, with the last key moved
+	// to V23 for use with VCIPHERLAST.
+	// CR2 = CMP key_len, $12
+	XXLOR VS10, VS10, V23
+	BLT	CR2, block128_last // key_len = 10
+	VCIPHER_8X1_KEY(VS10)
+	VCIPHER_8X1_KEY(VS11)
+	XXLOR VS12,VS12,V23
+	BEQ	CR2, block128_last // ken_len = 12
+	VCIPHER_8X1_KEY(VS12)
+	VCIPHER_8X1_KEY(VS13)
+	XXLOR VS14,VS14,V23	// key_len = 14
+block128_last:
+	// vcipher encryptions are in V15-V22 at this
+	// point with vcipherlast remaining to be done.
+	// Load input block into V1-V8, setting index offsets
+	// in R16-R22 to use with the STORE.
+	LOAD_INPUT_BLOCK128(BLK_INP)
+	// Do VCIPHERLAST on the last key for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST8_XOR_INPUT
+	// Store the results (8*16) and update BLK_OUT by 128.
+	STORE_OUTPUT_BLOCK128(BLK_OUT)
+	ADD	$-128, IN_LEN	// input size
+	CMP     IN_LEN, $128	// check if >= blocksize
+	BGE	block128_loop	// next input block
+	CMP	IN_LEN, $0
+	BEQ	done
+block64:
+	CMP	IN_LEN, $64	// Check if >= 64
+	BLT	block16_loop
+	// Do 4 encryptions in parallel by setting
+	// input values in V15-V18 and executing
+	// vcipher on the updated value and the keys.
+	GEN_VCIPHER_4_INPUTS
+	VCIPHER_4X1_KEY(VS1)
+	VCIPHER_4X1_KEY(VS2)
+	VCIPHER_4X1_KEY(VS3)
+	VCIPHER_4X1_KEY(VS4)
+	VCIPHER_4X1_KEY(VS5)
+	VCIPHER_4X1_KEY(VS6)
+	VCIPHER_4X1_KEY(VS7)
+	VCIPHER_4X1_KEY(VS8)
+	VCIPHER_4X1_KEY(VS9)
+	// Check key length based on CR2
+	// Move last key to V23 for use with later vcipherlast
+	XXLOR	VS10, VS10, V23
+	BLT	CR2, block64_last	// size = 10
+	VCIPHER_4X1_KEY(VS10)		// Encrypt next 2 keys
+	VCIPHER_4X1_KEY(VS11)
+	XXLOR	VS12, VS12, V23
+	BEQ	CR2, block64_last	// size = 12
+	VCIPHER_4X1_KEY(VS12)		// Encrypt last 2 keys
+	VCIPHER_4X1_KEY(VS13)
+	XXLOR	VS14, VS14, V23		// size = 14
+block64_last:
+	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
+	// Do VCIPHERLAST on the last for each encryption
+	// stream and XOR the result with the corresponding
+	// value from the input block.
+	VCIPHERLAST4_XOR_INPUT
+	// Store the results (4*16) and update BLK_OUT by 64.
+	STORE_OUTPUT_BLOCK64(BLK_OUT)
+	ADD	$-64, IN_LEN		// decrement input block length
+	CMP	IN_LEN, $0		// check for remaining length
+	BEQ	done
+block16_loop:
+	CMP	IN_LEN, $16		// More input
+	BLT	final_block		// If not, then handle partial block
+	// Single encryption, no stitching
+	GEN_VCIPHER_INPUT		// Generate input value for single encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
+	XXLOR	VS10, VS10, V23		// Last key -> V23 for later vcipiherlast
+	// Key length based on CR2. (LT=10, EQ=12, GT=14)
+	BLT	CR2, block16_last	// Finish for key size 10
+	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
+	XXLOR	VS12, VS12, V23		// Last key -> V23 for later vcipherlast
+	BEQ	CR2, block16_last	// Finish for key size 12
+	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+	XXLOR	VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
+block16_last:
+	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
+	VCIPHERLAST V15, V23, V15	// Encrypt last value in V23
+	XXLXOR	V15, V1, V1		// XOR with input
+	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
+	ADD	$16, BLK_INP		// Increment input pointer
+	ADD	$16, BLK_OUT		// Increment output pointer
+	ADD	$-16, IN_LEN		// Decrement input length
+	BR	block16_loop		// Check for next
+final_block:
+	CMP	IN_LEN, $0
+	BEQ	done
+	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
+	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
+	XXLOR	VS10, VS10, V23		// Save possible last key
+	BLT	CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
+	XXLOR	VS12, VS12, V23		// Save possible last key
+	BEQ	CR2, final_block_last
+	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
+	XXLOR	VS14, VS14, V23		// Save last key
+final_block_last:
+	VCIPHERLAST V15, V23, V15	// Finish encryption
+#ifdef GOPPC64_power10
+	// set up length
+	SLD	$56, IN_LEN, R17
+	LXVLL	BLK_INP, R17, V25
+	VXOR	V25, V15, V25
+	STXVLL	V25, BLK_OUT, R17
+#else
+	ADD	$32, R1, MASK_PTR
+	MOVD	$0, R16
+	P8_STXVB16X(V15, MASK_PTR, R0)
+	CMP	IN_LEN, $8
+	BLT	next4
+	MOVD	0(MASK_PTR), R14
+	MOVD	0(BLK_INP), R15
+	XOR	R14, R15, R14
+	MOVD	R14, 0(BLK_OUT)
+	ADD	$8, R16
+	ADD	$-8, IN_LEN
+next4:
+	CMP	IN_LEN, $4
+	BLT	next2
+	MOVWZ	(BLK_INP)(R16), R15
+	MOVWZ	(MASK_PTR)(R16), R14
+	XOR	R14, R15, R14
+	MOVW	R14, (R16)(BLK_OUT)
+	ADD	$4, R16
+	ADD	$-4, IN_LEN
+next2:
+	CMP	IN_LEN, $2
+	BLT	next1
+	MOVHZ	(BLK_INP)(R16), R15
+	MOVHZ	(MASK_PTR)(R16), R14
+	XOR	R14, R15, R14
+	MOVH	R14, (R16)(BLK_OUT)
+	ADD	$2, R16
+	ADD	$-2, IN_LEN
+next1:
+	CMP	IN_LEN, $1
+	BLT	done
+	MOVBZ	(MASK_PTR)(R16), R14
+	MOVBZ	(BLK_INP)(R16), R15
+	XOR	R14, R15, R14
+	MOVB	R14, (R16)(BLK_OUT)
+#endif
+done:
+	// Save the updated counter value
+	P8_STXVB16X(V30, COUNTER, R0)
+	// Clear the keys
+	XXLXOR	VS0, VS0, VS0
+	XXLXOR	VS1, VS1, VS1
+	XXLXOR	VS2, VS2, VS2
+	XXLXOR	VS3, VS3, VS3
+	XXLXOR	VS4, VS4, VS4
+	XXLXOR	VS5, VS5, VS5
+	XXLXOR	VS6, VS6, VS6
+	XXLXOR	VS7, VS7, VS7
+	XXLXOR	VS8, VS8, VS8
+	XXLXOR	VS9, VS9, VS9
+	XXLXOR	VS10, VS10, VS10
+	XXLXOR	VS11, VS11, VS11
+	XXLXOR	VS12, VS12, VS12
+	XXLXOR	VS13, VS13, VS13
+	XXLXOR	VS14, VS14, VS14
+	RET
+