crypto/aes: implement AES-GCM AEAD for arm64
Use the dedicated AES* and PMULL* instructions to accelerate AES-GCM.

name             old time/op    new time/op    delta
AESGCMSeal1K-46    12.1µs ± 0%     0.9µs ± 0%   -92.66%  (p=0.000 n=9+10)
AESGCMOpen1K-46    12.1µs ± 0%     0.9µs ± 0%   -92.43%  (p=0.000 n=10+10)
AESGCMSign8K-46    58.6µs ± 0%     2.1µs ± 0%   -96.41%  (p=0.000 n=9+8)
AESGCMSeal8K-46    92.8µs ± 0%     5.7µs ± 0%   -93.86%  (p=0.000 n=9+9)
AESGCMOpen8K-46    92.9µs ± 0%     5.7µs ± 0%   -93.84%  (p=0.000 n=8+9)

name             old speed      new speed        delta
AESGCMSeal1K-46  84.7MB/s ± 0%  1153.4MB/s ± 0%  +1262.21%  (p=0.000 n=9+10)
AESGCMOpen1K-46  84.4MB/s ± 0%  1115.2MB/s ± 0%  +1220.53%  (p=0.000 n=10+10)
AESGCMSign8K-46   140MB/s ± 0%    3894MB/s ± 0%  +2687.50%  (p=0.000 n=9+10)
AESGCMSeal8K-46  88.2MB/s ± 0%  1437.5MB/s ± 0%  +1529.30%  (p=0.000 n=9+9)
AESGCMOpen8K-46  88.2MB/s ± 0%  1430.5MB/s ± 0%  +1522.01%  (p=0.000 n=8+9)

This change mirrors the current amd64 implementation, and provides
optimal performance on a range of arm64 processors including Centriq
2400 and Apple A12. By and large it is implicitly tested by the
robustness of the already existing amd64 implementation.

The implementation interleaves GHASH with CTR mode to achieve the
highest possible throughput; it also aggregates GHASH with a factor
of 8, to decrease the cost of the reduction step.

Even though there is a significant amount of assembly, the code
reuses the Go code of the amd64 implementation, so there is little
additional Go code.

Since AES-GCM is critical for the performance of all web servers,
this change is required to level the playing field for arm64 CPUs,
where amd64 currently enjoys an unfair advantage.

Ideally both the amd64 and arm64 codepaths could be replaced by
hypothetical AES and CLMUL intrinsics, with a few additional vector
instructions.

Fixes #18498
Fixes #19840

Change-Id: Icc57b868cd1f67ac695c1ac163a8e215f74c7910
Reviewed-on: https://go-review.googlesource.com/107298
Run-TryBot: Vlad Krasnov <vlad@cloudflare.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
parent c814ac44c0
commit 4f1f503373
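For context: user code reaches this fast path through the standard crypto/cipher API. When the CPU has the AES and PMULL instructions, aes.NewCipher returns a GCM-capable block and cipher.NewGCM transparently selects the assembly-backed AEAD. A minimal usage sketch (standard library only, nothing specific to this CL):

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"fmt"
	"io"
)

func main() {
	key := make([]byte, 16) // AES-128; 24 or 32 bytes select AES-192/256
	if _, err := io.ReadFull(rand.Reader, key); err != nil {
		panic(err)
	}
	block, err := aes.NewCipher(key) // GCM-capable block when AES asm is available
	if err != nil {
		panic(err)
	}
	aead, err := cipher.NewGCM(block) // picks the accelerated implementation if present
	if err != nil {
		panic(err)
	}
	nonce := make([]byte, aead.NonceSize())
	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
		panic(err)
	}
	ct := aead.Seal(nil, nonce, []byte("hello"), nil)
	pt, err := aead.Open(nil, nonce, ct, nil)
	fmt.Println(string(pt), err)
}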
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build amd64
+// +build amd64 arm64

 package aes

@@ -13,10 +13,7 @@ import (
 	"errors"
 )

-// The following functions are defined in gcm_amd64.s.
-
-//go:noescape
-func aesEncBlock(dst, src *[16]byte, ks []uint32)
+// The following functions are defined in gcm_*.s.

 //go:noescape
 func gcmAesInit(productTable *[256]byte, ks []uint32)
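gcmAesInit precomputes powers of the hash key H into productTable; those powers are what make the factor-8 GHASH aggregation mentioned in the commit message possible. A toy sketch of the algebra, using ordinary modular arithmetic as a stand-in for GF(2^128): Horner's rule reduces after every block, while the aggregated form x1·H^8 ⊕ … ⊕ x8·H needs one reduction per 8 blocks.

package main

import "fmt"

const p = 2147483647 // toy modulus standing in for the GF(2^128) reduction

func main() {
	h := uint64(123456789) // hash key H
	blocks := []uint64{11, 22, 33, 44, 55, 66, 77, 88}

	// Horner's rule: one multiply-and-reduce per block.
	horner := uint64(0)
	for _, x := range blocks {
		horner = (horner + x) % p * h % p
	}

	// Aggregated form: x1*H^8 + x2*H^7 + ... + x8*H, with the powers
	// precomputed (the role productTable plays in the assembly).
	pow := make([]uint64, len(blocks)+1)
	pow[1] = h % p
	for i := 2; i <= len(blocks); i++ {
		pow[i] = pow[i-1] * h % p
	}
	agg := uint64(0)
	for i, x := range blocks {
		agg += x % p * pow[len(blocks)-i] % p
	}
	agg %= p

	fmt.Println(horner, agg, horner == agg) // identical digests
}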
@@ -118,7 +115,7 @@ func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
 		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
 	}

-	aesEncBlock(&tagMask, &counter, g.ks)
+	encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])

 	var tagOut [gcmTagSize]byte
 	gcmAesData(&g.productTable, data, &tagOut)
@@ -171,7 +168,7 @@ func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
 		gcmAesFinish(&g.productTable, &tagMask, &counter, uint64(len(nonce)), uint64(0))
 	}

-	aesEncBlock(&tagMask, &counter, g.ks)
+	encryptBlockAsm(len(g.ks)/4-1, &g.ks[0], &tagMask[0], &counter[0])

 	var expectedTag [gcmTagSize]byte
 	gcmAesData(&g.productTable, data, &expectedTag)
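In both Seal and Open above, the dedicated aesEncBlock helper is replaced by the shared encryptBlockAsm, which encrypts the initial counter block to produce the tag mask E_K(J0). A pure-Go sketch of that step (tagMask is a hypothetical helper written for illustration, not part of the package):

package main

import (
	"crypto/aes"
	"fmt"
)

// tagMask computes E_K(J0), the value XORed into the GHASH output to
// form the final tag; this is what the encryptBlockAsm call produces.
func tagMask(key, counter0 []byte) [16]byte {
	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	var mask [16]byte
	block.Encrypt(mask[:], counter0)
	return mask
}

func main() {
	key := make([]byte, 16)
	j0 := make([]byte, 16) // for a 96-bit nonce, J0 = nonce || 0x00000001
	j0[15] = 1
	fmt.Printf("tag mask: %x\n", tagMask(key, j0))
}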
@@ -3,7 +3,12 @@
 // license that can be found in the LICENSE file.

 #include "textflag.h"

+DATA rotInvSRows<>+0x00(SB)/8, $0x080f0205040b0e01
+DATA rotInvSRows<>+0x08(SB)/8, $0x00070a0d0c030609
+GLOBL rotInvSRows<>(SB), (NOPTR+RODATA), $16
+DATA invSRows<>+0x00(SB)/8, $0x0b0e0104070a0d00
+DATA invSRows<>+0x08(SB)/8, $0x0306090c0f020508
+GLOBL invSRows<>(SB), (NOPTR+RODATA), $16
 // func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
 TEXT ·encryptBlockAsm(SB),NOSPLIT,$0
 	MOVD	nr+0(FP), R9
@@ -105,3 +110,172 @@ dec128:
 	VEOR	V0.B16, V15.B16, V0.B16
 	VST1	[V0.B16], (R11)
 	RET
+
+// func expandKeyAsm(nr int, key *byte, enc, dec *uint32) {
+// Note that round keys are stored in uint128 format, not uint32
+TEXT ·expandKeyAsm(SB),NOSPLIT,$0
+	MOVD	nr+0(FP), R8
+	MOVD	key+8(FP), R9
+	MOVD	enc+16(FP), R10
+	MOVD	dec+24(FP), R11
+	LDP	rotInvSRows<>(SB), (R0, R1)
+	VMOV	R0, V3.D[0]
+	VMOV	R1, V3.D[1]
+	VEOR	V0.B16, V0.B16, V0.B16 // All zeroes
+	MOVW	$1, R13
+	TBZ	$1, R8, ks192
+	TBNZ	$2, R8, ks256
+	LDPW	(R9), (R4, R5)
+	LDPW	8(R9), (R6, R7)
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	MOVW	$0x1b, R14
+ks128Loop:
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16 // Use AES to compute the SBOX
+	EORW	R13, R4
+	LSLW	$1, R13 // Compute next Rcon
+	ANDSW	$0x100, R13, ZR
+	CSELW	NE, R14, R13, R13 // Fake modulo
+	SUBS	$1, R8
+	VMOV	V2.S[0], R0
+	EORW	R0, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	BNE	ks128Loop
+	CBZ	R11, ksDone // If dec is nil we are done
+	SUB	$176, R10
+	// Decryption keys are encryption keys with InverseMixColumns applied
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V14.B16
+	AESIMC	V1.B16, V13.B16
+	VMOV	V2.B16, V12.B16
+	VST1.P	[V12.B16, V13.B16, V14.B16], 48(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks192:
+	LDPW	(R9), (R2, R3)
+	LDPW	8(R9), (R4, R5)
+	LDPW	16(R9), (R6, R7)
+	STPW.P	(R2, R3), 8(R10)
+	STPW.P	(R4, R5), 8(R10)
+	SUB	$4, R8
+ks192Loop:
+	STPW.P	(R6, R7), 8(R10)
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	EORW	R13, R2
+	LSLW	$1, R13
+	SUBS	$1, R8
+	VMOV	V2.S[0], R0
+	EORW	R0, R2
+	EORW	R2, R3
+	EORW	R3, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R2, R3), 8(R10)
+	STPW.P	(R4, R5), 8(R10)
+	BNE	ks192Loop
+	CBZ	R11, ksDone
+	SUB	$208, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16]
+	VST1.P	[V0.B16], 16(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+	B	ksDone
+ks256:
+	LDP	invSRows<>(SB), (R0, R1)
+	VMOV	R0, V4.D[0]
+	VMOV	R1, V4.D[1]
+	LDPW	(R9), (R0, R1)
+	LDPW	8(R9), (R2, R3)
+	LDPW	16(R9), (R4, R5)
+	LDPW	24(R9), (R6, R7)
+	STPW.P	(R0, R1), 8(R10)
+	STPW.P	(R2, R3), 8(R10)
+	SUB	$7, R8
+ks256Loop:
+	STPW.P	(R4, R5), 8(R10)
+	STPW.P	(R6, R7), 8(R10)
+	VMOV	R7, V2.S[0]
+	WORD	$0x4E030042 // TBL V3.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	EORW	R13, R0
+	LSLW	$1, R13
+	SUBS	$1, R8
+	VMOV	V2.S[0], R9
+	EORW	R9, R0
+	EORW	R0, R1
+	EORW	R1, R2
+	EORW	R2, R3
+	VMOV	R3, V2.S[0]
+	WORD	$0x4E040042 // TBL V4.B16, [V2.B16], V2.B16
+	AESE	V0.B16, V2.B16
+	VMOV	V2.S[0], R9
+	EORW	R9, R4
+	EORW	R4, R5
+	EORW	R5, R6
+	EORW	R6, R7
+	STPW.P	(R0, R1), 8(R10)
+	STPW.P	(R2, R3), 8(R10)
+	BNE	ks256Loop
+	CBZ	R11, ksDone
+	SUB	$240, R10
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	VMOV	V0.B16, V7.B16
+	AESIMC	V1.B16, V6.B16
+	AESIMC	V2.B16, V5.B16
+	AESIMC	V3.B16, V4.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V11.B16
+	AESIMC	V1.B16, V10.B16
+	AESIMC	V2.B16, V9.B16
+	AESIMC	V3.B16, V8.B16
+	VLD1.P	64(R10), [V0.B16, V1.B16, V2.B16, V3.B16]
+	AESIMC	V0.B16, V15.B16
+	AESIMC	V1.B16, V14.B16
+	AESIMC	V2.B16, V13.B16
+	AESIMC	V3.B16, V12.B16
+	VLD1	(R10), [V0.B16, V1.B16, V2.B16]
+	AESIMC	V0.B16, V18.B16
+	AESIMC	V1.B16, V17.B16
+	VMOV	V2.B16, V16.B16
+	VST1.P	[V16.B16, V17.B16, V18.B16], 48(R11)
+	VST1.P	[V12.B16, V13.B16, V14.B16, V15.B16], 64(R11)
+	VST1.P	[V8.B16, V9.B16, V10.B16, V11.B16], 64(R11)
+	VST1	[V4.B16, V5.B16, V6.B16, V7.B16], (R11)
+ksDone:
+	RET
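The LSLW/ANDSW/CSELW triple in the key-schedule loops above is the round-constant update: Rcon doubles in GF(2^8) each iteration and collapses to 0x1b once the shift carries out of the low byte. A pure-Go sketch of that "fake modulo":

package main

import "fmt"

func main() {
	rcon := uint32(1)
	r14 := uint32(0x1b) // reduction constant, as loaded into R14
	for i := 0; i < 10; i++ {
		fmt.Printf("round %2d: rcon = %#02x\n", i+1, rcon)
		rcon <<= 1            // LSLW $1, R13
		if rcon&0x100 != 0 {  // ANDSW $0x100, R13, ZR
			rcon = r14    // CSELW NE, R14, R13, R13
		}
	}
}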
@@ -1,80 +0,0 @@
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package aes
-
-import (
-	"crypto/cipher"
-	"crypto/internal/subtle"
-	"internal/cpu"
-	"math/bits"
-)
-
-// defined in asm_arm64.s
-//go:noescape
-func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)
-
-//go:noescape
-func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)
-
-type aesCipherAsm struct {
-	aesCipher
-}
-
-func newCipher(key []byte) (cipher.Block, error) {
-	if !cpu.ARM64.HasAES {
-		return newCipherGeneric(key)
-	}
-	n := len(key) + 28
-	c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
-	arm64ExpandKey(key, c.enc, c.dec)
-	return &c, nil
-}
-
-func (c *aesCipherAsm) BlockSize() int { return BlockSize }
-
-func (c *aesCipherAsm) Encrypt(dst, src []byte) {
-	if len(src) < BlockSize {
-		panic("crypto/aes: input not full block")
-	}
-	if len(dst) < BlockSize {
-		panic("crypto/aes: output not full block")
-	}
-	if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
-		panic("crypto/aes: invalid buffer overlap")
-	}
-	encryptBlockAsm(len(c.enc)/4-1, &c.enc[0], &dst[0], &src[0])
-}
-
-func (c *aesCipherAsm) Decrypt(dst, src []byte) {
-	if len(src) < BlockSize {
-		panic("crypto/aes: input not full block")
-	}
-	if len(dst) < BlockSize {
-		panic("crypto/aes: output not full block")
-	}
-	if subtle.InexactOverlap(dst[:BlockSize], src[:BlockSize]) {
-		panic("crypto/aes: invalid buffer overlap")
-	}
-	decryptBlockAsm(len(c.dec)/4-1, &c.dec[0], &dst[0], &src[0])
-}
-
-func arm64ExpandKey(key []byte, enc, dec []uint32) {
-	expandKeyGo(key, enc, dec)
-	nk := len(enc)
-	for i := 0; i < nk; i++ {
-		enc[i] = bits.ReverseBytes32(enc[i])
-		dec[i] = bits.ReverseBytes32(dec[i])
-	}
-}
-
-// expandKey is used by BenchmarkExpand to ensure that the asm implementation
-// of key expansion is used for the benchmark when it is available.
-func expandKey(key []byte, enc, dec []uint32) {
-	if cpu.ARM64.HasAES {
-		arm64ExpandKey(key, enc, dec)
-	} else {
-		expandKeyGo(key, enc, dec)
-	}
-}
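The deleted file above byte-swapped every word of the Go-generated schedule: the assembly consumes round keys as raw 128-bit quantities (the "uint128 format" noted on expandKeyAsm), whereas expandKeyGo emits uint32 words in the opposite byte order. The swap it performed, in isolation:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	w := uint32(0x01020304)
	// bits.ReverseBytes32 flips a round-key word between the uint32
	// layout produced by expandKeyGo and the byte order the asm loads.
	fmt.Printf("%#08x -> %#08x\n", w, bits.ReverseBytes32(w))
}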
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+// +build amd64 arm64
+
 package aes

 import (
@@ -10,23 +12,31 @@ import (
 	"internal/cpu"
 )

-// defined in asm_amd64.s
+// defined in asm_*.s

 //go:noescape
 func encryptBlockAsm(nr int, xk *uint32, dst, src *byte)

 //go:noescape
 func decryptBlockAsm(nr int, xk *uint32, dst, src *byte)

 //go:noescape
 func expandKeyAsm(nr int, key *byte, enc *uint32, dec *uint32)

 type aesCipherAsm struct {
 	aesCipher
 }

+var supportsAES = cpu.X86.HasAES || cpu.ARM64.HasAES
+var supportsGFMUL = cpu.X86.HasPCLMULQDQ || cpu.ARM64.HasPMULL
+
 func newCipher(key []byte) (cipher.Block, error) {
-	if !cpu.X86.HasAES {
+	if !supportsAES {
 		return newCipherGeneric(key)
 	}
 	n := len(key) + 28
 	c := aesCipherAsm{aesCipher{make([]uint32, n), make([]uint32, n)}}
-	rounds := 10
+	var rounds int
 	switch len(key) {
 	case 128 / 8:
 		rounds = 10
@@ -37,10 +47,9 @@ func newCipher(key []byte) (cipher.Block, error) {
 	}

 	expandKeyAsm(rounds, &key[0], &c.enc[0], &c.dec[0])
-	if cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ {
+	if supportsAES && supportsGFMUL {
 		return &aesCipherGCM{c}, nil
 	}

 	return &c, nil
 }

@@ -75,7 +84,7 @@ func (c *aesCipherAsm) Decrypt(dst, src []byte) {
 // expandKey is used by BenchmarkExpand to ensure that the asm implementation
 // of key expansion is used for the benchmark when it is available.
 func expandKey(key []byte, enc, dec []uint32) {
-	if cpu.X86.HasAES {
+	if supportsAES {
 		rounds := 10 // rounds needed for AES128
 		switch len(key) {
 		case 192 / 8:
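The rounds switch above maps key sizes to round counts; the schedule allocation n := len(key) + 28 follows directly from it, since an AES key schedule holds 4*(rounds+1) uint32 words. A standalone sketch of the arithmetic:

package main

import "fmt"

func main() {
	for _, keyLen := range []int{16, 24, 32} {
		var rounds int
		switch keyLen {
		case 128 / 8:
			rounds = 10
		case 192 / 8:
			rounds = 12
		case 256 / 8:
			rounds = 14
		}
		// A schedule holds 4*(rounds+1) uint32 words == keyLen+28.
		fmt.Printf("AES-%d: %d rounds, %d schedule words\n",
			keyLen*8, rounds, keyLen+28)
	}
}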
@@ -71,56 +71,6 @@ GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16
 GLOBL gcmPoly<>(SB), (NOPTR+RODATA), $16
 GLOBL andMask<>(SB), (NOPTR+RODATA), $240

-// func aesEncBlock(dst, src *[16]byte, ks []uint32)
-TEXT ·aesEncBlock(SB),NOSPLIT,$0
-	MOVQ	dst+0(FP), DI
-	MOVQ	src+8(FP), SI
-	MOVQ	ks_base+16(FP), DX
-	MOVQ	ks_len+24(FP), CX
-
-	SHRQ	$2, CX
-	DECQ	CX
-
-	MOVOU	(SI), X0
-	MOVOU	(16*0)(DX), X1
-	PXOR	X1, X0
-	MOVOU	(16*1)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*2)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*3)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*4)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*5)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*6)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*7)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*8)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*9)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*10)(DX), X1
-	CMPQ	CX, $12
-	JB	encLast
-	AESENC	X1, X0
-	MOVOU	(16*11)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*12)(DX), X1
-	JE	encLast
-	AESENC	X1, X0
-	MOVOU	(16*13)(DX), X1
-	AESENC	X1, X0
-	MOVOU	(16*14)(DX), X1
-
-encLast:
-	AESENCLAST	X1, X0
-	MOVOU	X0, (DI)
-
-	RET
-
 // func gcmAesFinish(productTable *[256]byte, tagMask, T *[16]byte, pLen, dLen uint64)
 TEXT ·gcmAesFinish(SB),NOSPLIT,$0
 #define pTbl DI
src/crypto/aes/gcm_arm64.s (new file, 1021 lines)
File diff suppressed because it is too large.
@@ -424,7 +424,7 @@ func TestGCMAsm(t *testing.T) {

 	// generate permutations
 	type pair struct{ align, length int }
-	lengths := []int{0, 8192, 8193, 8208}
+	lengths := []int{0, 156, 8192, 8193, 8208}
 	keySizes := []int{16, 24, 32}
 	alignments := []int{0, 1, 2, 3}
 	if testing.Short() {
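The new 156-byte length exercises a size that is neither block-aligned nor a multiple of the 8-block aggregation window. A simplified round-trip check in the spirit of TestGCMAsm (the real test cross-checks the assembly against the generic implementation at several buffer alignments):

package main

import (
	"bytes"
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

func main() {
	key := make([]byte, 16)
	nonce := make([]byte, 12) // fixed nonce: fine for a test, never for real use
	for _, length := range []int{0, 156, 8192, 8193, 8208} {
		pt := make([]byte, length)
		block, _ := aes.NewCipher(key)
		aead, _ := cipher.NewGCM(block)
		ct := aead.Seal(nil, nonce, pt, nil)
		got, err := aead.Open(nil, nonce, ct, nil)
		if err != nil || !bytes.Equal(got, pt) {
			panic("round trip mismatch")
		}
		fmt.Printf("length %d ok (ct %d bytes)\n", length, len(ct))
	}
}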
@@ -925,12 +925,7 @@ func initDefaultCipherSuites() {
 	// Worst case, these variables will just all be false
 	hasGCMAsmAMD64 := cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ

-	// TODO: enable the arm64 HasAES && HasPMULL feature check after the
-	// optimized AES-GCM implementation for arm64 is merged (CL 107298).
-	// This is explicitly set to false for now to prevent misprioritization
-	// of AES-GCM based cipher suites, which will be slower than chacha20-poly1305
-	hasGCMAsmARM64 := false
-	// hasGCMAsmARM64 := cpu.ARM64.HasAES && cpu.ARM64.HasPMULL
+	hasGCMAsmARM64 := cpu.ARM64.HasAES && cpu.ARM64.HasPMULL

 	// Keep in sync with crypto/aes/cipher_s390x.go.
 	hasGCMAsmS390X := cpu.S390X.HasAES && cpu.S390X.HasAESCBC && cpu.S390X.HasAESCTR && (cpu.S390X.HasGHASH || cpu.S390X.HasAESGCM)
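With hasGCMAsmARM64 now tracking the real CPU features, the downstream preference logic can rank AES-GCM suites ahead of ChaCha20-Poly1305 on capable arm64 hardware. A sketch of that decision, under the assumption that the per-platform flags are simply ORed together as on the other architectures:

package main

import "fmt"

func main() {
	// Stand-ins for the cpu feature checks in initDefaultCipherSuites.
	hasGCMAsmAMD64 := false // cpu.X86.HasAES && cpu.X86.HasPCLMULQDQ
	hasGCMAsmARM64 := true  // cpu.ARM64.HasAES && cpu.ARM64.HasPMULL (assumed here)
	hasGCMAsmS390X := false // cpu.S390X.HasAES && ...

	hasGCMAsm := hasGCMAsmAMD64 || hasGCMAsmARM64 || hasGCMAsmS390X
	if hasGCMAsm {
		fmt.Println("prefer AES-GCM cipher suites")
	} else {
		fmt.Println("prefer ChaCha20-Poly1305")
	}
}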