1
0
mirror of https://github.com/golang/go synced 2024-11-23 20:30:04 -07:00

crypto/aes: make the GHASH part of AES-GCM faster

By processing 8 blocks in parallel GHASH achieves higher throughput on amd64

Results on Skylake i7:

benchmark                   old ns/op     new ns/op     delta
BenchmarkAESGCMSeal1K-8     316           314           -0.63%
BenchmarkAESGCMOpen1K-8     282           281           -0.35%
BenchmarkAESGCMSign8K-8     5611          1099          -80.41%
BenchmarkAESGCMSeal8K-8     1869          1922          +2.84%
BenchmarkAESGCMOpen8K-8     1718          1724          +0.35%

benchmark                   old MB/s     new MB/s     speedup
BenchmarkAESGCMSeal1K-8     3237.10      3260.94      1.01x
BenchmarkAESGCMOpen1K-8     3629.74      3638.10      1.00x
BenchmarkAESGCMSign8K-8     1459.82      7452.99      5.11x
BenchmarkAESGCMSeal8K-8     4382.45      4260.93      0.97x
BenchmarkAESGCMOpen8K-8     4766.41      4750.54      1.00x

Change-Id: I479f2a791a968caa1c516115b0b6b96a791a20d2
Reviewed-on: https://go-review.googlesource.com/57150
Reviewed-by: Adam Langley <agl@golang.org>
This commit is contained in:
Vlad Krasnov 2017-08-18 12:49:59 -07:00 committed by Adam Langley
parent 57584a0ee1
commit b2174a16c0
2 changed files with 97 additions and 5 deletions

View File

@ -324,6 +324,20 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
#define tPtr CX
#define autLen DX
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define mulRoundAAD(X ,i) \
MOVOU (16*(i*2))(pTbl), T1;\
MOVOU T1, T2;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACC0;\
PCLMULQDQ $0x11, X, T2;\
PXOR T2, ACC1;\
PSHUFD $78, X, T1;\
PXOR T1, X;\
MOVOU (16*(i*2+1))(pTbl), T1;\
PCLMULQDQ $0x00, X, T1;\
PXOR T1, ACCM
MOVQ productTable+0(FP), pTbl
MOVQ data_base+8(FP), aut
MOVQ data_len+16(FP), autLen
@ -333,15 +347,18 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
MOVOU bswapMask<>(SB), BSWAP
MOVOU gcmPoly<>(SB), POLY
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
TESTQ autLen, autLen
JEQ dataBail
CMPQ autLen, $13 // optimize the TLS case
JNE dataSinglesLoop
JE dataTLS
CMPQ autLen, $128
JB startSinglesLoop
JMP dataOctaLoop
dataTLS:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
PXOR B0, B0
MOVQ (aut), B0
PINSRD $2, 8(aut), B0
@ -349,6 +366,63 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
XORQ autLen, autLen
JMP dataMul
dataOctaLoop:
CMPQ autLen, $128
JB startSinglesLoop
SUBQ $128, autLen
MOVOU (16*0)(aut), X0
MOVOU (16*1)(aut), X1
MOVOU (16*2)(aut), X2
MOVOU (16*3)(aut), X3
MOVOU (16*4)(aut), X4
MOVOU (16*5)(aut), X5
MOVOU (16*6)(aut), X6
MOVOU (16*7)(aut), X7
LEAQ (16*8)(aut), aut
PSHUFB BSWAP, X0
PSHUFB BSWAP, X1
PSHUFB BSWAP, X2
PSHUFB BSWAP, X3
PSHUFB BSWAP, X4
PSHUFB BSWAP, X5
PSHUFB BSWAP, X6
PSHUFB BSWAP, X7
PXOR ACC0, X0
MOVOU (16*0)(pTbl), ACC0
MOVOU (16*1)(pTbl), ACCM
MOVOU ACC0, ACC1
PSHUFD $78, X0, T1
PXOR X0, T1
PCLMULQDQ $0x00, X0, ACC0
PCLMULQDQ $0x11, X0, ACC1
PCLMULQDQ $0x00, T1, ACCM
mulRoundAAD(X1, 1)
mulRoundAAD(X2, 2)
mulRoundAAD(X3, 3)
mulRoundAAD(X4, 4)
mulRoundAAD(X5, 5)
mulRoundAAD(X6, 6)
mulRoundAAD(X7, 7)
PXOR ACC0, ACCM
PXOR ACC1, ACCM
MOVOU ACCM, T0
PSRLDQ $8, ACCM
PSLLDQ $8, T0
PXOR ACCM, ACC1
PXOR T0, ACC0
reduceRound(ACC0)
reduceRound(ACC0)
PXOR ACC1, ACC0
JMP dataOctaLoop
startSinglesLoop:
MOVOU (16*14)(pTbl), T1
MOVOU (16*15)(pTbl), T2
dataSinglesLoop:
CMPQ autLen, $16
@ -438,7 +512,6 @@ TEXT ·gcmAesEnc(SB),0,$256-96
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
#define combinedRound(i) \
MOVOU (16*i)(ks), T0;\
AESENC T0, B0;\

View File

@ -10,6 +10,21 @@ import (
"testing"
)
func benchmarkAESGCMSign(b *testing.B, buf []byte) {
b.SetBytes(int64(len(buf)))
var key [16]byte
var nonce [12]byte
aes, _ := aes.NewCipher(key[:])
aesgcm, _ := cipher.NewGCM(aes)
var out []byte
b.ResetTimer()
for i := 0; i < b.N; i++ {
out = aesgcm.Seal(out[:0], nonce[:], nil, buf)
}
}
func benchmarkAESGCMSeal(b *testing.B, buf []byte) {
b.SetBytes(int64(len(buf)))
@ -54,6 +69,10 @@ func BenchmarkAESGCMOpen1K(b *testing.B) {
benchmarkAESGCMOpen(b, make([]byte, 1024))
}
func BenchmarkAESGCMSign8K(b *testing.B) {
benchmarkAESGCMSign(b, make([]byte, 8*1024))
}
func BenchmarkAESGCMSeal8K(b *testing.B) {
benchmarkAESGCMSeal(b, make([]byte, 8*1024))
}