mirror of
https://github.com/golang/go
synced 2024-11-23 20:30:04 -07:00
crypto/aes: make the GHASH part of AES-GCM faster
By processing 8 blocks in parallel GHASH achieves higher throughput on amd64 Results on Skylake i7: benchmark old ns/op new ns/op delta BenchmarkAESGCMSeal1K-8 316 314 -0.63% BenchmarkAESGCMOpen1K-8 282 281 -0.35% BenchmarkAESGCMSign8K-8 5611 1099 -80.41% BenchmarkAESGCMSeal8K-8 1869 1922 +2.84% BenchmarkAESGCMOpen8K-8 1718 1724 +0.35% benchmark old MB/s new MB/s speedup BenchmarkAESGCMSeal1K-8 3237.10 3260.94 1.01x BenchmarkAESGCMOpen1K-8 3629.74 3638.10 1.00x BenchmarkAESGCMSign8K-8 1459.82 7452.99 5.11x BenchmarkAESGCMSeal8K-8 4382.45 4260.93 0.97x BenchmarkAESGCMOpen8K-8 4766.41 4750.54 1.00x Change-Id: I479f2a791a968caa1c516115b0b6b96a791a20d2 Reviewed-on: https://go-review.googlesource.com/57150 Reviewed-by: Adam Langley <agl@golang.org>
This commit is contained in:
parent
57584a0ee1
commit
b2174a16c0
@ -324,6 +324,20 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
|
||||
#define tPtr CX
|
||||
#define autLen DX
|
||||
|
||||
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
|
||||
#define mulRoundAAD(X ,i) \
|
||||
MOVOU (16*(i*2))(pTbl), T1;\
|
||||
MOVOU T1, T2;\
|
||||
PCLMULQDQ $0x00, X, T1;\
|
||||
PXOR T1, ACC0;\
|
||||
PCLMULQDQ $0x11, X, T2;\
|
||||
PXOR T2, ACC1;\
|
||||
PSHUFD $78, X, T1;\
|
||||
PXOR T1, X;\
|
||||
MOVOU (16*(i*2+1))(pTbl), T1;\
|
||||
PCLMULQDQ $0x00, X, T1;\
|
||||
PXOR T1, ACCM
|
||||
|
||||
MOVQ productTable+0(FP), pTbl
|
||||
MOVQ data_base+8(FP), aut
|
||||
MOVQ data_len+16(FP), autLen
|
||||
@ -333,15 +347,18 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
|
||||
MOVOU bswapMask<>(SB), BSWAP
|
||||
MOVOU gcmPoly<>(SB), POLY
|
||||
|
||||
MOVOU (16*14)(pTbl), T1
|
||||
MOVOU (16*15)(pTbl), T2
|
||||
|
||||
TESTQ autLen, autLen
|
||||
JEQ dataBail
|
||||
|
||||
CMPQ autLen, $13 // optimize the TLS case
|
||||
JNE dataSinglesLoop
|
||||
JE dataTLS
|
||||
CMPQ autLen, $128
|
||||
JB startSinglesLoop
|
||||
JMP dataOctaLoop
|
||||
|
||||
dataTLS:
|
||||
MOVOU (16*14)(pTbl), T1
|
||||
MOVOU (16*15)(pTbl), T2
|
||||
PXOR B0, B0
|
||||
MOVQ (aut), B0
|
||||
PINSRD $2, 8(aut), B0
|
||||
@ -349,6 +366,63 @@ TEXT ·gcmAesData(SB),NOSPLIT,$0
|
||||
XORQ autLen, autLen
|
||||
JMP dataMul
|
||||
|
||||
dataOctaLoop:
|
||||
CMPQ autLen, $128
|
||||
JB startSinglesLoop
|
||||
SUBQ $128, autLen
|
||||
|
||||
MOVOU (16*0)(aut), X0
|
||||
MOVOU (16*1)(aut), X1
|
||||
MOVOU (16*2)(aut), X2
|
||||
MOVOU (16*3)(aut), X3
|
||||
MOVOU (16*4)(aut), X4
|
||||
MOVOU (16*5)(aut), X5
|
||||
MOVOU (16*6)(aut), X6
|
||||
MOVOU (16*7)(aut), X7
|
||||
LEAQ (16*8)(aut), aut
|
||||
PSHUFB BSWAP, X0
|
||||
PSHUFB BSWAP, X1
|
||||
PSHUFB BSWAP, X2
|
||||
PSHUFB BSWAP, X3
|
||||
PSHUFB BSWAP, X4
|
||||
PSHUFB BSWAP, X5
|
||||
PSHUFB BSWAP, X6
|
||||
PSHUFB BSWAP, X7
|
||||
PXOR ACC0, X0
|
||||
|
||||
MOVOU (16*0)(pTbl), ACC0
|
||||
MOVOU (16*1)(pTbl), ACCM
|
||||
MOVOU ACC0, ACC1
|
||||
PSHUFD $78, X0, T1
|
||||
PXOR X0, T1
|
||||
PCLMULQDQ $0x00, X0, ACC0
|
||||
PCLMULQDQ $0x11, X0, ACC1
|
||||
PCLMULQDQ $0x00, T1, ACCM
|
||||
|
||||
mulRoundAAD(X1, 1)
|
||||
mulRoundAAD(X2, 2)
|
||||
mulRoundAAD(X3, 3)
|
||||
mulRoundAAD(X4, 4)
|
||||
mulRoundAAD(X5, 5)
|
||||
mulRoundAAD(X6, 6)
|
||||
mulRoundAAD(X7, 7)
|
||||
|
||||
PXOR ACC0, ACCM
|
||||
PXOR ACC1, ACCM
|
||||
MOVOU ACCM, T0
|
||||
PSRLDQ $8, ACCM
|
||||
PSLLDQ $8, T0
|
||||
PXOR ACCM, ACC1
|
||||
PXOR T0, ACC0
|
||||
reduceRound(ACC0)
|
||||
reduceRound(ACC0)
|
||||
PXOR ACC1, ACC0
|
||||
JMP dataOctaLoop
|
||||
|
||||
startSinglesLoop:
|
||||
MOVOU (16*14)(pTbl), T1
|
||||
MOVOU (16*15)(pTbl), T2
|
||||
|
||||
dataSinglesLoop:
|
||||
|
||||
CMPQ autLen, $16
|
||||
@ -438,7 +512,6 @@ TEXT ·gcmAesEnc(SB),0,$256-96
|
||||
#define aesRnd(k) AESENC k, B0; AESENC k, B1; AESENC k, B2; AESENC k, B3; AESENC k, B4; AESENC k, B5; AESENC k, B6; AESENC k, B7
|
||||
#define aesRound(i) MOVOU (16*i)(ks), T0;AESENC T0, B0; AESENC T0, B1; AESENC T0, B2; AESENC T0, B3; AESENC T0, B4; AESENC T0, B5; AESENC T0, B6; AESENC T0, B7
|
||||
#define aesRndLast(k) AESENCLAST k, B0; AESENCLAST k, B1; AESENCLAST k, B2; AESENCLAST k, B3; AESENCLAST k, B4; AESENCLAST k, B5; AESENCLAST k, B6; AESENCLAST k, B7
|
||||
#define reduceRound(a) MOVOU POLY, T0; PCLMULQDQ $0x01, a, T0; PSHUFD $78, a, a; PXOR T0, a
|
||||
#define combinedRound(i) \
|
||||
MOVOU (16*i)(ks), T0;\
|
||||
AESENC T0, B0;\
|
||||
|
@ -10,6 +10,21 @@ import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func benchmarkAESGCMSign(b *testing.B, buf []byte) {
|
||||
b.SetBytes(int64(len(buf)))
|
||||
|
||||
var key [16]byte
|
||||
var nonce [12]byte
|
||||
aes, _ := aes.NewCipher(key[:])
|
||||
aesgcm, _ := cipher.NewGCM(aes)
|
||||
var out []byte
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
out = aesgcm.Seal(out[:0], nonce[:], nil, buf)
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkAESGCMSeal(b *testing.B, buf []byte) {
|
||||
b.SetBytes(int64(len(buf)))
|
||||
|
||||
@ -54,6 +69,10 @@ func BenchmarkAESGCMOpen1K(b *testing.B) {
|
||||
benchmarkAESGCMOpen(b, make([]byte, 1024))
|
||||
}
|
||||
|
||||
func BenchmarkAESGCMSign8K(b *testing.B) {
|
||||
benchmarkAESGCMSign(b, make([]byte, 8*1024))
|
||||
}
|
||||
|
||||
func BenchmarkAESGCMSeal8K(b *testing.B) {
|
||||
benchmarkAESGCMSeal(b, make([]byte, 8*1024))
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user