1
0
mirror of https://github.com/golang/go synced 2024-11-17 09:14:46 -07:00

crypto/aes: improve performance for aes-cbc on ppc64le

This adds an asm implementation of aes-cbc for ppc64le to
improve performance. This is ported from the
cryptogams implementation as are other functions in
crypto/aes with further description at the top of
the asm file.

Improvements on a power10:

name             old time/op   new time/op    delta
AESCBCEncrypt1K   1.67µs ± 0%    0.87µs ±-48.15%
AESCBCDecrypt1K   1.35µs ± 0%    0.43µs ±-68.48%

name             old speed     new speed      delta
AESCBCEncrypt1K  614MB/s ± 0%  1184MB/s ± 0%+92.84%
AESCBCDecrypt1K  757MB/s ± 0%  2403M/s ± 0 +217.21%

A fuzz test to compare the generic Go implemenation
against the asm implementation has been added.

Change-Id: I18613dfc95c640820b8f1c60d29df638efc7a75c
Reviewed-on: https://go-review.googlesource.com/c/go/+/355429
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
Trust: Paul Murphy <murp@ibm.com>
This commit is contained in:
Lynn Boger 2021-08-26 07:09:50 -05:00
parent 86b5f6a7be
commit 2c6700739a
5 changed files with 423 additions and 0 deletions

View File

@ -498,3 +498,228 @@ loop_dec:
RET // blr RET // blr
// Remove defines from above so they can be defined here
#undef INP
#undef OUT
#undef ROUNDS
#undef KEY
#undef TMP
#undef OUTPERM
#undef OUTMASK
#undef OUTHEAD
#undef OUTTAIL
// CBC encrypt or decrypt
// R3 src
// R4 dst
// R5 len
// R6 key
// R7 iv
// R8 enc=1 dec=0
// Ported from: aes_p8_cbc_encrypt
// Register usage:
// R9: ROUNDS
// R10: Index
// V0: initialized to 0
// V3: initialized to mask
// V4: IV
// V5: SRC
// V6: IV perm mask
// V7: DST
// V10: KEY perm mask
#define INP R3
#define OUT R4
#define LEN R5
#define KEY R6
#define IVP R7
#define ENC R8
#define ROUNDS R9
#define IDX R10
#define RNDKEY0 V0
#define RNDKEY1 V1
#define INOUT V2
#define TMP V3
#define IVEC V4
#define INPTAIL V5
#define INPPERM V6
#define OUTHEAD V7
#define OUTPERM V8
#define OUTMASK V9
#define KEYPERM V10
// Vector loads are done using LVX followed by
// a VPERM using mask generated from previous
// LVSL or LVSR instruction, to obtain the correct
// bytes if address is unaligned.
// Encryption is done with VCIPHER and VCIPHERLAST
// Decryption is done with VNCIPHER and VNCIPHERLAST
// Encrypt and decypt is done as follows:
// - INOUT value is initialized in outer loop.
// - ROUNDS value is adjusted for loop unrolling.
// - Encryption/decryption is done in loop based on
// adjusted ROUNDS value.
// - Final INOUT value is encrypted/decrypted and stored.
// Note: original implementation had an 8X version
// for decryption which was omitted to avoid the
// complexity.
TEXT ·cryptBlocksChain(SB), NOSPLIT|NOFRAME, $0
MOVD src+0(FP), INP
MOVD dst+8(FP), OUT
MOVD length+16(FP), LEN
MOVD key+24(FP), KEY
MOVD iv+32(FP), IVP
MOVD enc+40(FP), ENC
CMPU LEN, $16 // cmpldi r5,16
BC 14, 0, LR // bltlr-
CMPW ENC, $0 // cmpwi r8,0
MOVD $15, IDX // li r10,15
VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0
VSPLTISB $0xf, TMP // vspltisb $0xf,v3
LVX (IVP)(R0), IVEC // lvx v4,r0,r7
LVSL (IVP)(R0), INPPERM // lvsl v6,r0,r7
LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7
VXOR INPPERM, TMP, INPPERM // vxor v3, v6, v6
VPERM IVEC, INPTAIL, INPPERM, IVEC // vperm v4,v4,v5,v6
NEG INP, R11 // neg r11,r3
LVSR (KEY)(R0), KEYPERM // lvsr v10,r0,r6
MOVWZ 240(KEY), ROUNDS // lwz r9,240(r6)
LVSR (R11)(R0), V6 // lvsr v6,r0,r11
LVX (INP)(R0), INPTAIL // lvx v5,r0,r3
ADD $15, INP // addi r3,r3,15
VXOR INPPERM, TMP, INPPERM // vxor v6, v3, v6
LVSL (OUT)(R0), OUTPERM // lvsl v8,r0,r4
VSPLTISB $-1, OUTMASK // vspltisb v9,-1
LVX (OUT)(R0), OUTHEAD // lvx v7,r0,r4
VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
VXOR OUTPERM, TMP, OUTPERM // vxor v8, v3, v8
SRW $1, ROUNDS // rlwinm r9,r9,31,1,31
MOVD $16, IDX // li r10,16
ADD $-1, ROUNDS // addi r9,r9,-1
BEQ Lcbc_dec // beq
PCALIGN $16
// Outer loop: initialize encrypted value (INOUT)
// Load input (INPTAIL) ivec (IVEC)
Lcbc_enc:
VOR INPTAIL, INPTAIL, INOUT // vor v2,v5,v5
LVX (INP)(R0), INPTAIL // lvx v5,r0,r3
ADD $16, INP // addi r3,r3,16
MOVD ROUNDS, CTR // mtctr r9
ADD $-16, LEN // addi r5,r5,-16
LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6
VPERM INOUT, INPTAIL, INPPERM, INOUT // vperm v2,v2,v5,v6
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
ADD $16, IDX // addi r10,r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
VXOR INOUT, RNDKEY0, INOUT // vxor v2,v2,v0
LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6
ADD $16, IDX // addi r10,r10,16
VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
// Encryption loop of INOUT using RNDKEY0 and RNDKEY1
Loop_cbc_enc:
VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
ADD $16, IDX // addi r10,r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
VCIPHER INOUT, RNDKEY0, INOUT // vcipher v2,v2,v0
LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6
ADD $16, IDX // addi r10,r10,16
BC 16, 0, Loop_cbc_enc // bdnz Loop_cbc_enc
// Encrypt tail values and store INOUT
VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v1,v0,v10
VCIPHER INOUT, RNDKEY1, INOUT // vcipher v2,v2,v1
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
MOVD $16, IDX // li r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v0,v1,v10
VCIPHERLAST INOUT, RNDKEY0, IVEC // vcipherlast v4,v2,v0
CMPU LEN, $16 // cmpldi r5,16
VPERM IVEC, IVEC, OUTPERM, TMP // vperm v3,v4,v4,v8
VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9
VOR TMP, TMP, OUTHEAD // vor v7,v3,v3
STVX INOUT, (OUT)(R0) // stvx v2,r0,r4
ADD $16, OUT // addi r4,r4,16
BGE Lcbc_enc // bge Lcbc_enc
BR Lcbc_done // b Lcbc_done
// Outer loop: initialize decrypted value (INOUT)
// Load input (INPTAIL) ivec (IVEC)
Lcbc_dec:
VOR INPTAIL, INPTAIL, TMP // vor v3,v5,v5
LVX (INP)(R0), INPTAIL // lvx v5,r0,r3
ADD $16, INP // addi r3,r3,16
MOVD ROUNDS, CTR // mtctr r9
ADD $-16, LEN // addi r5,r5,-16
LVX (KEY)(R0), RNDKEY0 // lvx v0,r0,r6
VPERM TMP, INPTAIL, INPPERM, TMP // vperm v3,v3,v5,v6
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
ADD $16, IDX // addi r10,r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
VXOR TMP, RNDKEY0, INOUT // vxor v2,v3,v0
LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6
ADD $16, IDX // addi r10,r10,16
PCALIGN $16
// Decryption loop of INOUT using RNDKEY0 and RNDKEY1
Loop_cbc_dec:
VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
ADD $16, IDX // addi r10,r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
VNCIPHER INOUT, RNDKEY0, INOUT // vncipher v2,v2,v0
LVX (KEY)(IDX), RNDKEY0 // lvx v0,r10,r6
ADD $16, IDX // addi r10,r10,16
BC 16, 0, Loop_cbc_dec // bdnz
// Decrypt tail values and store INOUT
VPERM RNDKEY0, RNDKEY1, KEYPERM, RNDKEY1 // vperm v1,v0,v1,v10
VNCIPHER INOUT, RNDKEY1, INOUT // vncipher v2,v2,v1
LVX (KEY)(IDX), RNDKEY1 // lvx v1,r10,r6
MOVD $16, IDX // li r10,16
VPERM RNDKEY1, RNDKEY0, KEYPERM, RNDKEY0 // vperm v0,v1,v0,v10
VNCIPHERLAST INOUT, RNDKEY0, INOUT // vncipherlast v2,v2,v0
CMPU LEN, $16 // cmpldi r5,16
VXOR INOUT, IVEC, INOUT // vxor v2,v2,v4
VOR TMP, TMP, IVEC // vor v4,v3,v3
VPERM INOUT, INOUT, OUTPERM, TMP // vperm v3,v2,v2,v8
VSEL OUTHEAD, TMP, OUTMASK, INOUT // vsel v2,v7,v3,v9
VOR TMP, TMP, OUTHEAD // vor v7,v3,v3
STVX INOUT, (OUT)(R0) // stvx v2,r0,r4
ADD $16, OUT // addi r4,r4,16
BGE Lcbc_dec // bge
Lcbc_done:
ADD $-1, OUT // addi r4,r4,-1
LVX (OUT)(R0), INOUT // lvx v2,r0,r4
VSEL OUTHEAD, INOUT, OUTMASK, INOUT // vsel v2,v7,v2,v9
STVX INOUT, (OUT)(R0) // stvx v2,r0,r4
NEG IVP, ENC // neg r8,r7
MOVD $15, IDX // li r10,15
VXOR RNDKEY0, RNDKEY0, RNDKEY0 // vxor v0,v0,v0
VSPLTISB $-1, OUTMASK // vspltisb v9,-1
VSPLTISB $0xf, TMP // vspltisb v3, 0xf
LVSR (ENC)(R0), OUTPERM // lvsl v8,r0,r8
VPERM OUTMASK, RNDKEY0, OUTPERM, OUTMASK // vperm v9,v9,v0,v8
VXOR OUTPERM, TMP, OUTPERM // vxor v9, v3, v9
LVX (IVP)(R0), OUTHEAD // lvx v7,r0,r7
VPERM IVEC, IVEC, OUTPERM, IVEC // vperm v4,v4,v4,v8
VSEL OUTHEAD, IVEC, OUTMASK, INOUT // vsel v2,v7,v4,v9
LVX (IVP)(IDX), INPTAIL // lvx v5,r10,r7
STVX INOUT, (IVP)(R0) // stvx v2,r0,r7
VSEL IVEC, INPTAIL, OUTMASK, INOUT // vsel v2,v4,v5,v9
STVX INOUT, (IVP)(IDX) // stvx v2,r10,r7
RET // bclr 20,lt,0

View File

@ -0,0 +1,71 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package aes
import (
"crypto/cipher"
"crypto/internal/subtle"
)
// Assert that aesCipherAsm implements the cbcEncAble and cbcDecAble interfaces.
var _ cbcEncAble = (*aesCipherAsm)(nil)
var _ cbcDecAble = (*aesCipherAsm)(nil)
const cbcEncrypt = 1
const cbcDecrypt = 0
type cbc struct {
b *aesCipherAsm
enc int
iv [BlockSize]byte
}
func (b *aesCipherAsm) NewCBCEncrypter(iv []byte) cipher.BlockMode {
var c cbc
c.b = b
c.enc = cbcEncrypt
copy(c.iv[:], iv)
return &c
}
func (b *aesCipherAsm) NewCBCDecrypter(iv []byte) cipher.BlockMode {
var c cbc
c.b = b
c.enc = cbcDecrypt
copy(c.iv[:], iv)
return &c
}
func (x *cbc) BlockSize() int { return BlockSize }
// cryptBlocksChain invokes the cipher message identifying encrypt or decrypt.
//go:noescape
func cryptBlocksChain(src, dst *byte, length int, key *uint32, iv *byte, enc int)
func (x *cbc) CryptBlocks(dst, src []byte) {
if len(src)%BlockSize != 0 {
panic("crypto/cipher: input not full blocks")
}
if len(dst) < len(src) {
panic("crypto/cipher: output smaller than input")
}
if subtle.InexactOverlap(dst[:len(src)], src) {
panic("crypto/cipher: invalid buffer overlap")
}
if len(src) > 0 {
if x.enc == cbcEncrypt {
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.enc[0], &x.iv[0], x.enc)
} else {
cryptBlocksChain(&src[0], &dst[0], len(src), &x.b.dec[0], &x.iv[0], x.enc)
}
}
}
func (x *cbc) SetIV(iv []byte) {
if len(iv) != BlockSize {
panic("cipher: incorrect length IV")
}
copy(x.iv[:], iv)
}

View File

@ -52,6 +52,17 @@ func NewCBCEncrypter(b Block, iv []byte) BlockMode {
return (*cbcEncrypter)(newCBC(b, iv)) return (*cbcEncrypter)(newCBC(b, iv))
} }
// newCBCGenericEncrypter returns a BlockMode which encrypts in cipher block chaining
// mode, using the given Block. The length of iv must be the same as the
// Block's block size. This always returns the generic non-asm encrypter for use
// in fuzz testing.
func newCBCGenericEncrypter(b Block, iv []byte) BlockMode {
if len(iv) != b.BlockSize() {
panic("cipher.NewCBCEncrypter: IV length must equal block size")
}
return (*cbcEncrypter)(newCBC(b, iv))
}
func (x *cbcEncrypter) BlockSize() int { return x.blockSize } func (x *cbcEncrypter) BlockSize() int { return x.blockSize }
func (x *cbcEncrypter) CryptBlocks(dst, src []byte) { func (x *cbcEncrypter) CryptBlocks(dst, src []byte) {
@ -112,6 +123,17 @@ func NewCBCDecrypter(b Block, iv []byte) BlockMode {
return (*cbcDecrypter)(newCBC(b, iv)) return (*cbcDecrypter)(newCBC(b, iv))
} }
// newCBCGenericDecrypter returns a BlockMode which encrypts in cipher block chaining
// mode, using the given Block. The length of iv must be the same as the
// Block's block size. This always returns the generic non-asm decrypter for use in
// fuzz testing.
func newCBCGenericDecrypter(b Block, iv []byte) BlockMode {
if len(iv) != b.BlockSize() {
panic("cipher.NewCBCDecrypter: IV length must equal block size")
}
return (*cbcDecrypter)(newCBC(b, iv))
}
func (x *cbcDecrypter) BlockSize() int { return x.blockSize } func (x *cbcDecrypter) BlockSize() int { return x.blockSize }
func (x *cbcDecrypter) CryptBlocks(dst, src []byte) { func (x *cbcDecrypter) CryptBlocks(dst, src []byte) {

View File

@ -6,3 +6,5 @@ package cipher
// Export internal functions for testing. // Export internal functions for testing.
var XorBytes = xorBytes var XorBytes = xorBytes
var NewCBCGenericEncrypter = newCBCGenericEncrypter
var NewCBCGenericDecrypter = newCBCGenericDecrypter

View File

@ -0,0 +1,103 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ppc64le
package cipher_test
import (
"bytes"
"crypto/aes"
"crypto/cipher"
"crypto/rand"
"testing"
"time"
)
var cbcAESFuzzTests = []struct {
name string
key []byte
}{
{
"CBC-AES128",
commonKey128,
},
{
"CBC-AES192",
commonKey192,
},
{
"CBC-AES256",
commonKey256,
},
}
var timeout *time.Timer
const datalen = 1024
func TestFuzz(t *testing.T) {
for _, ft := range cbcAESFuzzTests {
c, _ := aes.NewCipher(ft.key)
cbcAsm := cipher.NewCBCEncrypter(c, commonIV)
cbcGeneric := cipher.NewCBCGenericEncrypter(c, commonIV)
if testing.Short() {
timeout = time.NewTimer(10 * time.Millisecond)
} else {
timeout = time.NewTimer(2 * time.Second)
}
indata := make([]byte, datalen)
outgeneric := make([]byte, datalen)
outdata := make([]byte, datalen)
fuzzencrypt:
for {
select {
case <-timeout.C:
break fuzzencrypt
default:
}
rand.Read(indata[:])
cbcGeneric.CryptBlocks(indata, outgeneric)
cbcAsm.CryptBlocks(indata, outdata)
if !bytes.Equal(outdata, outgeneric) {
t.Fatalf("AES-CBC encryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric)
}
}
cbcAsm = cipher.NewCBCDecrypter(c, commonIV)
cbcGeneric = cipher.NewCBCGenericDecrypter(c, commonIV)
if testing.Short() {
timeout = time.NewTimer(10 * time.Millisecond)
} else {
timeout = time.NewTimer(2 * time.Second)
}
fuzzdecrypt:
for {
select {
case <-timeout.C:
break fuzzdecrypt
default:
}
rand.Read(indata[:])
cbcGeneric.CryptBlocks(indata, outgeneric)
cbcAsm.CryptBlocks(indata, outdata)
if !bytes.Equal(outdata, outgeneric) {
t.Fatalf("AES-CBC decryption does not match reference result: %x and %x, please report this error to security@golang.org", outdata, outgeneric)
}
}
}
}