diff --git a/src/crypto/aes/ctr_multiblock.go b/src/crypto/aes/ctr_multiblock.go
new file mode 100644
index 00000000000..6ca53f002de
--- /dev/null
+++ b/src/crypto/aes/ctr_multiblock.go
@@ -0,0 +1,129 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build amd64 || arm64
+
+package aes
+
+import (
+	"crypto/cipher"
+	"crypto/internal/alias"
+)
+
+//go:generate sh -c "go run ./ctr_multiblock_amd64_gen.go | asmfmt > ctr_multiblock_amd64.s"
+//go:generate sh -c "go run ./ctr_multiblock_arm64_gen.go | asmfmt > ctr_multiblock_arm64.s"
+
+// defined in ctr_multiblock_*.s
+
+//go:noescape
+func rev16Asm(iv *byte)
+
+//go:noescape
+func ctrBlocks1Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)
+
+//go:noescape
+func ctrBlocks2Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)
+
+//go:noescape
+func ctrBlocks4Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)
+
+//go:noescape
+func ctrBlocks8Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64)
+
+type aesCtrWithIV struct {
+	enc    []uint32
+	rounds int
+	ivRev  [BlockSize]byte
+	offset uint64
+}
+
+// NewCTR implements crypto/cipher.ctrAble so that crypto/cipher.NewCTR
+// will use the optimised implementation in this file when possible.
+func (c *aesCipherAsm) NewCTR(iv []byte) cipher.Stream {
+	if len(iv) != BlockSize {
+		panic("bad IV length")
+	}
+
+	// Reverse IV once, because it is needed in reversed form
+	// in all subsequent ASM calls.
+	var ivRev [BlockSize]byte
+	copy(ivRev[:], iv)
+	rev16Asm(&ivRev[0])
+
+	return &aesCtrWithIV{
+		enc:    c.enc,
+		rounds: len(c.enc)/4 - 1,
+		ivRev:  ivRev,
+		offset: 0,
+	}
+}
+
+func (c *aesCtrWithIV) XORKeyStream(dst, src []byte) {
+	c.XORKeyStreamAt(dst, src, c.offset)
+	c.offset += uint64(len(src))
+}
+
+func (c *aesCtrWithIV) XORKeyStreamAt(dst, src []byte, offset uint64) {
+	if len(dst) < len(src) {
+		panic("len(dst) < len(src)")
+	}
+	dst = dst[:len(src)]
+
+	if alias.InexactOverlap(dst, src) {
+		panic("crypto/aes: invalid buffer overlap")
+	}
+
+	offsetMod16 := offset % BlockSize
+
+	if offsetMod16 != 0 {
+		// We have a partial block in the beginning.
+		plaintext := make([]byte, BlockSize)
+		copy(plaintext[offsetMod16:BlockSize], src)
+		ciphertext := make([]byte, BlockSize)
+		ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize)
+		progress := BlockSize - offsetMod16
+		if progress > uint64(len(src)) {
+			progress = uint64(len(src))
+		}
+		copy(dst[:progress], ciphertext[offsetMod16:BlockSize])
+		src = src[progress:]
+		dst = dst[progress:]
+		offset += progress
+	}
+
+	for len(src) >= 8*BlockSize {
+		ctrBlocks8Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize)
+		src = src[8*BlockSize:]
+		dst = dst[8*BlockSize:]
+		offset += 8 * BlockSize
+	}
+	// Tails of 4, 2, and 1 blocks can each occur at most once, so plain if statements (not a loop) are enough.
+ if len(src) >= 4*BlockSize { + ctrBlocks4Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize) + src = src[4*BlockSize:] + dst = dst[4*BlockSize:] + offset += 4 * BlockSize + } + if len(src) >= 2*BlockSize { + ctrBlocks2Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize) + src = src[2*BlockSize:] + dst = dst[2*BlockSize:] + offset += 2 * BlockSize + } + if len(src) >= 1*BlockSize { + ctrBlocks1Asm(c.rounds, &c.enc[0], &dst[0], &src[0], &c.ivRev[0], offset/BlockSize) + src = src[1*BlockSize:] + dst = dst[1*BlockSize:] + offset += 1 * BlockSize + } + + if len(src) != 0 { + // We have a partial block in the end. + plaintext := make([]byte, BlockSize) + copy(plaintext, src) + ciphertext := make([]byte, BlockSize) + ctrBlocks1Asm(c.rounds, &c.enc[0], &ciphertext[0], &plaintext[0], &c.ivRev[0], offset/BlockSize) + copy(dst, ciphertext) + } +} diff --git a/src/crypto/aes/ctr_multiblock_amd64.s b/src/crypto/aes/ctr_multiblock_amd64.s new file mode 100644 index 00000000000..a6bc9920484 --- /dev/null +++ b/src/crypto/aes/ctr_multiblock_amd64.s @@ -0,0 +1,640 @@ +// Code generated by ctr_multiblock_amd64_gen.go. DO NOT EDIT. + +#include "textflag.h" + +// See https://golang.org/src/crypto/aes/gcm_amd64.s +#define NR CX +#define XK AX +#define DST DX +#define SRC R10 +#define IV_PTR BX +#define BLOCK_INDEX R11 +#define IV_LOW_LE R12 +#define IV_HIGH_LE R13 +#define BSWAP X15 + +DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 + +GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 + +// func ctrBlocks1Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks1Asm(SB), NOSPLIT, $0 + MOVQ nr+0(FP), NR + MOVQ xk+8(FP), XK + MOVUPS 0(XK), X1 + MOVQ dst+16(FP), DST + MOVQ src+24(FP), SRC + MOVQ ivRev+32(FP), IV_PTR + MOVQ 0(IV_PTR), IV_LOW_LE + MOVQ 8(IV_PTR), IV_HIGH_LE + MOVQ blockIndex+40(FP), BLOCK_INDEX + + MOVOU bswapMask<>(SB), BSWAP + + ADDQ BLOCK_INDEX, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X0 + PINSRQ $1, IV_HIGH_LE, X0 + PSHUFB BSWAP, X0 + + PXOR X1, X0 + + ADDQ $16, XK + + SUBQ $12, NR + JE Lenc192 + JB Lenc128 + +Lenc256: + MOVUPS 0(XK), X1 + AESENC X1, X0 + + MOVUPS 16(XK), X1 + AESENC X1, X0 + + ADDQ $32, XK + +Lenc192: + MOVUPS 0(XK), X1 + AESENC X1, X0 + + MOVUPS 16(XK), X1 + AESENC X1, X0 + + ADDQ $32, XK + +Lenc128: + MOVUPS 0(XK), X1 + AESENC X1, X0 + + MOVUPS 16(XK), X1 + AESENC X1, X0 + + MOVUPS 32(XK), X1 + AESENC X1, X0 + + MOVUPS 48(XK), X1 + AESENC X1, X0 + + MOVUPS 64(XK), X1 + AESENC X1, X0 + + MOVUPS 80(XK), X1 + AESENC X1, X0 + + MOVUPS 96(XK), X1 + AESENC X1, X0 + + MOVUPS 112(XK), X1 + AESENC X1, X0 + + MOVUPS 128(XK), X1 + AESENC X1, X0 + + MOVUPS 144(XK), X1 + AESENCLAST X1, X0 + + MOVUPS 0(SRC), X8 + PXOR X0, X8 + MOVUPS X8, 0(DST) + + RET + +// func ctrBlocks2Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks2Asm(SB), NOSPLIT, $0 + MOVQ nr+0(FP), NR + MOVQ xk+8(FP), XK + MOVUPS 0(XK), X2 + MOVQ dst+16(FP), DST + MOVQ src+24(FP), SRC + MOVQ ivRev+32(FP), IV_PTR + MOVQ 0(IV_PTR), IV_LOW_LE + MOVQ 8(IV_PTR), IV_HIGH_LE + MOVQ blockIndex+40(FP), BLOCK_INDEX + + MOVOU bswapMask<>(SB), BSWAP + + ADDQ BLOCK_INDEX, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X0 + PINSRQ $1, IV_HIGH_LE, X0 + PSHUFB BSWAP, X0 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X1 + PINSRQ $1, IV_HIGH_LE, X1 + PSHUFB BSWAP, X1 + + PXOR X2, X0 + PXOR X2, X1 + + ADDQ $16, XK + + SUBQ $12, NR + JE Lenc192 + JB Lenc128 + +Lenc256: + 
MOVUPS 0(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 16(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + ADDQ $32, XK + +Lenc192: + MOVUPS 0(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 16(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + ADDQ $32, XK + +Lenc128: + MOVUPS 0(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 16(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 32(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 48(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 64(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 80(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 96(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 112(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 128(XK), X2 + AESENC X2, X0 + AESENC X2, X1 + + MOVUPS 144(XK), X2 + AESENCLAST X2, X0 + AESENCLAST X2, X1 + + MOVUPS 0(SRC), X8 + PXOR X0, X8 + MOVUPS X8, 0(DST) + + MOVUPS 16(SRC), X9 + PXOR X1, X9 + MOVUPS X9, 16(DST) + + RET + +// func ctrBlocks4Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks4Asm(SB), NOSPLIT, $0 + MOVQ nr+0(FP), NR + MOVQ xk+8(FP), XK + MOVUPS 0(XK), X4 + MOVQ dst+16(FP), DST + MOVQ src+24(FP), SRC + MOVQ ivRev+32(FP), IV_PTR + MOVQ 0(IV_PTR), IV_LOW_LE + MOVQ 8(IV_PTR), IV_HIGH_LE + MOVQ blockIndex+40(FP), BLOCK_INDEX + + MOVOU bswapMask<>(SB), BSWAP + + ADDQ BLOCK_INDEX, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X0 + PINSRQ $1, IV_HIGH_LE, X0 + PSHUFB BSWAP, X0 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X1 + PINSRQ $1, IV_HIGH_LE, X1 + PSHUFB BSWAP, X1 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X2 + PINSRQ $1, IV_HIGH_LE, X2 + PSHUFB BSWAP, X2 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X3 + PINSRQ $1, IV_HIGH_LE, X3 + PSHUFB BSWAP, X3 + + PXOR X4, X0 + PXOR X4, X1 + PXOR X4, X2 + PXOR X4, X3 + + ADDQ $16, XK + + SUBQ $12, NR + JE Lenc192 + JB Lenc128 + +Lenc256: + MOVUPS 0(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 16(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + ADDQ $32, XK + +Lenc192: + MOVUPS 0(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 16(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + ADDQ $32, XK + +Lenc128: + MOVUPS 0(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 16(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 32(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 48(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 64(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 80(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 96(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 112(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 128(XK), X4 + AESENC X4, X0 + AESENC X4, X1 + AESENC X4, X2 + AESENC X4, X3 + + MOVUPS 144(XK), X4 + AESENCLAST X4, X0 + AESENCLAST X4, X1 + AESENCLAST X4, X2 + AESENCLAST X4, X3 + + MOVUPS 0(SRC), X8 + PXOR X0, X8 + MOVUPS X8, 0(DST) + + MOVUPS 16(SRC), X9 + PXOR X1, X9 + MOVUPS X9, 16(DST) + + MOVUPS 32(SRC), X10 + PXOR X2, X10 + MOVUPS X10, 32(DST) + + MOVUPS 48(SRC), X11 + PXOR X3, X11 + MOVUPS X11, 48(DST) + + RET + +// func ctrBlocks8Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks8Asm(SB), NOSPLIT, 
$0 + MOVQ nr+0(FP), NR + MOVQ xk+8(FP), XK + MOVUPS 0(XK), X8 + MOVQ dst+16(FP), DST + MOVQ src+24(FP), SRC + MOVQ ivRev+32(FP), IV_PTR + MOVQ 0(IV_PTR), IV_LOW_LE + MOVQ 8(IV_PTR), IV_HIGH_LE + MOVQ blockIndex+40(FP), BLOCK_INDEX + + MOVOU bswapMask<>(SB), BSWAP + + ADDQ BLOCK_INDEX, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X0 + PINSRQ $1, IV_HIGH_LE, X0 + PSHUFB BSWAP, X0 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X1 + PINSRQ $1, IV_HIGH_LE, X1 + PSHUFB BSWAP, X1 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X2 + PINSRQ $1, IV_HIGH_LE, X2 + PSHUFB BSWAP, X2 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X3 + PINSRQ $1, IV_HIGH_LE, X3 + PSHUFB BSWAP, X3 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X4 + PINSRQ $1, IV_HIGH_LE, X4 + PSHUFB BSWAP, X4 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X5 + PINSRQ $1, IV_HIGH_LE, X5 + PSHUFB BSWAP, X5 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X6 + PINSRQ $1, IV_HIGH_LE, X6 + PSHUFB BSWAP, X6 + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + MOVQ IV_LOW_LE, X7 + PINSRQ $1, IV_HIGH_LE, X7 + PSHUFB BSWAP, X7 + + PXOR X8, X0 + PXOR X8, X1 + PXOR X8, X2 + PXOR X8, X3 + PXOR X8, X4 + PXOR X8, X5 + PXOR X8, X6 + PXOR X8, X7 + + ADDQ $16, XK + + SUBQ $12, NR + JE Lenc192 + JB Lenc128 + +Lenc256: + MOVUPS 0(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 16(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + ADDQ $32, XK + +Lenc192: + MOVUPS 0(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 16(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + ADDQ $32, XK + +Lenc128: + MOVUPS 0(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 16(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 32(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 48(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 64(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 80(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 96(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 112(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 128(XK), X8 + AESENC X8, X0 + AESENC X8, X1 + AESENC X8, X2 + AESENC X8, X3 + AESENC X8, X4 + AESENC X8, X5 + AESENC X8, X6 + AESENC X8, X7 + + MOVUPS 144(XK), X8 + AESENCLAST X8, X0 + AESENCLAST X8, X1 + AESENCLAST X8, X2 + AESENCLAST X8, X3 + AESENCLAST X8, X4 + AESENCLAST X8, X5 + AESENCLAST X8, X6 + AESENCLAST X8, X7 + + 
MOVUPS 0(SRC), X8 + PXOR X0, X8 + MOVUPS X8, 0(DST) + + MOVUPS 16(SRC), X9 + PXOR X1, X9 + MOVUPS X9, 16(DST) + + MOVUPS 32(SRC), X10 + PXOR X2, X10 + MOVUPS X10, 32(DST) + + MOVUPS 48(SRC), X11 + PXOR X3, X11 + MOVUPS X11, 48(DST) + + MOVUPS 64(SRC), X12 + PXOR X4, X12 + MOVUPS X12, 64(DST) + + MOVUPS 80(SRC), X13 + PXOR X5, X13 + MOVUPS X13, 80(DST) + + MOVUPS 96(SRC), X14 + PXOR X6, X14 + MOVUPS X14, 96(DST) + + MOVUPS 112(SRC), X15 + PXOR X7, X15 + MOVUPS X15, 112(DST) + + RET + +// func rev16Asm(iv *byte) +TEXT ·rev16Asm(SB), NOSPLIT, $0 + MOVQ iv+0(FP), IV_PTR + MOVUPS 0(IV_PTR), X0 + MOVOU bswapMask<>(SB), BSWAP + PSHUFB BSWAP, X0 + MOVUPS X0, 0(IV_PTR) + + RET diff --git a/src/crypto/aes/ctr_multiblock_amd64_gen.go b/src/crypto/aes/ctr_multiblock_amd64_gen.go new file mode 100644 index 00000000000..019fa12cef5 --- /dev/null +++ b/src/crypto/aes/ctr_multiblock_amd64_gen.go @@ -0,0 +1,180 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore + +// Generate Go assembly for XORing CTR output to n blocks at once with one key. + +package main + +import ( + "fmt" + "os" + "text/template" +) + +var tmplAmd64Str = ` +// Code generated by ctr_multiblock_amd64_gen.go. DO NOT EDIT. + +#include "textflag.h" + +// See https://golang.org/src/crypto/aes/gcm_amd64.s +#define NR CX +#define XK AX +#define DST DX +#define SRC R10 +#define IV_PTR BX +#define BLOCK_INDEX R11 +#define IV_LOW_LE R12 +#define IV_HIGH_LE R13 +#define BSWAP X15 + +DATA bswapMask<>+0x00(SB)/8, $0x08090a0b0c0d0e0f +DATA bswapMask<>+0x08(SB)/8, $0x0001020304050607 + +GLOBL bswapMask<>(SB), (NOPTR+RODATA), $16 + +{{define "enc"}} + MOVUPS {{ mul $.AX 16 }}(XK), {{ $.RegKey }} + {{- range $i := xrange .N }} + {{ $.Instruction }} {{ $.RegKey }}, X{{ $i }} + {{- end }} +{{ end }} + +{{ range $N := $.Sizes }} +{{ $RegKey := printf "X%d" $N }} +// func ctrBlocks{{ $N }}Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks{{ $N }}Asm(SB),NOSPLIT,$0 + MOVQ nr+0(FP), NR + MOVQ xk+8(FP), XK + MOVUPS 0(XK), {{ $RegKey }} + MOVQ dst+16(FP), DST + MOVQ src+24(FP), SRC + MOVQ ivRev+32(FP), IV_PTR + MOVQ 0(IV_PTR), IV_LOW_LE + MOVQ 8(IV_PTR), IV_HIGH_LE + MOVQ blockIndex+40(FP), BLOCK_INDEX + + MOVOU bswapMask<>(SB), BSWAP + + {{/* Prepare plain from IV and blockIndex. */}} + + {{/* Add blockIndex. */}} + ADDQ BLOCK_INDEX, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + + {{/* Copy to plaintext registers. */}} + {{ range $i := xrange $N }} + {{/* https://stackoverflow.com/a/2231893 */}} + MOVQ IV_LOW_LE, X{{ $i }} + PINSRQ $1, IV_HIGH_LE, X{{ $i }} + PSHUFB BSWAP, X{{ $i }} + {{- if ne (add $i 1) $N }} + ADDQ $1, IV_LOW_LE + ADCQ $0, IV_HIGH_LE + {{ end }} + {{ end }} + + {{/* Initial key add. */}} + {{ range $i := xrange $N -}} + PXOR {{ $RegKey }}, X{{ $i }} + {{ end }} + ADDQ $16, XK + + {{/* Num rounds branching. */}} + SUBQ $12, NR + JE Lenc192 + JB Lenc128 + + {{/* 2 extra rounds for 256-bit keys. */}} + Lenc256: + {{- template "enc" (enc_args "AESENC" 0 $N) }} + {{ template "enc" (enc_args "AESENC" 1 $N) }} + ADDQ $32, XK + + {{/* 2 extra rounds for 192-bit keys. */}} + Lenc192: + {{- template "enc" (enc_args "AESENC" 0 $N) }} + {{ template "enc" (enc_args "AESENC" 1 $N) }} + ADDQ $32, XK + + {{/* 10 rounds for 128-bit (with special handling for final). 
*/}} + Lenc128: + {{- range $ax := xrange 9 }} + {{- template "enc" (enc_args "AESENC" $ax $N) }} + {{ end }} + {{ template "enc" (enc_args "AESENCLAST" 9 $N) }} + + {{/* + XOR results to destination. Use X8-X15 for that. + It overwrites BSWAP in the end, but it is not needed. + */}} + {{ range $i := xrange $N }} + {{ $offset := mul $i 16 }} + {{ $xor_reg := add $i 8 }} + MOVUPS {{ $offset }}(SRC), X{{ $xor_reg }} + PXOR X{{ $i }}, X{{ $xor_reg }} + MOVUPS X{{ $xor_reg }}, {{ $offset }}(DST) + {{ end }} + + RET +{{ end }} + +// func rev16Asm(iv *byte) +TEXT ·rev16Asm(SB),NOSPLIT,$0 + MOVQ iv+0(FP), IV_PTR + MOVUPS 0(IV_PTR), X0 + MOVOU bswapMask<>(SB), BSWAP + PSHUFB BSWAP, X0 + MOVUPS X0, 0(IV_PTR) + + RET +` + +func main() { + type Params struct { + Sizes []int + } + + params := Params{ + Sizes: []int{1, 2, 4, 8}, + } + + type EncArgs struct { + AX int + N int + RegKey string + Instruction string + } + + funcs := template.FuncMap{ + "add": func(a, b int) int { + return a + b + }, + "mul": func(a, b int) int { + return a * b + }, + "xrange": func(n int) []int { + result := make([]int, n) + for i := 0; i < n; i++ { + result[i] = i + } + return result + }, + "enc_args": func(inst string, ax, n int) EncArgs { + return EncArgs{ + AX: ax, + N: n, + RegKey: fmt.Sprintf("X%d", n), + Instruction: inst, + } + }, + } + + var tmpl = template.Must(template.New("ctr_multiblock_amd64").Funcs(funcs).Parse(tmplAmd64Str)) + + if err := tmpl.Execute(os.Stdout, params); err != nil { + panic(err) + } +} diff --git a/src/crypto/aes/ctr_multiblock_arm64.s b/src/crypto/aes/ctr_multiblock_arm64.s new file mode 100644 index 00000000000..45ccb5c6e27 --- /dev/null +++ b/src/crypto/aes/ctr_multiblock_arm64.s @@ -0,0 +1,758 @@ +// Code generated by ctr_multiblock_arm64_gen.go. DO NOT EDIT. + +#include "textflag.h" + +// See https://golang.org/src/crypto/aes/gcm_arm64.s +#define NR R9 +#define XK R10 +#define DST R11 +#define SRC R12 +// R13 is reserved. See https://www.keil.com/support/man/docs/armasm/armasm_dom1361289861367.htm +#define IV_PTR R14 +// R15 is reserved. See https://www.keil.com/support/man/docs/armasm/armasm_dom1361289861367.htm +#define IV_LOW_LE R16 +#define IV_HIGH_LE R17 +// R18 is reserved. +#define IV_LOW_BE R19 +#define IV_HIGH_BE R20 +#define BLOCK_INDEX R21 + +// V0.B16 - V7.B16 are for blocks (<=8). See BLOCK_OFFSET. +// V8.B16 - V22.B16 are for <=15 round keys (<=15). See ROUND_KEY_OFFSET. +// V23.B16 - V30.B16 are for destinations (<=8). See DST_OFFSET. 
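+//
+// The IV is passed to these functions already byte-reversed (see rev16Asm), so
+// blockIndex can be added to it with ADDS/ADC as a pair of little-endian 64-bit
+// words; each per-block counter is then REVed back to big-endian and moved into
+// its block register before encryption.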
+ +// func ctrBlocks1Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks1Asm(SB), NOSPLIT, $0 + MOVD nr+0(FP), NR + MOVD xk+8(FP), XK + MOVD dst+16(FP), DST + MOVD src+24(FP), SRC + MOVD ivRev+32(FP), IV_PTR + LDP (IV_PTR), (IV_LOW_LE, IV_HIGH_LE) + MOVD blockIndex+40(FP), BLOCK_INDEX + + ADDS BLOCK_INDEX, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V0.D[1] + VMOV IV_HIGH_BE, V0.D[0] + + CMP $12, NR + BLT Lenc128 + BEQ Lenc192 + +Lenc256: + VLD1.P 32(XK), [V8.B16, V9.B16] + + AESE V8.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V9.B16, V0.B16 + AESMC V0.B16, V0.B16 + +Lenc192: + VLD1.P 32(XK), [V10.B16, V11.B16] + + AESE V10.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V11.B16, V0.B16 + AESMC V0.B16, V0.B16 + +Lenc128: + VLD1.P 64(XK), [V12.B16, V13.B16, V14.B16, V15.B16] + VLD1.P 64(XK), [V16.B16, V17.B16, V18.B16, V19.B16] + VLD1.P 48(XK), [V20.B16, V21.B16, V22.B16] + + AESE V12.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V13.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V14.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V15.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V16.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V17.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V18.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V19.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V20.B16, V0.B16 + AESMC V0.B16, V0.B16 + + AESE V21.B16, V0.B16 + + VEOR V0.B16, V22.B16, V0.B16 + + VLD1.P 16(SRC), [V23.B16] + VEOR V23.B16, V0.B16, V23.B16 + VST1.P [V23.B16], 16(DST) + + RET + +// func ctrBlocks2Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks2Asm(SB), NOSPLIT, $0 + MOVD nr+0(FP), NR + MOVD xk+8(FP), XK + MOVD dst+16(FP), DST + MOVD src+24(FP), SRC + MOVD ivRev+32(FP), IV_PTR + LDP (IV_PTR), (IV_LOW_LE, IV_HIGH_LE) + MOVD blockIndex+40(FP), BLOCK_INDEX + + ADDS BLOCK_INDEX, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V0.D[1] + VMOV IV_HIGH_BE, V0.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V1.D[1] + VMOV IV_HIGH_BE, V1.D[0] + + CMP $12, NR + BLT Lenc128 + BEQ Lenc192 + +Lenc256: + VLD1.P 32(XK), [V8.B16, V9.B16] + + AESE V8.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V8.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V9.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V9.B16, V1.B16 + AESMC V1.B16, V1.B16 + +Lenc192: + VLD1.P 32(XK), [V10.B16, V11.B16] + + AESE V10.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V10.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V11.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V11.B16, V1.B16 + AESMC V1.B16, V1.B16 + +Lenc128: + VLD1.P 64(XK), [V12.B16, V13.B16, V14.B16, V15.B16] + VLD1.P 64(XK), [V16.B16, V17.B16, V18.B16, V19.B16] + VLD1.P 48(XK), [V20.B16, V21.B16, V22.B16] + + AESE V12.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V12.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V13.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V13.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V14.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V14.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V15.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V15.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V16.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V16.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V17.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V17.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V18.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V18.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V19.B16, V0.B16 
+ AESMC V0.B16, V0.B16 + AESE V19.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V20.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V20.B16, V1.B16 + AESMC V1.B16, V1.B16 + + AESE V21.B16, V0.B16 + AESE V21.B16, V1.B16 + + VEOR V0.B16, V22.B16, V0.B16 + VEOR V1.B16, V22.B16, V1.B16 + + VLD1.P 32(SRC), [V23.B16, V24.B16] + VEOR V23.B16, V0.B16, V23.B16 + VEOR V24.B16, V1.B16, V24.B16 + VST1.P [V23.B16, V24.B16], 32(DST) + + RET + +// func ctrBlocks4Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks4Asm(SB), NOSPLIT, $0 + MOVD nr+0(FP), NR + MOVD xk+8(FP), XK + MOVD dst+16(FP), DST + MOVD src+24(FP), SRC + MOVD ivRev+32(FP), IV_PTR + LDP (IV_PTR), (IV_LOW_LE, IV_HIGH_LE) + MOVD blockIndex+40(FP), BLOCK_INDEX + + ADDS BLOCK_INDEX, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V0.D[1] + VMOV IV_HIGH_BE, V0.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V1.D[1] + VMOV IV_HIGH_BE, V1.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V2.D[1] + VMOV IV_HIGH_BE, V2.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V3.D[1] + VMOV IV_HIGH_BE, V3.D[0] + + CMP $12, NR + BLT Lenc128 + BEQ Lenc192 + +Lenc256: + VLD1.P 32(XK), [V8.B16, V9.B16] + + AESE V8.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V8.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V8.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V8.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V9.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V9.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V9.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V9.B16, V3.B16 + AESMC V3.B16, V3.B16 + +Lenc192: + VLD1.P 32(XK), [V10.B16, V11.B16] + + AESE V10.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V10.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V10.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V10.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V11.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V11.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V11.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V11.B16, V3.B16 + AESMC V3.B16, V3.B16 + +Lenc128: + VLD1.P 64(XK), [V12.B16, V13.B16, V14.B16, V15.B16] + VLD1.P 64(XK), [V16.B16, V17.B16, V18.B16, V19.B16] + VLD1.P 48(XK), [V20.B16, V21.B16, V22.B16] + + AESE V12.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V12.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V12.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V12.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V13.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V13.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V13.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V13.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V14.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V14.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V14.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V14.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V15.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V15.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V15.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V15.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V16.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V16.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V16.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V16.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V17.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V17.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V17.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V17.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V18.B16, V0.B16 + AESMC 
V0.B16, V0.B16 + AESE V18.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V18.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V18.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V19.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V19.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V19.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V19.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V20.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V20.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V20.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V20.B16, V3.B16 + AESMC V3.B16, V3.B16 + + AESE V21.B16, V0.B16 + AESE V21.B16, V1.B16 + AESE V21.B16, V2.B16 + AESE V21.B16, V3.B16 + + VEOR V0.B16, V22.B16, V0.B16 + VEOR V1.B16, V22.B16, V1.B16 + VEOR V2.B16, V22.B16, V2.B16 + VEOR V3.B16, V22.B16, V3.B16 + + VLD1.P 64(SRC), [V23.B16, V24.B16, V25.B16, V26.B16] + VEOR V23.B16, V0.B16, V23.B16 + VEOR V24.B16, V1.B16, V24.B16 + VEOR V25.B16, V2.B16, V25.B16 + VEOR V26.B16, V3.B16, V26.B16 + VST1.P [V23.B16, V24.B16, V25.B16, V26.B16], 64(DST) + + RET + +// func ctrBlocks8Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks8Asm(SB), NOSPLIT, $0 + MOVD nr+0(FP), NR + MOVD xk+8(FP), XK + MOVD dst+16(FP), DST + MOVD src+24(FP), SRC + MOVD ivRev+32(FP), IV_PTR + LDP (IV_PTR), (IV_LOW_LE, IV_HIGH_LE) + MOVD blockIndex+40(FP), BLOCK_INDEX + + ADDS BLOCK_INDEX, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V0.D[1] + VMOV IV_HIGH_BE, V0.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V1.D[1] + VMOV IV_HIGH_BE, V1.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V2.D[1] + VMOV IV_HIGH_BE, V2.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V3.D[1] + VMOV IV_HIGH_BE, V3.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V4.D[1] + VMOV IV_HIGH_BE, V4.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V5.D[1] + VMOV IV_HIGH_BE, V5.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V6.D[1] + VMOV IV_HIGH_BE, V6.D[0] + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + VMOV IV_LOW_BE, V7.D[1] + VMOV IV_HIGH_BE, V7.D[0] + + CMP $12, NR + BLT Lenc128 + BEQ Lenc192 + +Lenc256: + VLD1.P 32(XK), [V8.B16, V9.B16] + + AESE V8.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V8.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V8.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V8.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V8.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V8.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V8.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V8.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V9.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V9.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V9.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V9.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V9.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V9.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V9.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V9.B16, V7.B16 + AESMC V7.B16, V7.B16 + +Lenc192: + VLD1.P 32(XK), [V10.B16, V11.B16] + + AESE V10.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V10.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V10.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE 
V10.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V10.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V10.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V10.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V10.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V11.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V11.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V11.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V11.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V11.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V11.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V11.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V11.B16, V7.B16 + AESMC V7.B16, V7.B16 + +Lenc128: + VLD1.P 64(XK), [V12.B16, V13.B16, V14.B16, V15.B16] + VLD1.P 64(XK), [V16.B16, V17.B16, V18.B16, V19.B16] + VLD1.P 48(XK), [V20.B16, V21.B16, V22.B16] + + AESE V12.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V12.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V12.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V12.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V12.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V12.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V12.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V12.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V13.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V13.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V13.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V13.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V13.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V13.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V13.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V13.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V14.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V14.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V14.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V14.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V14.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V14.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V14.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V14.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V15.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V15.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V15.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V15.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V15.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V15.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V15.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V15.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V16.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V16.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V16.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V16.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V16.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V16.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V16.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V16.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V17.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V17.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V17.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V17.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V17.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V17.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V17.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V17.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V18.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V18.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V18.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V18.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V18.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V18.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V18.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V18.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V19.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V19.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V19.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V19.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V19.B16, 
V4.B16 + AESMC V4.B16, V4.B16 + AESE V19.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V19.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V19.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V20.B16, V0.B16 + AESMC V0.B16, V0.B16 + AESE V20.B16, V1.B16 + AESMC V1.B16, V1.B16 + AESE V20.B16, V2.B16 + AESMC V2.B16, V2.B16 + AESE V20.B16, V3.B16 + AESMC V3.B16, V3.B16 + AESE V20.B16, V4.B16 + AESMC V4.B16, V4.B16 + AESE V20.B16, V5.B16 + AESMC V5.B16, V5.B16 + AESE V20.B16, V6.B16 + AESMC V6.B16, V6.B16 + AESE V20.B16, V7.B16 + AESMC V7.B16, V7.B16 + + AESE V21.B16, V0.B16 + AESE V21.B16, V1.B16 + AESE V21.B16, V2.B16 + AESE V21.B16, V3.B16 + AESE V21.B16, V4.B16 + AESE V21.B16, V5.B16 + AESE V21.B16, V6.B16 + AESE V21.B16, V7.B16 + + VEOR V0.B16, V22.B16, V0.B16 + VEOR V1.B16, V22.B16, V1.B16 + VEOR V2.B16, V22.B16, V2.B16 + VEOR V3.B16, V22.B16, V3.B16 + VEOR V4.B16, V22.B16, V4.B16 + VEOR V5.B16, V22.B16, V5.B16 + VEOR V6.B16, V22.B16, V6.B16 + VEOR V7.B16, V22.B16, V7.B16 + + VLD1.P 64(SRC), [V23.B16, V24.B16, V25.B16, V26.B16] + VLD1.P 64(SRC), [V27.B16, V28.B16, V29.B16, V30.B16] + VEOR V23.B16, V0.B16, V23.B16 + VEOR V24.B16, V1.B16, V24.B16 + VEOR V25.B16, V2.B16, V25.B16 + VEOR V26.B16, V3.B16, V26.B16 + VEOR V27.B16, V4.B16, V27.B16 + VEOR V28.B16, V5.B16, V28.B16 + VEOR V29.B16, V6.B16, V29.B16 + VEOR V30.B16, V7.B16, V30.B16 + VST1.P [V23.B16, V24.B16, V25.B16, V26.B16], 64(DST) + VST1.P [V27.B16, V28.B16, V29.B16, V30.B16], 64(DST) + + RET + +// func rev16Asm(iv *byte) +TEXT ·rev16Asm(SB), NOSPLIT, $0 + MOVD iv+0(FP), IV_PTR + LDP (IV_PTR), (IV_HIGH_BE, IV_LOW_BE) + REV IV_LOW_BE, IV_LOW_LE + REV IV_HIGH_BE, IV_HIGH_LE + STP (IV_LOW_LE, IV_HIGH_LE), (IV_PTR) + + RET diff --git a/src/crypto/aes/ctr_multiblock_arm64_gen.go b/src/crypto/aes/ctr_multiblock_arm64_gen.go new file mode 100644 index 00000000000..303506a3dc9 --- /dev/null +++ b/src/crypto/aes/ctr_multiblock_arm64_gen.go @@ -0,0 +1,232 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build ignore + +// Generate Go assembly for XORing CTR output to n blocks at once with one key. +package main + +import ( + "fmt" + "os" + "strings" + "text/template" +) + +// First registers in their groups. +const ( + blockOffset = 0 + roundKeyOffset = 8 + dstOffset = 23 +) + +var tmplArm64Str = ` +// Code generated by ctr_multiblock_arm64_gen.go. DO NOT EDIT. + +#include "textflag.h" + +// See https://golang.org/src/crypto/aes/gcm_arm64.s +#define NR R9 +#define XK R10 +#define DST R11 +#define SRC R12 +// R13 is reserved. See https://www.keil.com/support/man/docs/armasm/armasm_dom1361289861367.htm +#define IV_PTR R14 +// R15 is reserved. See https://www.keil.com/support/man/docs/armasm/armasm_dom1361289861367.htm +#define IV_LOW_LE R16 +#define IV_HIGH_LE R17 +// R18 is reserved. +#define IV_LOW_BE R19 +#define IV_HIGH_BE R20 +#define BLOCK_INDEX R21 + +// V0.B16 - V7.B16 are for blocks (<=8). See BLOCK_OFFSET. +// V8.B16 - V22.B16 are for <=15 round keys (<=15). See ROUND_KEY_OFFSET. +// V23.B16 - V30.B16 are for destinations (<=8). See DST_OFFSET. 
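+//
+// The IV is passed to these functions already byte-reversed (see rev16Asm), so
+// blockIndex can be added to it with ADDS/ADC as a pair of little-endian 64-bit
+// words; each per-block counter is then REVed back to big-endian and moved into
+// its block register before encryption.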
+ +{{define "load_keys"}} + {{- range regs_batches (round_key_reg $.FirstKey) $.NKeys }} + VLD1.P {{ .Size }}(XK), [{{ .Regs }}] + {{- end }} +{{ end }} + +{{define "enc"}} + {{ range $i := xrange $.N -}} + AESE V{{ round_key_reg $.Key}}.B16, V{{ block_reg $i }}.B16 + {{- if $.WithMc }} + AESMC V{{ block_reg $i }}.B16, V{{ block_reg $i }}.B16 + {{- end }} + {{ end }} +{{ end }} + +{{ range $N := $.Sizes }} +// func ctrBlocks{{ $N }}Asm(nr int, xk *uint32, dst, src, ivRev *byte, blockIndex uint64) +TEXT ·ctrBlocks{{ $N }}Asm(SB),NOSPLIT,$0 + MOVD nr+0(FP), NR + MOVD xk+8(FP), XK + MOVD dst+16(FP), DST + MOVD src+24(FP), SRC + MOVD ivRev+32(FP), IV_PTR + LDP (IV_PTR), (IV_LOW_LE, IV_HIGH_LE) + MOVD blockIndex+40(FP), BLOCK_INDEX + + {{/* Prepare plain from IV and blockIndex. */}} + + {{/* Add blockIndex. */}} + ADDS BLOCK_INDEX, IV_LOW_LE + ADC $0, IV_HIGH_LE + + {{/* Copy to plaintext registers. */}} + {{ range $i := xrange $N }} + REV IV_LOW_LE, IV_LOW_BE + REV IV_HIGH_LE, IV_HIGH_BE + {{- /* https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general- */}} + VMOV IV_LOW_BE, V{{ block_reg $i }}.D[1] + VMOV IV_HIGH_BE, V{{ block_reg $i }}.D[0] + {{- if ne (add $i 1) $N }} + ADDS $1, IV_LOW_LE + ADC $0, IV_HIGH_LE + {{ end }} + {{ end }} + + {{/* Num rounds branching. */}} + CMP $12, NR + BLT Lenc128 + BEQ Lenc192 + + {{/* 2 extra rounds for 256-bit keys. */}} + Lenc256: + {{- template "load_keys" (load_keys_args 0 2) }} + {{- template "enc" (enc_args 0 $N true) }} + {{- template "enc" (enc_args 1 $N true) }} + + {{/* 2 extra rounds for 192-bit keys. */}} + Lenc192: + {{- template "load_keys" (load_keys_args 2 2) }} + {{- template "enc" (enc_args 2 $N true) }} + {{- template "enc" (enc_args 3 $N true) }} + + {{/* 10 rounds for 128-bit (with special handling for final). */}} + Lenc128: + {{- template "load_keys" (load_keys_args 4 11) }} + {{- range $r := xrange 9 }} + {{- template "enc" (enc_args (add $r 4) $N true) }} + {{ end }} + {{ template "enc" (enc_args 13 $N false) }} + + {{/* We need to XOR blocks with the last round key (key 14, register V22). */}} + {{ range $i := xrange $N }} + VEOR V{{ block_reg $i }}.B16, V{{ round_key_reg 14 }}.B16, V{{ block_reg $i }}.B16 + {{- end }} + + {{/* XOR results to destination. */}} + {{- range regs_batches $.DstOffset $N }} + VLD1.P {{ .Size }}(SRC), [{{ .Regs }}] + {{- end }} + {{- range $i := xrange $N }} + VEOR V{{ add $.DstOffset $i }}.B16, V{{ block_reg $i }}.B16, V{{ add $.DstOffset $i }}.B16 + {{- end }} + {{- range regs_batches $.DstOffset $N }} + VST1.P [{{ .Regs }}], {{ .Size }}(DST) + {{- end }} + + RET +{{ end }} + +// func rev16Asm(iv *byte) +TEXT ·rev16Asm(SB),NOSPLIT,$0 + MOVD iv+0(FP), IV_PTR + LDP (IV_PTR), (IV_HIGH_BE, IV_LOW_BE) + REV IV_LOW_BE, IV_LOW_LE + REV IV_HIGH_BE, IV_HIGH_LE + STP (IV_LOW_LE, IV_HIGH_LE), (IV_PTR) + + RET +` + +func main() { + type Params struct { + DstOffset int + Sizes []int + } + + params := Params{ + DstOffset: dstOffset, + Sizes: []int{1, 2, 4, 8}, + } + + type RegsBatch struct { + Size int + Regs string // Comma-separated list of registers. 
+ } + + type LoadKeysArgs struct { + FirstKey int + NKeys int + } + + type EncArgs struct { + Key int + N int + WithMc bool + } + + funcs := template.FuncMap{ + "add": func(a, b int) int { + return a + b + }, + "xrange": func(n int) []int { + result := make([]int, n) + for i := 0; i < n; i++ { + result[i] = i + } + return result + }, + "block_reg": func(block int) int { + return blockOffset + block + }, + "round_key_reg": func(key int) int { + return roundKeyOffset + key + }, + "regs_batches": func(firstReg, nregs int) []RegsBatch { + result := make([]RegsBatch, 0) + for nregs != 0 { + batch := 4 + if nregs < batch { + batch = nregs + } + regsList := make([]string, 0, batch) + for j := firstReg; j < firstReg+batch; j++ { + regsList = append(regsList, fmt.Sprintf("V%d.B16", j)) + } + result = append(result, RegsBatch{ + Size: 16 * batch, + Regs: strings.Join(regsList, ", "), + }) + nregs -= batch + firstReg += batch + } + return result + }, + "enc_args": func(key, n int, withMc bool) EncArgs { + return EncArgs{ + Key: key, + N: n, + WithMc: withMc, + } + }, + "load_keys_args": func(firstKey, nkeys int) LoadKeysArgs { + return LoadKeysArgs{ + FirstKey: firstKey, + NKeys: nkeys, + } + }, + } + + var tmpl = template.Must(template.New("ctr_multiblock_arm64").Funcs(funcs).Parse(tmplArm64Str)) + + if err := tmpl.Execute(os.Stdout, params); err != nil { + panic(err) + } +} diff --git a/src/crypto/cipher/ctr_aes_test.go b/src/crypto/cipher/ctr_aes_test.go index d019ae0d022..89318f14415 100644 --- a/src/crypto/cipher/ctr_aes_test.go +++ b/src/crypto/cipher/ctr_aes_test.go @@ -14,6 +14,12 @@ import ( "bytes" "crypto/aes" "crypto/cipher" + "crypto/internal/boring" + "encoding/hex" + "fmt" + "math/rand" + "sort" + "strings" "testing" ) @@ -100,3 +106,228 @@ func TestCTR_AES(t *testing.T) { } } } + +// This wrapper type disables method NewCTR (interface ctrAble) +// to force generic implementation. +type nonCtrAble struct { + impl cipher.Block +} + +func (n *nonCtrAble) BlockSize() int { + return n.impl.BlockSize() +} + +func (n *nonCtrAble) Encrypt(dst, src []byte) { + n.impl.Encrypt(dst, src) +} + +func (n *nonCtrAble) Decrypt(dst, src []byte) { + panic("must not be called") +} + +func makeTestingCiphers(aesBlock cipher.Block, iv []byte) (genericCtr, multiblockCtr cipher.Stream) { + return cipher.NewCTR(&nonCtrAble{impl: aesBlock}, iv), cipher.NewCTR(aesBlock, iv) +} + +func randBytes(t *testing.T, r *rand.Rand, count int) []byte { + t.Helper() + buf := make([]byte, count) + n, err := r.Read(buf) + if err != nil { + t.Fatal(err) + } + if n != count { + t.Fatal("short read from Rand") + } + return buf +} + +const aesBlockSize = 16 + +type ctrAble interface { + NewCTR(iv []byte) cipher.Stream +} + +// Verify that multiblock AES CTR (src/crypto/aes/ctr_multiblock_*.s) +// produces the same results as generic single-block implementation. +// This test runs checks on random IV. 
+func TestCTR_AES_multiblock_random_IV(t *testing.T) { + r := rand.New(rand.NewSource(54321)) + iv := randBytes(t, r, aesBlockSize) + const Size = 100 + + for _, keySize := range []int{16, 24, 32} { + keySize := keySize + t.Run(fmt.Sprintf("keySize=%d", keySize), func(t *testing.T) { + key := randBytes(t, r, keySize) + aesBlock, err := aes.NewCipher(key) + if err != nil { + t.Fatal(err) + } + if _, ok := aesBlock.(ctrAble); !ok { + t.Skip("Skipping the test - multiblock implementation is not available") + } + genericCtr, _ := makeTestingCiphers(aesBlock, iv) + + plaintext := randBytes(t, r, Size) + + // Generate reference ciphertext. + genericCiphertext := make([]byte, len(plaintext)) + genericCtr.XORKeyStream(genericCiphertext, plaintext) + + // Split the text in 3 parts in all possible ways and encrypt them + // individually using multiblock implementation to catch edge cases. + + for part1 := 0; part1 <= Size; part1++ { + part1 := part1 + t.Run(fmt.Sprintf("part1=%d", part1), func(t *testing.T) { + for part2 := 0; part2 <= Size-part1; part2++ { + part2 := part2 + t.Run(fmt.Sprintf("part2=%d", part2), func(t *testing.T) { + _, multiblockCtr := makeTestingCiphers(aesBlock, iv) + multiblockCiphertext := make([]byte, len(plaintext)) + multiblockCtr.XORKeyStream(multiblockCiphertext[:part1], plaintext[:part1]) + multiblockCtr.XORKeyStream(multiblockCiphertext[part1:part1+part2], plaintext[part1:part1+part2]) + multiblockCtr.XORKeyStream(multiblockCiphertext[part1+part2:], plaintext[part1+part2:]) + if !bytes.Equal(genericCiphertext, multiblockCiphertext) { + t.Fatal("multiblock CTR's output does not match generic CTR's output") + } + }) + } + }) + } + }) + } +} + +func parseHex(str string) []byte { + b, err := hex.DecodeString(strings.ReplaceAll(str, " ", "")) + if err != nil { + panic(err) + } + return b +} + +// Verify that multiblock AES CTR (src/crypto/aes/ctr_multiblock_*.s) +// produces the same results as generic single-block implementation. +// This test runs checks on edge cases (IV overflows). +func TestCTR_AES_multiblock_overflow_IV(t *testing.T) { + r := rand.New(rand.NewSource(987654)) + + const Size = 4096 + plaintext := randBytes(t, r, Size) + + ivs := [][]byte{ + parseHex("00 00 00 00 00 00 00 00 FF FF FF FF FF FF FF FF"), + parseHex("FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF"), + parseHex("FF FF FF FF FF FF FF FF 00 00 00 00 00 00 00 00"), + parseHex("FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF fe"), + parseHex("00 00 00 00 00 00 00 00 FF FF FF FF FF FF FF fe"), + parseHex("FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 00"), + parseHex("00 00 00 00 00 00 00 01 FF FF FF FF FF FF FF 00"), + parseHex("00 00 00 00 00 00 00 01 FF FF FF FF FF FF FF FF"), + parseHex("00 00 00 00 00 00 00 01 FF FF FF FF FF FF FF fe"), + parseHex("00 00 00 00 00 00 00 01 FF FF FF FF FF FF FF 00"), + } + + for _, keySize := range []int{16, 24, 32} { + keySize := keySize + t.Run(fmt.Sprintf("keySize=%d", keySize), func(t *testing.T) { + for _, iv := range ivs { + key := randBytes(t, r, keySize) + aesBlock, err := aes.NewCipher(key) + if err != nil { + t.Fatal(err) + } + if _, ok := aesBlock.(ctrAble); !ok { + t.Skip("Skipping the test - multiblock implementation is not available") + } + + t.Run(fmt.Sprintf("iv=%s", hex.EncodeToString(iv)), func(t *testing.T) { + for _, offset := range []int{0, 1, 16, 1024} { + offset := offset + t.Run(fmt.Sprintf("offset=%d", offset), func(t *testing.T) { + genericCtr, multiblockCtr := makeTestingCiphers(aesBlock, iv) + + // Generate reference ciphertext. 
+ genericCiphertext := make([]byte, Size) + genericCtr.XORKeyStream(genericCiphertext, plaintext) + + multiblockCiphertext := make([]byte, Size) + multiblockCtr.XORKeyStream(multiblockCiphertext, plaintext[:offset]) + multiblockCtr.XORKeyStream(multiblockCiphertext[offset:], plaintext[offset:]) + if !bytes.Equal(genericCiphertext, multiblockCiphertext) { + t.Fatal("multiblock CTR's output does not match generic CTR's output") + } + }) + } + }) + } + }) + } +} + +// Check that method XORKeyStreamAt works correctly. +func TestCTR_AES_multiblock_XORKeyStreamAt(t *testing.T) { + if boring.Enabled { + t.Skip("XORKeyStreamAt is not available in boring mode") + } + + type XORKeyStreamAtable interface { + XORKeyStreamAt(dst, src []byte, offset uint64) + } + + r := rand.New(rand.NewSource(12345)) + const Size = 32 * 1024 * 1024 + plaintext := randBytes(t, r, Size) + + for _, keySize := range []int{16, 24, 32} { + keySize := keySize + t.Run(fmt.Sprintf("keySize=%d", keySize), func(t *testing.T) { + key := randBytes(t, r, keySize) + iv := randBytes(t, r, aesBlockSize) + + aesBlock, err := aes.NewCipher(key) + if err != nil { + t.Fatal(err) + } + if _, ok := aesBlock.(ctrAble); !ok { + t.Skip("Skipping the test - multiblock implementation is not available") + } + genericCtr, multiblockCtr := makeTestingCiphers(aesBlock, iv) + ctrAt, ok := multiblockCtr.(XORKeyStreamAtable) + if !ok { + t.Fatal("cipher is expected to have method XORKeyStreamAt") + } + + // Generate reference ciphertext. + genericCiphertext := make([]byte, Size) + genericCtr.XORKeyStream(genericCiphertext, plaintext) + + multiblockCiphertext := make([]byte, Size) + // Split the range to random slices. + const N = 1000 + boundaries := make([]int, 0, N+2) + for i := 0; i < N; i++ { + boundaries = append(boundaries, r.Intn(Size)) + } + boundaries = append(boundaries, 0) + boundaries = append(boundaries, Size) + sort.Ints(boundaries) + + for _, i := range r.Perm(N + 1) { + begin := boundaries[i] + end := boundaries[i+1] + ctrAt.XORKeyStreamAt( + multiblockCiphertext[begin:end], + plaintext[begin:end], + uint64(begin), + ) + } + + if !bytes.Equal(genericCiphertext, multiblockCiphertext) { + t.Fatal("multiblock CTR's output does not match generic CTR's output") + } + }) + } +}
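Usage note, not part of the change: XORKeyStreamAt is a method on the unexported aesCtrWithIV type rather than part of cipher.Stream, so callers outside crypto/aes can only reach it through a run-time interface assertion, the same pattern ctr_aes_test.go uses. The sketch below is a minimal, hypothetical example of that pattern; the interface name xorKeyStreamAter, the zero key and IV, and the offset 1000 are illustrative only, and the assertion succeeds only on builds (amd64/arm64, BoringCrypto disabled) where this multiblock implementation is selected.

package main

import (
	"crypto/aes"
	"crypto/cipher"
	"fmt"
)

// xorKeyStreamAter mirrors the interface assertion used in ctr_aes_test.go;
// the method is only present on the multiblock CTR stream.
type xorKeyStreamAter interface {
	XORKeyStreamAt(dst, src []byte, offset uint64)
}

func main() {
	key := make([]byte, 16)           // all-zero key, for illustration only
	iv := make([]byte, aes.BlockSize) // all-zero IV, for illustration only

	block, err := aes.NewCipher(key)
	if err != nil {
		panic(err)
	}
	stream := cipher.NewCTR(block, iv)

	src := make([]byte, 100)
	dst := make([]byte, len(src))

	if s, ok := stream.(xorKeyStreamAter); ok {
		// Random access: XOR keystream bytes 1000..1099 into dst without
		// generating the first 1000 keystream bytes sequentially.
		s.XORKeyStreamAt(dst, src, 1000)
	} else {
		// Portable fallback with the same result: advance the sequential
		// stream past the first 1000 bytes, then encrypt.
		skip := make([]byte, 1000)
		stream.XORKeyStream(skip, skip)
		stream.XORKeyStream(dst, src)
	}
	// src is all zeros, so dst holds the raw keystream here.
	fmt.Printf("keystream[1000:1008] = %x\n", dst[:8])
}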