From f9eb06c3022b20ae0b7944717d58d582eb5677ea Mon Sep 17 00:00:00 2001
From: Garrett Bodley
Date: Tue, 16 Jul 2024 22:44:17 -0400
Subject: [PATCH] crypto/sha1: Avo port of sha1block_amd64.s

This implementation utilizes the same registers found in the reference
implementation, aiming to produce a minimal semantic diff between the
Avo-generated output and the original hand-written assembly.

To verify the Avo implementation, the reference and Avo-generated
assembly files are fed to `go tool asm`, capturing the debug output into
corresponding temp files. The debug output contains supplementary
metadata (line numbers, instruction offsets, and source file references)
that must be removed in order to obtain a semantic diff of the two
files. This is accomplished via a small utility script written in awk.

Commands used to verify Avo output:

GOROOT=$(go env GOROOT)
ASM_PATH="src/crypto/sha1/sha1block_amd64.s"
REFERENCE="54fe0fd43fcf8609666c16ae6d15ed92873b1564"

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  <(git cat-file -p "$REFERENCE:$ASM_PATH") \
  > /tmp/reference.s

go tool asm -o /dev/null -I "$GOROOT"/src/runtime -debug \
  "$ASM_PATH" \
  > /tmp/avo.s

normalize(){
  awk '{ $1=$2=$3=""; print substr($0,4) }'
}

diff <(normalize < /tmp/reference.s) <(normalize < /tmp/avo.s)

1273c1273
< MOVQ $K_XMM_AR<>(SB), R8
---
> LEAQ K_XMM_AR<>(SB), R8

Change-Id: I39168fadb01baa9a96bc2b432fc94b492d036ce4
Reviewed-on: https://go-review.googlesource.com/c/go/+/598795
Reviewed-by: Dmitri Shuralyov
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Filippo Valsorda
Reviewed-by: Roland Shoemaker
---
 .../compile/internal/types2/stdlib_test.go  |    1 +
 src/crypto/sha1/_asm/go.mod                 |   11 +
 src/crypto/sha1/_asm/go.sum                 |    8 +
 src/crypto/sha1/_asm/sha1block_amd64_asm.go | 1741 +++++++
 src/crypto/sha1/sha1block_amd64.s           | 4530 +++++++++++------
 src/go/types/stdlib_test.go                 |    1 +
 6 files changed, 4802 insertions(+), 1490 deletions(-)
 create mode 100644 src/crypto/sha1/_asm/go.mod
 create mode 100644 src/crypto/sha1/_asm/go.sum
 create mode 100644 src/crypto/sha1/_asm/sha1block_amd64_asm.go

diff --git a/src/cmd/compile/internal/types2/stdlib_test.go b/src/cmd/compile/internal/types2/stdlib_test.go
index 70bb0ae9229..0454e5b93aa 100644
--- a/src/cmd/compile/internal/types2/stdlib_test.go
+++ b/src/cmd/compile/internal/types2/stdlib_test.go
@@ -357,6 +357,7 @@ var excluded = map[string]bool{
 	// go.dev/issue/46027: some imports are missing for this submodule.
"crypto/internal/bigmod/_asm": true, "crypto/internal/edwards25519/field/_asm": true, + "crypto/sha1/_asm": true, "crypto/sha256/_asm": true, } diff --git a/src/crypto/sha1/_asm/go.mod b/src/crypto/sha1/_asm/go.mod new file mode 100644 index 00000000000..d141682320c --- /dev/null +++ b/src/crypto/sha1/_asm/go.mod @@ -0,0 +1,11 @@ +module std/crypto/sha1/_asm + +go 1.24 + +require github.com/mmcloughlin/avo v0.6.0 + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/src/crypto/sha1/_asm/go.sum b/src/crypto/sha1/_asm/go.sum new file mode 100644 index 00000000000..76af484b2eb --- /dev/null +++ b/src/crypto/sha1/_asm/go.sum @@ -0,0 +1,8 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/src/crypto/sha1/_asm/sha1block_amd64_asm.go b/src/crypto/sha1/_asm/sha1block_amd64_asm.go new file mode 100644 index 00000000000..750f5ce31c7 --- /dev/null +++ b/src/crypto/sha1/_asm/sha1block_amd64_asm.go @@ -0,0 +1,1741 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" +) + +//go:generate go run . 
-out ../sha1block_amd64.s -pkg sha1 + +// AVX2 version by Intel, same algorithm as code in Linux kernel: +// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S +// Authors: +// Ilya Albrekht +// Maxim Locktyukhin +// Ronen Zohar +// Chandramouli Narayanan + +func main() { + Package("crypto/sha1") + ConstraintExpr("!purego") + blockAMD64() + blockAVX2() + Generate() +} + +func LOAD(index int) { + MOVL(Mem{Base: SI}.Offset(index*4), R10L) + BSWAPL(R10L) + MOVL(R10L, Mem{Base: SP}.Offset(index*4)) +} + +func SHUFFLE(index int) { + MOVL(Mem{Base: SP}.Offset(((index)&0xf)*4), R10L) + XORL(Mem{Base: SP}.Offset(((index-3)&0xf)*4), R10L) + XORL(Mem{Base: SP}.Offset(((index-8)&0xf)*4), R10L) + XORL(Mem{Base: SP}.Offset(((index-14)&0xf)*4), R10L) + ROLL(Imm(1), R10L) + MOVL(R10L, Mem{Base: SP}.Offset(((index)&0xf)*4)) +} + +func FUNC1(a, b, c, d, e GPPhysical) { + MOVL(d, R9L) + XORL(c, R9L) + ANDL(b, R9L) + XORL(d, R9L) +} + +func FUNC2(a, b, c, d, e GPPhysical) { + MOVL(b, R9L) + XORL(c, R9L) + XORL(d, R9L) +} + +func FUNC3(a, b, c, d, e GPPhysical) { + MOVL(b, R8L) + ORL(c, R8L) + ANDL(d, R8L) + MOVL(b, R9L) + ANDL(c, R9L) + ORL(R8L, R9L) +} + +func FUNC4(a, b, c, d, e GPPhysical) { + FUNC2(a, b, c, d, e) +} + +func MIX(a, b, c, d, e GPPhysical, konst int) { + ROLL(Imm(30), b) + ADDL(R9L, e) + MOVL(a, R8L) + ROLL(Imm(5), R8L) + LEAL(Mem{Base: e, Index: R10L, Scale: 1}.Offset(konst), e) + ADDL(R8L, e) +} + +func ROUND1(a, b, c, d, e GPPhysical, index int) { + LOAD(index) + FUNC1(a, b, c, d, e) + MIX(a, b, c, d, e, 0x5A827999) +} + +func ROUND1x(a, b, c, d, e GPPhysical, index int) { + SHUFFLE(index) + FUNC1(a, b, c, d, e) + MIX(a, b, c, d, e, 0x5A827999) +} + +func ROUND2(a, b, c, d, e GPPhysical, index int) { + SHUFFLE(index) + FUNC2(a, b, c, d, e) + MIX(a, b, c, d, e, 0x6ED9EBA1) +} + +func ROUND3(a, b, c, d, e GPPhysical, index int) { + SHUFFLE(index) + FUNC3(a, b, c, d, e) + MIX(a, b, c, d, e, 0x8F1BBCDC) +} + +func ROUND4(a, b, c, d, e GPPhysical, index int) { + SHUFFLE(index) + FUNC4(a, b, c, d, e) + MIX(a, b, c, d, e, 0xCA62C1D6) +} + +func blockAMD64() { + Implement("blockAMD64") + Attributes(NOSPLIT) + AllocLocal(64) + + Load(Param("dig"), RBP) + Load(Param("p").Base(), RSI) + Load(Param("p").Len(), RDX) + SHRQ(Imm(6), RDX) + SHLQ(Imm(6), RDX) + + LEAQ(Mem{Base: SI, Index: DX, Scale: 1}, RDI) + MOVL(Mem{Base: BP}.Offset(0*4), EAX) + MOVL(Mem{Base: BP}.Offset(1*4), EBX) + MOVL(Mem{Base: BP}.Offset(2*4), ECX) + MOVL(Mem{Base: BP}.Offset(3*4), EDX) + MOVL(Mem{Base: BP}.Offset(4*4), EBP) + + CMPQ(RSI, RDI) + JEQ(LabelRef("end")) + + loop_amd64() + end() +} + +func loop_amd64() { + Label("loop") + MOVL(EAX, R11L) + MOVL(EBX, R12L) + MOVL(ECX, R13L) + MOVL(EDX, R14L) + MOVL(EBP, R15L) + + ROUND1(EAX, EBX, ECX, EDX, EBP, 0) + ROUND1(EBP, EAX, EBX, ECX, EDX, 1) + ROUND1(EDX, EBP, EAX, EBX, ECX, 2) + ROUND1(ECX, EDX, EBP, EAX, EBX, 3) + ROUND1(EBX, ECX, EDX, EBP, EAX, 4) + ROUND1(EAX, EBX, ECX, EDX, EBP, 5) + ROUND1(EBP, EAX, EBX, ECX, EDX, 6) + ROUND1(EDX, EBP, EAX, EBX, ECX, 7) + ROUND1(ECX, EDX, EBP, EAX, EBX, 8) + ROUND1(EBX, ECX, EDX, EBP, EAX, 9) + ROUND1(EAX, EBX, ECX, EDX, EBP, 10) + ROUND1(EBP, EAX, EBX, ECX, EDX, 11) + ROUND1(EDX, EBP, EAX, EBX, ECX, 12) + ROUND1(ECX, EDX, EBP, EAX, EBX, 13) + ROUND1(EBX, ECX, EDX, EBP, EAX, 14) + ROUND1(EAX, EBX, ECX, EDX, EBP, 15) + + ROUND1x(EBP, EAX, EBX, ECX, EDX, 16) + ROUND1x(EDX, EBP, EAX, EBX, ECX, 17) + ROUND1x(ECX, EDX, EBP, EAX, EBX, 18) + ROUND1x(EBX, ECX, EDX, EBP, EAX, 19) + + ROUND2(EAX, EBX, ECX, EDX, 
EBP, 20) + ROUND2(EBP, EAX, EBX, ECX, EDX, 21) + ROUND2(EDX, EBP, EAX, EBX, ECX, 22) + ROUND2(ECX, EDX, EBP, EAX, EBX, 23) + ROUND2(EBX, ECX, EDX, EBP, EAX, 24) + ROUND2(EAX, EBX, ECX, EDX, EBP, 25) + ROUND2(EBP, EAX, EBX, ECX, EDX, 26) + ROUND2(EDX, EBP, EAX, EBX, ECX, 27) + ROUND2(ECX, EDX, EBP, EAX, EBX, 28) + ROUND2(EBX, ECX, EDX, EBP, EAX, 29) + ROUND2(EAX, EBX, ECX, EDX, EBP, 30) + ROUND2(EBP, EAX, EBX, ECX, EDX, 31) + ROUND2(EDX, EBP, EAX, EBX, ECX, 32) + ROUND2(ECX, EDX, EBP, EAX, EBX, 33) + ROUND2(EBX, ECX, EDX, EBP, EAX, 34) + ROUND2(EAX, EBX, ECX, EDX, EBP, 35) + ROUND2(EBP, EAX, EBX, ECX, EDX, 36) + ROUND2(EDX, EBP, EAX, EBX, ECX, 37) + ROUND2(ECX, EDX, EBP, EAX, EBX, 38) + ROUND2(EBX, ECX, EDX, EBP, EAX, 39) + + ROUND3(EAX, EBX, ECX, EDX, EBP, 40) + ROUND3(EBP, EAX, EBX, ECX, EDX, 41) + ROUND3(EDX, EBP, EAX, EBX, ECX, 42) + ROUND3(ECX, EDX, EBP, EAX, EBX, 43) + ROUND3(EBX, ECX, EDX, EBP, EAX, 44) + ROUND3(EAX, EBX, ECX, EDX, EBP, 45) + ROUND3(EBP, EAX, EBX, ECX, EDX, 46) + ROUND3(EDX, EBP, EAX, EBX, ECX, 47) + ROUND3(ECX, EDX, EBP, EAX, EBX, 48) + ROUND3(EBX, ECX, EDX, EBP, EAX, 49) + ROUND3(EAX, EBX, ECX, EDX, EBP, 50) + ROUND3(EBP, EAX, EBX, ECX, EDX, 51) + ROUND3(EDX, EBP, EAX, EBX, ECX, 52) + ROUND3(ECX, EDX, EBP, EAX, EBX, 53) + ROUND3(EBX, ECX, EDX, EBP, EAX, 54) + ROUND3(EAX, EBX, ECX, EDX, EBP, 55) + ROUND3(EBP, EAX, EBX, ECX, EDX, 56) + ROUND3(EDX, EBP, EAX, EBX, ECX, 57) + ROUND3(ECX, EDX, EBP, EAX, EBX, 58) + ROUND3(EBX, ECX, EDX, EBP, EAX, 59) + + ROUND4(EAX, EBX, ECX, EDX, EBP, 60) + ROUND4(EBP, EAX, EBX, ECX, EDX, 61) + ROUND4(EDX, EBP, EAX, EBX, ECX, 62) + ROUND4(ECX, EDX, EBP, EAX, EBX, 63) + ROUND4(EBX, ECX, EDX, EBP, EAX, 64) + ROUND4(EAX, EBX, ECX, EDX, EBP, 65) + ROUND4(EBP, EAX, EBX, ECX, EDX, 66) + ROUND4(EDX, EBP, EAX, EBX, ECX, 67) + ROUND4(ECX, EDX, EBP, EAX, EBX, 68) + ROUND4(EBX, ECX, EDX, EBP, EAX, 69) + ROUND4(EAX, EBX, ECX, EDX, EBP, 70) + ROUND4(EBP, EAX, EBX, ECX, EDX, 71) + ROUND4(EDX, EBP, EAX, EBX, ECX, 72) + ROUND4(ECX, EDX, EBP, EAX, EBX, 73) + ROUND4(EBX, ECX, EDX, EBP, EAX, 74) + ROUND4(EAX, EBX, ECX, EDX, EBP, 75) + ROUND4(EBP, EAX, EBX, ECX, EDX, 76) + ROUND4(EDX, EBP, EAX, EBX, ECX, 77) + ROUND4(ECX, EDX, EBP, EAX, EBX, 78) + ROUND4(EBX, ECX, EDX, EBP, EAX, 79) + + ADDL(R11L, EAX) + ADDL(R12L, EBX) + ADDL(R13L, ECX) + ADDL(R14L, EDX) + ADDL(R15L, EBP) + + ADDQ(Imm(64), RSI) + CMPQ(RSI, RDI) + JB(LabelRef("loop")) +} + +func end() { + Label("end") + Load(Param("dig"), RDI) + MOVL(EAX, Mem{Base: DI}.Offset(0*4)) + MOVL(EBX, Mem{Base: DI}.Offset(1*4)) + MOVL(ECX, Mem{Base: DI}.Offset(2*4)) + MOVL(EDX, Mem{Base: DI}.Offset(3*4)) + MOVL(EBP, Mem{Base: DI}.Offset(4*4)) + RET() +} + +// This is the implementation using AVX2, BMI1 and BMI2. It is based on: +// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" +// From http://software.intel.com/en-us/articles +// (look for improving-the-performance-of-the-secure-hash-algorithm-1) +// This implementation is 2x unrolled, and interleaves vector instructions, +// used to precompute W, with scalar computation of current round +// for optimal scheduling. + +// Trivial helper macros. 
+ +func UPDATE_HASH(A, TB, C, D, E GPPhysical) { + ADDL(Mem{Base: R9}, A) + MOVL(A, Mem{Base: R9}) + ADDL(Mem{Base: R9}.Offset(4), TB) + MOVL(TB, Mem{Base: R9}.Offset(4)) + ADDL(Mem{Base: R9}.Offset(8), C) + MOVL(C, Mem{Base: R9}.Offset(8)) + ADDL(Mem{Base: R9}.Offset(12), D) + MOVL(D, Mem{Base: R9}.Offset(12)) + ADDL(Mem{Base: R9}.Offset(16), E) + MOVL(E, Mem{Base: R9}.Offset(16)) +} + +// Helper macros for PRECALC, which does precomputations + +func PRECALC_0(OFFSET int) { + VMOVDQU(Mem{Base: R10}.Offset(OFFSET), X0) +} + +func PRECALC_1(OFFSET int) { + VINSERTI128(Imm(1), Mem{Base: R13}.Offset(OFFSET), Y0, Y0) +} + +func PRECALC_2(YREG VecPhysical) { + VPSHUFB(Y10, Y0, YREG) +} + +func PRECALC_4(YREG VecPhysical, K_OFFSET int) { + VPADDD(Mem{Base: R8}.Offset(K_OFFSET), YREG, Y0) +} + +func PRECALC_7(OFFSET int) { + VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET*2)) +} + +// Message scheduling pre-compute for rounds 0-15 +// +// - R13 is a pointer to even 64-byte block +// - R10 is a pointer to odd 64-byte block +// - R14 is a pointer to temp buffer +// - X0 is used as temp register +// - YREG is clobbered as part of computation +// - OFFSET chooses 16 byte chunk within a block +// - R8 is a pointer to constants block +// - K_OFFSET chooses K constants relevant to this round +// - X10 holds swap mask +func PRECALC_00_15(OFFSET int, YREG VecPhysical) { + PRECALC_0(OFFSET) + PRECALC_1(OFFSET) + PRECALC_2(YREG) + PRECALC_4(YREG, 0x0) + PRECALC_7(OFFSET) +} + +// Helper macros for PRECALC_16_31 + +func PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG VecPhysical) { + VPALIGNR(Imm(8), REG_SUB_16, REG_SUB_12, REG) // w[i-14] + VPSRLDQ(Imm(4), REG_SUB_4, Y0) // w[i-3] +} + +func PRECALC_17(REG_SUB_16, REG_SUB_8, REG VecPhysical) { + VPXOR(REG_SUB_8, REG, REG) + VPXOR(REG_SUB_16, Y0, Y0) +} + +func PRECALC_18(REG VecPhysical) { + VPXOR(Y0, REG, REG) + VPSLLDQ(Imm(12), REG, Y9) +} + +func PRECALC_19(REG VecPhysical) { + VPSLLD(Imm(1), REG, Y0) + VPSRLD(Imm(31), REG, REG) +} + +func PRECALC_20(REG VecPhysical) { + VPOR(REG, Y0, Y0) + VPSLLD(Imm(2), Y9, REG) +} + +func PRECALC_21(REG VecPhysical) { + VPSRLD(Imm(30), Y9, Y9) + VPXOR(REG, Y0, Y0) +} + +func PRECALC_23(REG VecPhysical, K_OFFSET, OFFSET int) { + VPXOR(Y9, Y0, REG) + VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0) + VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET)) +} + +// Message scheduling pre-compute for rounds 16-31 +// - calculating last 32 w[i] values in 8 XMM registers +// - pre-calculate K+w[i] values and store to mem +// - for later load by ALU add instruction. +// - "brute force" vectorization for rounds 16-31 only +// - due to w[i]->w[i-3] dependency. 
+// - clobbers 5 input ymm registers REG_SUB* +// - uses X0 and X9 as temp registers +// - As always, R8 is a pointer to constants block +// - and R14 is a pointer to temp buffer +func PRECALC_16_31(REG, REG_SUB_4, REG_SUB_8, REG_SUB_12, REG_SUB_16 VecPhysical, K_OFFSET, OFFSET int) { + PRECALC_16(REG_SUB_16, REG_SUB_12, REG_SUB_4, REG) + PRECALC_17(REG_SUB_16, REG_SUB_8, REG) + PRECALC_18(REG) + PRECALC_19(REG) + PRECALC_20(REG) + PRECALC_21(REG) + PRECALC_23(REG, K_OFFSET, OFFSET) +} + +// Helper macros for PRECALC_32_79 + +func PRECALC_32(REG_SUB_8, REG_SUB_4 VecPhysical) { + VPALIGNR(Imm(8), REG_SUB_8, REG_SUB_4, Y0) +} + +func PRECALC_33(REG_SUB_28, REG VecPhysical) { + VPXOR(REG_SUB_28, REG, REG) +} + +func PRECALC_34(REG_SUB_16 VecPhysical) { + VPXOR(REG_SUB_16, Y0, Y0) +} + +func PRECALC_35(REG VecPhysical) { + VPXOR(Y0, REG, REG) +} + +func PRECALC_36(REG VecPhysical) { + VPSLLD(Imm(2), REG, Y0) +} + +func PRECALC_37(REG VecPhysical) { + VPSRLD(Imm(30), REG, REG) + VPOR(REG, Y0, REG) +} + +func PRECALC_39(REG VecPhysical, K_OFFSET, OFFSET int) { + VPADDD(Mem{Base: R8}.Offset(K_OFFSET), REG, Y0) + VMOVDQU(Y0, Mem{Base: R14}.Offset(OFFSET)) +} + +// Message scheduling pre-compute for rounds 32-79 +// In SHA-1 specification we have: +// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 +// Which is the same as: +// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 +// This allows for more efficient vectorization, +// since w[i]->w[i-3] dependency is broken + +func PRECALC_32_79(REG, REG_SUB_4, REG_SUB_8, REG_SUB_16, REG_SUB_28 VecPhysical, K_OFFSET, OFFSET int) { + PRECALC_32(REG_SUB_8, REG_SUB_4) + PRECALC_33(REG_SUB_28, REG) + PRECALC_34(REG_SUB_16) + PRECALC_35(REG) + PRECALC_36(REG) + PRECALC_37(REG) + PRECALC_39(REG, K_OFFSET, OFFSET) +} + +func PRECALC() { + PRECALC_00_15(0, Y15) + PRECALC_00_15(0x10, Y14) + PRECALC_00_15(0x20, Y13) + PRECALC_00_15(0x30, Y12) + PRECALC_16_31(Y8, Y12, Y13, Y14, Y15, 0, 0x80) + PRECALC_16_31(Y7, Y8, Y12, Y13, Y14, 0x20, 0xa0) + PRECALC_16_31(Y5, Y7, Y8, Y12, Y13, 0x20, 0xc0) + PRECALC_16_31(Y3, Y5, Y7, Y8, Y12, 0x20, 0xe0) + PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x20, 0x100) + PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x20, 0x120) + PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x40, 0x140) + PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x40, 0x160) + PRECALC_32_79(Y8, Y12, Y13, Y15, Y7, 0x40, 0x180) + PRECALC_32_79(Y7, Y8, Y12, Y14, Y5, 0x40, 0x1a0) + PRECALC_32_79(Y5, Y7, Y8, Y13, Y3, 0x40, 0x1c0) + PRECALC_32_79(Y3, Y5, Y7, Y12, Y15, 0x60, 0x1e0) + PRECALC_32_79(Y15, Y3, Y5, Y8, Y14, 0x60, 0x200) + PRECALC_32_79(Y14, Y15, Y3, Y7, Y13, 0x60, 0x220) + PRECALC_32_79(Y13, Y14, Y15, Y5, Y12, 0x60, 0x240) + PRECALC_32_79(Y12, Y13, Y14, Y3, Y8, 0x60, 0x260) +} + +// Macros calculating individual rounds have general form +// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST +// CALC_ROUND_{PRE,POST} macros follow + +func CALC_F1_PRE(OFFSET int, REG_A, REG_B, REG_C, REG_E GPPhysical) { + ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E) + ANDNL(REG_C, REG_A, EBP) + LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round + RORXL(Imm(0x1b), REG_A, R12L) + RORXL(Imm(2), REG_A, REG_B) // for next round +} + +func CALC_F1_POST(REG_A, REG_B, REG_E GPPhysical) { + ANDL(REG_B, REG_A) // b&c + XORL(EBP, REG_A) // F1 = (b&c) ^ (~b&d) + LEAL(Mem{Base: REG_E, Index: R12, Scale: 1}, REG_E) // E += A >>> 5 +} + +// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX + +func CALC_0() { + MOVL(ESI, EBX) // Precalculating first round + RORXL(Imm(2), 
ESI, ESI) + ANDNL(EAX, EBX, EBP) + ANDL(EDI, EBX) + XORL(EBP, EBX) + CALC_F1_PRE(0x0, ECX, EBX, EDI, EDX) + PRECALC_0(0x80) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_1() { + CALC_F1_PRE(0x4, EDX, ECX, ESI, EAX) + PRECALC_1(0x80) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_2() { + CALC_F1_PRE(0x8, EAX, EDX, EBX, EDI) + PRECALC_2(Y15) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_3() { + CALC_F1_PRE(0xc, EDI, EAX, ECX, ESI) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_4() { + CALC_F1_PRE(0x20, ESI, EDI, EDX, EBX) + PRECALC_4(Y15, 0x0) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_5() { + CALC_F1_PRE(0x24, EBX, ESI, EAX, ECX) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_6() { + CALC_F1_PRE(0x28, ECX, EBX, EDI, EDX) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_7() { + CALC_F1_PRE(0x2c, EDX, ECX, ESI, EAX) + PRECALC_7(0x0) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_8() { + CALC_F1_PRE(0x40, EAX, EDX, EBX, EDI) + PRECALC_0(0x90) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_9() { + CALC_F1_PRE(0x44, EDI, EAX, ECX, ESI) + PRECALC_1(0x90) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_10() { + CALC_F1_PRE(0x48, ESI, EDI, EDX, EBX) + PRECALC_2(Y14) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_11() { + CALC_F1_PRE(0x4c, EBX, ESI, EAX, ECX) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_12() { + CALC_F1_PRE(0x60, ECX, EBX, EDI, EDX) + PRECALC_4(Y14, 0x0) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_13() { + CALC_F1_PRE(0x64, EDX, ECX, ESI, EAX) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_14() { + CALC_F1_PRE(0x68, EAX, EDX, EBX, EDI) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_15() { + CALC_F1_PRE(0x6c, EDI, EAX, ECX, ESI) + PRECALC_7(0x10) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_16() { + CALC_F1_PRE(0x80, ESI, EDI, EDX, EBX) + PRECALC_0(0xa0) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_17() { + CALC_F1_PRE(0x84, EBX, ESI, EAX, ECX) + PRECALC_1(0xa0) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_18() { + CALC_F1_PRE(0x88, ECX, EBX, EDI, EDX) + PRECALC_2(Y13) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_F2_PRE(OFFSET int, REG_A, REG_B, REG_E GPPhysical) { + ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E) + LEAL(Mem{Base: REG_E, Index: REG_B, Scale: 1}, REG_E) // Add F from the previous round + RORXL(Imm(0x1b), REG_A, R12L) + RORXL(Imm(2), REG_A, REG_B) // for next round +} + +func CALC_F2_POST(REG_A, REG_B, REG_C, REG_E GPPhysical) { + XORL(REG_B, REG_A) + ADDL(R12L, REG_E) + XORL(REG_C, REG_A) +} + +func CALC_19() { + CALC_F2_PRE(0x8c, EDX, ECX, EAX) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_20() { + CALC_F2_PRE(0xa0, EAX, EDX, EDI) + PRECALC_4(Y13, 0x0) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_21() { + CALC_F2_PRE(0xa4, EDI, EAX, ESI) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_22() { + CALC_F2_PRE(0xa8, ESI, EDI, EBX) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_23() { + CALC_F2_PRE(0xac, EBX, ESI, ECX) + PRECALC_7(0x20) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_24() { + CALC_F2_PRE(0xc0, ECX, EBX, EDX) + PRECALC_0(0xb0) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_25() { + CALC_F2_PRE(0xc4, EDX, ECX, EAX) + PRECALC_1(0xb0) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_26() { + CALC_F2_PRE(0xc8, EAX, EDX, EDI) + PRECALC_2(Y12) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_27() { + CALC_F2_PRE(0xcc, EDI, EAX, ESI) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_28() { + CALC_F2_PRE(0xe0, ESI, EDI, EBX) + PRECALC_4(Y12, 0x0) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_29() { + CALC_F2_PRE(0xe4, EBX, ESI, ECX) + 
CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_30() { + CALC_F2_PRE(0xe8, ECX, EBX, EDX) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_31() { + CALC_F2_PRE(0xec, EDX, ECX, EAX) + PRECALC_7(0x30) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_32() { + CALC_F2_PRE(0x100, EAX, EDX, EDI) + PRECALC_16(Y15, Y14, Y12, Y8) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_33() { + CALC_F2_PRE(0x104, EDI, EAX, ESI) + PRECALC_17(Y15, Y13, Y8) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_34() { + CALC_F2_PRE(0x108, ESI, EDI, EBX) + PRECALC_18(Y8) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_35() { + CALC_F2_PRE(0x10c, EBX, ESI, ECX) + PRECALC_19(Y8) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_36() { + CALC_F2_PRE(0x120, ECX, EBX, EDX) + PRECALC_20(Y8) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_37() { + CALC_F2_PRE(0x124, EDX, ECX, EAX) + PRECALC_21(Y8) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_38() { + CALC_F2_PRE(0x128, EAX, EDX, EDI) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_F3_PRE(OFFSET int, REG_E GPPhysical) { + ADDL(Mem{Base: R15}.Offset(OFFSET), REG_E) +} + +func CALC_F3_POST(REG_A, REG_B, REG_C, REG_E, REG_TB GPPhysical) { + LEAL(Mem{Base: REG_E, Index: REG_TB, Scale: 1}, REG_E) // Add F from the previous round + MOVL(REG_B, EBP) + ORL(REG_A, EBP) + RORXL(Imm(0x1b), REG_A, R12L) + RORXL(Imm(2), REG_A, REG_TB) + ANDL(REG_C, EBP) + ANDL(REG_B, REG_A) + ORL(EBP, REG_A) + ADDL(R12L, REG_E) +} + +func CALC_39() { + CALC_F3_PRE(0x12c, ESI) + PRECALC_23(Y8, 0x0, 0x80) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_40() { + CALC_F3_PRE(0x140, EBX) + PRECALC_16(Y14, Y13, Y8, Y7) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_41() { + CALC_F3_PRE(0x144, ECX) + PRECALC_17(Y14, Y12, Y7) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_42() { + CALC_F3_PRE(0x148, EDX) + PRECALC_18(Y7) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_43() { + CALC_F3_PRE(0x14c, EAX) + PRECALC_19(Y7) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_44() { + CALC_F3_PRE(0x160, EDI) + PRECALC_20(Y7) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_45() { + CALC_F3_PRE(0x164, ESI) + PRECALC_21(Y7) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_46() { + CALC_F3_PRE(0x168, EBX) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_47() { + CALC_F3_PRE(0x16c, ECX) + VPXOR(Y9, Y0, Y7) + VPADDD(Mem{Base: R8}.Offset(0x20), Y7, Y0) + VMOVDQU(Y0, Mem{Base: R14}.Offset(0xa0)) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_48() { + CALC_F3_PRE(0x180, EDX) + PRECALC_16(Y13, Y12, Y7, Y5) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_49() { + CALC_F3_PRE(0x184, EAX) + PRECALC_17(Y13, Y8, Y5) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_50() { + CALC_F3_PRE(0x188, EDI) + PRECALC_18(Y5) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_51() { + CALC_F3_PRE(0x18c, ESI) + PRECALC_19(Y5) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_52() { + CALC_F3_PRE(0x1a0, EBX) + PRECALC_20(Y5) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_53() { + CALC_F3_PRE(0x1a4, ECX) + PRECALC_21(Y5) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_54() { + CALC_F3_PRE(0x1a8, EDX) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_55() { + CALC_F3_PRE(0x1ac, EAX) + PRECALC_23(Y5, 0x20, 0xc0) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_56() { + CALC_F3_PRE(0x1c0, EDI) + PRECALC_16(Y12, Y8, Y5, Y3) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_57() { + CALC_F3_PRE(0x1c4, 
ESI) + PRECALC_17(Y12, Y7, Y3) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_58() { + CALC_F3_PRE(0x1c8, EBX) + PRECALC_18(Y3) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_59() { + CALC_F2_PRE(0x1cc, EBX, ESI, ECX) + PRECALC_19(Y3) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_60() { + CALC_F2_PRE(0x1e0, ECX, EBX, EDX) + PRECALC_20(Y3) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_61() { + CALC_F2_PRE(0x1e4, EDX, ECX, EAX) + PRECALC_21(Y3) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_62() { + CALC_F2_PRE(0x1e8, EAX, EDX, EDI) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_63() { + CALC_F2_PRE(0x1ec, EDI, EAX, ESI) + PRECALC_23(Y3, 0x20, 0xe0) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_64() { + CALC_F2_PRE(0x200, ESI, EDI, EBX) + PRECALC_32(Y5, Y3) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_65() { + CALC_F2_PRE(0x204, EBX, ESI, ECX) + PRECALC_33(Y14, Y15) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_66() { + CALC_F2_PRE(0x208, ECX, EBX, EDX) + PRECALC_34(Y8) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_67() { + CALC_F2_PRE(0x20c, EDX, ECX, EAX) + PRECALC_35(Y15) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_68() { + CALC_F2_PRE(0x220, EAX, EDX, EDI) + PRECALC_36(Y15) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_69() { + CALC_F2_PRE(0x224, EDI, EAX, ESI) + PRECALC_37(Y15) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_70() { + CALC_F2_PRE(0x228, ESI, EDI, EBX) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_71() { + CALC_F2_PRE(0x22c, EBX, ESI, ECX) + PRECALC_39(Y15, 0x20, 0x100) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_72() { + CALC_F2_PRE(0x240, ECX, EBX, EDX) + PRECALC_32(Y3, Y15) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_73() { + CALC_F2_PRE(0x244, EDX, ECX, EAX) + PRECALC_33(Y13, Y14) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_74() { + CALC_F2_PRE(0x248, EAX, EDX, EDI) + PRECALC_34(Y7) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_75() { + CALC_F2_PRE(0x24c, EDI, EAX, ESI) + PRECALC_35(Y14) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_76() { + CALC_F2_PRE(0x260, ESI, EDI, EBX) + PRECALC_36(Y14) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_77() { + CALC_F2_PRE(0x264, EBX, ESI, ECX) + PRECALC_37(Y14) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_78() { + CALC_F2_PRE(0x268, ECX, EBX, EDX) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_79() { + ADDL(Mem{Base: R15}.Offset(0x26c), EAX) + LEAL(Mem{Base: AX, Index: CX, Scale: 1}, EAX) + RORXL(Imm(0x1b), EDX, R12L) + PRECALC_39(Y14, 0x20, 0x120) + ADDL(R12L, EAX) +} + +// Similar to CALC_0 +func CALC_80() { + MOVL(ECX, EDX) + RORXL(Imm(2), ECX, ECX) + ANDNL(ESI, EDX, EBP) + ANDL(EBX, EDX) + XORL(EBP, EDX) + CALC_F1_PRE(0x10, EAX, EDX, EBX, EDI) + PRECALC_32(Y15, Y14) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_81() { + CALC_F1_PRE(0x14, EDI, EAX, ECX, ESI) + PRECALC_33(Y12, Y13) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_82() { + CALC_F1_PRE(0x18, ESI, EDI, EDX, EBX) + PRECALC_34(Y5) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_83() { + CALC_F1_PRE(0x1c, EBX, ESI, EAX, ECX) + PRECALC_35(Y13) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_84() { + CALC_F1_PRE(0x30, ECX, EBX, EDI, EDX) + PRECALC_36(Y13) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_85() { + CALC_F1_PRE(0x34, EDX, ECX, ESI, EAX) + PRECALC_37(Y13) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_86() { + CALC_F1_PRE(0x38, EAX, EDX, EBX, EDI) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_87() { + CALC_F1_PRE(0x3c, EDI, EAX, ECX, ESI) + 
PRECALC_39(Y13, 0x40, 0x140) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_88() { + CALC_F1_PRE(0x50, ESI, EDI, EDX, EBX) + PRECALC_32(Y14, Y13) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_89() { + CALC_F1_PRE(0x54, EBX, ESI, EAX, ECX) + PRECALC_33(Y8, Y12) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_90() { + CALC_F1_PRE(0x58, ECX, EBX, EDI, EDX) + PRECALC_34(Y3) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_91() { + CALC_F1_PRE(0x5c, EDX, ECX, ESI, EAX) + PRECALC_35(Y12) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_92() { + CALC_F1_PRE(0x70, EAX, EDX, EBX, EDI) + PRECALC_36(Y12) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_93() { + CALC_F1_PRE(0x74, EDI, EAX, ECX, ESI) + PRECALC_37(Y12) + CALC_F1_POST(EDI, EDX, ESI) +} + +func CALC_94() { + CALC_F1_PRE(0x78, ESI, EDI, EDX, EBX) + CALC_F1_POST(ESI, EAX, EBX) +} + +func CALC_95() { + CALC_F1_PRE(0x7c, EBX, ESI, EAX, ECX) + PRECALC_39(Y12, 0x40, 0x160) + CALC_F1_POST(EBX, EDI, ECX) +} + +func CALC_96() { + CALC_F1_PRE(0x90, ECX, EBX, EDI, EDX) + PRECALC_32(Y13, Y12) + CALC_F1_POST(ECX, ESI, EDX) +} + +func CALC_97() { + CALC_F1_PRE(0x94, EDX, ECX, ESI, EAX) + PRECALC_33(Y7, Y8) + CALC_F1_POST(EDX, EBX, EAX) +} + +func CALC_98() { + CALC_F1_PRE(0x98, EAX, EDX, EBX, EDI) + PRECALC_34(Y15) + CALC_F1_POST(EAX, ECX, EDI) +} + +func CALC_99() { + CALC_F2_PRE(0x9c, EDI, EAX, ESI) + PRECALC_35(Y8) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_100() { + CALC_F2_PRE(0xb0, ESI, EDI, EBX) + PRECALC_36(Y8) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_101() { + CALC_F2_PRE(0xb4, EBX, ESI, ECX) + PRECALC_37(Y8) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_102() { + CALC_F2_PRE(0xb8, ECX, EBX, EDX) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_103() { + CALC_F2_PRE(0xbc, EDX, ECX, EAX) + PRECALC_39(Y8, 0x40, 0x180) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_104() { + CALC_F2_PRE(0xd0, EAX, EDX, EDI) + PRECALC_32(Y12, Y8) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_105() { + CALC_F2_PRE(0xd4, EDI, EAX, ESI) + PRECALC_33(Y5, Y7) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_106() { + CALC_F2_PRE(0xd8, ESI, EDI, EBX) + PRECALC_34(Y14) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_107() { + CALC_F2_PRE(0xdc, EBX, ESI, ECX) + PRECALC_35(Y7) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_108() { + CALC_F2_PRE(0xf0, ECX, EBX, EDX) + PRECALC_36(Y7) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_109() { + CALC_F2_PRE(0xf4, EDX, ECX, EAX) + PRECALC_37(Y7) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_110() { + CALC_F2_PRE(0xf8, EAX, EDX, EDI) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_111() { + CALC_F2_PRE(0xfc, EDI, EAX, ESI) + PRECALC_39(Y7, 0x40, 0x1a0) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_112() { + CALC_F2_PRE(0x110, ESI, EDI, EBX) + PRECALC_32(Y8, Y7) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_113() { + CALC_F2_PRE(0x114, EBX, ESI, ECX) + PRECALC_33(Y3, Y5) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_114() { + CALC_F2_PRE(0x118, ECX, EBX, EDX) + PRECALC_34(Y13) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_115() { + CALC_F2_PRE(0x11c, EDX, ECX, EAX) + PRECALC_35(Y5) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_116() { + CALC_F2_PRE(0x130, EAX, EDX, EDI) + PRECALC_36(Y5) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_117() { + CALC_F2_PRE(0x134, EDI, EAX, ESI) + PRECALC_37(Y5) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_118() { + CALC_F2_PRE(0x138, ESI, EDI, EBX) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_119() { + 
CALC_F3_PRE(0x13c, ECX) + PRECALC_39(Y5, 0x40, 0x1c0) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_120() { + CALC_F3_PRE(0x150, EDX) + PRECALC_32(Y7, Y5) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_121() { + CALC_F3_PRE(0x154, EAX) + PRECALC_33(Y15, Y3) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_122() { + CALC_F3_PRE(0x158, EDI) + PRECALC_34(Y12) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_123() { + CALC_F3_PRE(0x15c, ESI) + PRECALC_35(Y3) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_124() { + CALC_F3_PRE(0x170, EBX) + PRECALC_36(Y3) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_125() { + CALC_F3_PRE(0x174, ECX) + PRECALC_37(Y3) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_126() { + CALC_F3_PRE(0x178, EDX) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_127() { + CALC_F3_PRE(0x17c, EAX) + PRECALC_39(Y3, 0x60, 0x1e0) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_128() { + CALC_F3_PRE(0x190, EDI) + PRECALC_32(Y5, Y3) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_129() { + CALC_F3_PRE(0x194, ESI) + PRECALC_33(Y14, Y15) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_130() { + CALC_F3_PRE(0x198, EBX) + PRECALC_34(Y8) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_131() { + CALC_F3_PRE(0x19c, ECX) + PRECALC_35(Y15) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_132() { + CALC_F3_PRE(0x1b0, EDX) + PRECALC_36(Y15) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_133() { + CALC_F3_PRE(0x1b4, EAX) + PRECALC_37(Y15) + CALC_F3_POST(EDX, EBX, ESI, EAX, ECX) +} + +func CALC_134() { + CALC_F3_PRE(0x1b8, EDI) + CALC_F3_POST(EAX, ECX, EBX, EDI, EDX) +} + +func CALC_135() { + CALC_F3_PRE(0x1bc, ESI) + PRECALC_39(Y15, 0x60, 0x200) + CALC_F3_POST(EDI, EDX, ECX, ESI, EAX) +} + +func CALC_136() { + CALC_F3_PRE(0x1d0, EBX) + PRECALC_32(Y3, Y15) + CALC_F3_POST(ESI, EAX, EDX, EBX, EDI) +} + +func CALC_137() { + CALC_F3_PRE(0x1d4, ECX) + PRECALC_33(Y13, Y14) + CALC_F3_POST(EBX, EDI, EAX, ECX, ESI) +} + +func CALC_138() { + CALC_F3_PRE(0x1d8, EDX) + PRECALC_34(Y7) + CALC_F3_POST(ECX, ESI, EDI, EDX, EBX) +} + +func CALC_139() { + CALC_F2_PRE(0x1dc, EDX, ECX, EAX) + PRECALC_35(Y14) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_140() { + CALC_F2_PRE(0x1f0, EAX, EDX, EDI) + PRECALC_36(Y14) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_141() { + CALC_F2_PRE(0x1f4, EDI, EAX, ESI) + PRECALC_37(Y14) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_142() { + CALC_F2_PRE(0x1f8, ESI, EDI, EBX) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_143() { + CALC_F2_PRE(0x1fc, EBX, ESI, ECX) + PRECALC_39(Y14, 0x60, 0x220) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_144() { + CALC_F2_PRE(0x210, ECX, EBX, EDX) + PRECALC_32(Y15, Y14) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_145() { + CALC_F2_PRE(0x214, EDX, ECX, EAX) + PRECALC_33(Y12, Y13) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_146() { + CALC_F2_PRE(0x218, EAX, EDX, EDI) + PRECALC_34(Y5) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_147() { + CALC_F2_PRE(0x21c, EDI, EAX, ESI) + PRECALC_35(Y13) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_148() { + CALC_F2_PRE(0x230, ESI, EDI, EBX) + PRECALC_36(Y13) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_149() { + CALC_F2_PRE(0x234, EBX, ESI, ECX) + PRECALC_37(Y13) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_150() { + CALC_F2_PRE(0x238, ECX, EBX, EDX) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_151() { + CALC_F2_PRE(0x23c, EDX, ECX, 
EAX) + PRECALC_39(Y13, 0x60, 0x240) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_152() { + CALC_F2_PRE(0x250, EAX, EDX, EDI) + PRECALC_32(Y14, Y13) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_153() { + CALC_F2_PRE(0x254, EDI, EAX, ESI) + PRECALC_33(Y8, Y12) + CALC_F2_POST(EDI, EDX, ECX, ESI) +} + +func CALC_154() { + CALC_F2_PRE(0x258, ESI, EDI, EBX) + PRECALC_34(Y3) + CALC_F2_POST(ESI, EAX, EDX, EBX) +} + +func CALC_155() { + CALC_F2_PRE(0x25c, EBX, ESI, ECX) + PRECALC_35(Y12) + CALC_F2_POST(EBX, EDI, EAX, ECX) +} + +func CALC_156() { + CALC_F2_PRE(0x270, ECX, EBX, EDX) + PRECALC_36(Y12) + CALC_F2_POST(ECX, ESI, EDI, EDX) +} + +func CALC_157() { + CALC_F2_PRE(0x274, EDX, ECX, EAX) + PRECALC_37(Y12) + CALC_F2_POST(EDX, EBX, ESI, EAX) +} + +func CALC_158() { + CALC_F2_PRE(0x278, EAX, EDX, EDI) + CALC_F2_POST(EAX, ECX, EBX, EDI) +} + +func CALC_159() { + ADDL(Mem{Base: R15}.Offset(0x27c), ESI) + LEAL(Mem{Base: SI, Index: AX, Scale: 1}, ESI) + RORXL(Imm(0x1b), EDI, R12L) + PRECALC_39(Y12, 0x60, 0x260) + ADDL(R12L, ESI) +} + +func CALC() { + MOVL(Mem{Base: R9}, ECX) + MOVL(Mem{Base: R9}.Offset(4), ESI) + MOVL(Mem{Base: R9}.Offset(8), EDI) + MOVL(Mem{Base: R9}.Offset(12), EAX) + MOVL(Mem{Base: R9}.Offset(16), EDX) + MOVQ(RSP, R14) + LEAQ(Mem{Base: SP}.Offset(2*4*80+32), R15) + PRECALC() // Precalc WK for first 2 blocks + XCHGQ(R15, R14) + loop_avx2() + begin() +} + +// this loops is unrolled +func loop_avx2() { + Label("loop") + CMPQ(R10, R8) // we use R8 value (set below) as a signal of a last block + JNE(LabelRef("begin")) + VZEROUPPER() + RET() +} + +func begin() { + Label("begin") + CALC_0() + CALC_1() + CALC_2() + CALC_3() + CALC_4() + CALC_5() + CALC_6() + CALC_7() + CALC_8() + CALC_9() + CALC_10() + CALC_11() + CALC_12() + CALC_13() + CALC_14() + CALC_15() + CALC_16() + CALC_17() + CALC_18() + CALC_19() + CALC_20() + CALC_21() + CALC_22() + CALC_23() + CALC_24() + CALC_25() + CALC_26() + CALC_27() + CALC_28() + CALC_29() + CALC_30() + CALC_31() + CALC_32() + CALC_33() + CALC_34() + CALC_35() + CALC_36() + CALC_37() + CALC_38() + CALC_39() + CALC_40() + CALC_41() + CALC_42() + CALC_43() + CALC_44() + CALC_45() + CALC_46() + CALC_47() + CALC_48() + CALC_49() + CALC_50() + CALC_51() + CALC_52() + CALC_53() + CALC_54() + CALC_55() + CALC_56() + CALC_57() + CALC_58() + CALC_59() + ADDQ(Imm(128), R10) // move to next even-64-byte block + CMPQ(R10, R11) // is current block the last one? + CMOVQCC(R8, R10) // signal the last iteration smartly + CALC_60() + CALC_61() + CALC_62() + CALC_63() + CALC_64() + CALC_65() + CALC_66() + CALC_67() + CALC_68() + CALC_69() + CALC_70() + CALC_71() + CALC_72() + CALC_73() + CALC_74() + CALC_75() + CALC_76() + CALC_77() + CALC_78() + CALC_79() + UPDATE_HASH(EAX, EDX, EBX, ESI, EDI) + CMPQ(R10, R8) // is current block the last one? 
+ JE(LabelRef("loop")) + MOVL(EDX, ECX) + CALC_80() + CALC_81() + CALC_82() + CALC_83() + CALC_84() + CALC_85() + CALC_86() + CALC_87() + CALC_88() + CALC_89() + CALC_90() + CALC_91() + CALC_92() + CALC_93() + CALC_94() + CALC_95() + CALC_96() + CALC_97() + CALC_98() + CALC_99() + CALC_100() + CALC_101() + CALC_102() + CALC_103() + CALC_104() + CALC_105() + CALC_106() + CALC_107() + CALC_108() + CALC_109() + CALC_110() + CALC_111() + CALC_112() + CALC_113() + CALC_114() + CALC_115() + CALC_116() + CALC_117() + CALC_118() + CALC_119() + CALC_120() + CALC_121() + CALC_122() + CALC_123() + CALC_124() + CALC_125() + CALC_126() + CALC_127() + CALC_128() + CALC_129() + CALC_130() + CALC_131() + CALC_132() + CALC_133() + CALC_134() + CALC_135() + CALC_136() + CALC_137() + CALC_138() + CALC_139() + ADDQ(Imm(128), R13) //move to next even-64-byte block + CMPQ(R13, R11) //is current block the last one? + CMOVQCC(R8, R10) + CALC_140() + CALC_141() + CALC_142() + CALC_143() + CALC_144() + CALC_145() + CALC_146() + CALC_147() + CALC_148() + CALC_149() + CALC_150() + CALC_151() + CALC_152() + CALC_153() + CALC_154() + CALC_155() + CALC_156() + CALC_157() + CALC_158() + CALC_159() + UPDATE_HASH(ESI, EDI, EDX, ECX, EBX) + MOVL(ESI, R12L) + MOVL(EDI, ESI) + MOVL(EDX, EDI) + MOVL(EBX, EDX) + MOVL(ECX, EAX) + MOVL(R12L, ECX) + XCHGQ(R15, R14) + JMP(LabelRef("loop")) +} + +func blockAVX2() { + Implement("blockAVX2") + AllocLocal(1408) + + Load(Param("dig"), RDI) + Load(Param("p").Base(), RSI) + Load(Param("p").Len(), RDX) + SHRQ(Imm(6), RDX) + SHLQ(Imm(6), RDX) + + K_XMM_AR := K_XMM_AR_DATA() + LEAQ(K_XMM_AR, R8) + + MOVQ(RDI, R9) + MOVQ(RSI, R10) + LEAQ(Mem{Base: SI}.Offset(64), R13) + + ADDQ(RSI, RDX) + ADDQ(Imm(64), RDX) + MOVQ(RDX, R11) + + CMPQ(R13, R11) + CMOVQCC(R8, R13) + + BSWAP_SHUFB_CTL := BSWAP_SHUFB_CTL_DATA() + VMOVDQU(BSWAP_SHUFB_CTL, Y10) + CALC() +} + +// ##~~~~~~~~~~~~~~~~~~~~~~~~~~DATA SECTION~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~## + +// Pointers for memoizing Data section symbols +var ( + K_XMM_AR_ptr, BSWAP_SHUFB_CTL_ptr *Mem +) + +// To hold Round Constants for K_XMM_AR_DATA + +var _K = []uint32{ + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6, +} + +func K_XMM_AR_DATA() Mem { + if K_XMM_AR_ptr != nil { + return *K_XMM_AR_ptr + } + + K_XMM_AR := GLOBL("K_XMM_AR", RODATA) + K_XMM_AR_ptr = &K_XMM_AR + + offset_idx := 0 + for _, v := range _K { + DATA((offset_idx+0)*4, U32(v)) + DATA((offset_idx+1)*4, U32(v)) + DATA((offset_idx+2)*4, U32(v)) + DATA((offset_idx+3)*4, U32(v)) + DATA((offset_idx+4)*4, U32(v)) + DATA((offset_idx+5)*4, U32(v)) + DATA((offset_idx+6)*4, U32(v)) + DATA((offset_idx+7)*4, U32(v)) + offset_idx += 8 + } + return K_XMM_AR +} + +var BSWAP_SHUFB_CTL_CONSTANTS = [8]uint32{ + 0x00010203, + 0x04050607, + 0x08090a0b, + 0x0c0d0e0f, + 0x00010203, + 0x04050607, + 0x08090a0b, + 0x0c0d0e0f, +} + +func BSWAP_SHUFB_CTL_DATA() Mem { + if BSWAP_SHUFB_CTL_ptr != nil { + return *BSWAP_SHUFB_CTL_ptr + } + + BSWAP_SHUFB_CTL := GLOBL("BSWAP_SHUFB_CTL", RODATA) + BSWAP_SHUFB_CTL_ptr = &BSWAP_SHUFB_CTL + for i, v := range BSWAP_SHUFB_CTL_CONSTANTS { + + DATA(i*4, U32(v)) + } + return BSWAP_SHUFB_CTL +} diff --git a/src/crypto/sha1/sha1block_amd64.s b/src/crypto/sha1/sha1block_amd64.s index 6508612d890..9c7aa146774 100644 --- a/src/crypto/sha1/sha1block_amd64.s +++ b/src/crypto/sha1/sha1block_amd64.s @@ -1,1501 +1,3051 @@ -// Copyright 2013 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -// AVX2 version by Intel, same algorithm as code in Linux kernel: -// https://github.com/torvalds/linux/blob/master/arch/x86/crypto/sha1_avx2_x86_64_asm.S -// Authors: -// Ilya Albrekht -// Maxim Locktyukhin -// Ronen Zohar -// Chandramouli Narayanan +// Code generated by command: go run sha1block_amd64_asm.go -out ../sha1block_amd64.s -pkg sha1. DO NOT EDIT. //go:build !purego #include "textflag.h" -// SHA-1 block routine. See sha1block.go for Go equivalent. -// -// There are 80 rounds of 4 types: -// - rounds 0-15 are type 1 and load data (ROUND1 macro). -// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). -// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). -// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). -// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). -// -// Each round loads or shuffles the data, then computes a per-round -// function of b, c, d, and then mixes the result into and rotates the -// five registers a, b, c, d, e holding the intermediate results. -// -// The register rotation is implemented by rotating the arguments to -// the round macros instead of by explicit move instructions. - -#define LOAD(index) \ - MOVL (index*4)(SI), R10; \ - BSWAPL R10; \ - MOVL R10, (index*4)(SP) - -#define SHUFFLE(index) \ - MOVL (((index)&0xf)*4)(SP), R10; \ - XORL (((index-3)&0xf)*4)(SP), R10; \ - XORL (((index-8)&0xf)*4)(SP), R10; \ - XORL (((index-14)&0xf)*4)(SP), R10; \ - ROLL $1, R10; \ - MOVL R10, (((index)&0xf)*4)(SP) - -#define FUNC1(a, b, c, d, e) \ - MOVL d, R9; \ - XORL c, R9; \ - ANDL b, R9; \ - XORL d, R9 - -#define FUNC2(a, b, c, d, e) \ - MOVL b, R9; \ - XORL c, R9; \ - XORL d, R9 - -#define FUNC3(a, b, c, d, e) \ - MOVL b, R8; \ - ORL c, R8; \ - ANDL d, R8; \ - MOVL b, R9; \ - ANDL c, R9; \ - ORL R8, R9 - -#define FUNC4 FUNC2 - -#define MIX(a, b, c, d, e, const) \ - ROLL $30, b; \ - ADDL R9, e; \ - MOVL a, R8; \ - ROLL $5, R8; \ - LEAL const(e)(R10*1), e; \ - ADDL R8, e - -#define ROUND1(a, b, c, d, e, index) \ - LOAD(index); \ - FUNC1(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x5A827999) - -#define ROUND1x(a, b, c, d, e, index) \ - SHUFFLE(index); \ - FUNC1(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x5A827999) - -#define ROUND2(a, b, c, d, e, index) \ - SHUFFLE(index); \ - FUNC2(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x6ED9EBA1) - -#define ROUND3(a, b, c, d, e, index) \ - SHUFFLE(index); \ - FUNC3(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0x8F1BBCDC) - -#define ROUND4(a, b, c, d, e, index) \ - SHUFFLE(index); \ - FUNC4(a, b, c, d, e); \ - MIX(a, b, c, d, e, 0xCA62C1D6) - -TEXT ·blockAMD64(SB),NOSPLIT,$64-32 - MOVQ dig+0(FP), BP - MOVQ p_base+8(FP), SI - MOVQ p_len+16(FP), DX - SHRQ $6, DX - SHLQ $6, DX - - LEAQ (SI)(DX*1), DI - MOVL (0*4)(BP), AX - MOVL (1*4)(BP), BX - MOVL (2*4)(BP), CX - MOVL (3*4)(BP), DX - MOVL (4*4)(BP), BP - - CMPQ SI, DI - JEQ end +// func blockAMD64(dig *digest, p []byte) +TEXT ·blockAMD64(SB), NOSPLIT, $64-32 + MOVQ dig+0(FP), BP + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $0x06, DX + SHLQ $0x06, DX + LEAQ (SI)(DX*1), DI + MOVL (BP), AX + MOVL 4(BP), BX + MOVL 8(BP), CX + MOVL 12(BP), DX + MOVL 16(BP), BP + CMPQ SI, DI + JEQ end loop: - MOVL AX, R11 - MOVL BX, R12 - MOVL CX, R13 - MOVL DX, R14 - MOVL BP, R15 - - ROUND1(AX, BX, CX, DX, BP, 0) - ROUND1(BP, AX, BX, CX, DX, 1) - ROUND1(DX, BP, AX, BX, CX, 2) - ROUND1(CX, DX, BP, AX, BX, 3) - ROUND1(BX, CX, DX, BP, AX, 4) - ROUND1(AX, BX, CX, DX, BP, 5) - ROUND1(BP, AX, BX, CX, DX, 6) - ROUND1(DX, BP, AX, BX, CX, 7) - ROUND1(CX, DX, 
BP, AX, BX, 8) - ROUND1(BX, CX, DX, BP, AX, 9) - ROUND1(AX, BX, CX, DX, BP, 10) - ROUND1(BP, AX, BX, CX, DX, 11) - ROUND1(DX, BP, AX, BX, CX, 12) - ROUND1(CX, DX, BP, AX, BX, 13) - ROUND1(BX, CX, DX, BP, AX, 14) - ROUND1(AX, BX, CX, DX, BP, 15) - - ROUND1x(BP, AX, BX, CX, DX, 16) - ROUND1x(DX, BP, AX, BX, CX, 17) - ROUND1x(CX, DX, BP, AX, BX, 18) - ROUND1x(BX, CX, DX, BP, AX, 19) - - ROUND2(AX, BX, CX, DX, BP, 20) - ROUND2(BP, AX, BX, CX, DX, 21) - ROUND2(DX, BP, AX, BX, CX, 22) - ROUND2(CX, DX, BP, AX, BX, 23) - ROUND2(BX, CX, DX, BP, AX, 24) - ROUND2(AX, BX, CX, DX, BP, 25) - ROUND2(BP, AX, BX, CX, DX, 26) - ROUND2(DX, BP, AX, BX, CX, 27) - ROUND2(CX, DX, BP, AX, BX, 28) - ROUND2(BX, CX, DX, BP, AX, 29) - ROUND2(AX, BX, CX, DX, BP, 30) - ROUND2(BP, AX, BX, CX, DX, 31) - ROUND2(DX, BP, AX, BX, CX, 32) - ROUND2(CX, DX, BP, AX, BX, 33) - ROUND2(BX, CX, DX, BP, AX, 34) - ROUND2(AX, BX, CX, DX, BP, 35) - ROUND2(BP, AX, BX, CX, DX, 36) - ROUND2(DX, BP, AX, BX, CX, 37) - ROUND2(CX, DX, BP, AX, BX, 38) - ROUND2(BX, CX, DX, BP, AX, 39) - - ROUND3(AX, BX, CX, DX, BP, 40) - ROUND3(BP, AX, BX, CX, DX, 41) - ROUND3(DX, BP, AX, BX, CX, 42) - ROUND3(CX, DX, BP, AX, BX, 43) - ROUND3(BX, CX, DX, BP, AX, 44) - ROUND3(AX, BX, CX, DX, BP, 45) - ROUND3(BP, AX, BX, CX, DX, 46) - ROUND3(DX, BP, AX, BX, CX, 47) - ROUND3(CX, DX, BP, AX, BX, 48) - ROUND3(BX, CX, DX, BP, AX, 49) - ROUND3(AX, BX, CX, DX, BP, 50) - ROUND3(BP, AX, BX, CX, DX, 51) - ROUND3(DX, BP, AX, BX, CX, 52) - ROUND3(CX, DX, BP, AX, BX, 53) - ROUND3(BX, CX, DX, BP, AX, 54) - ROUND3(AX, BX, CX, DX, BP, 55) - ROUND3(BP, AX, BX, CX, DX, 56) - ROUND3(DX, BP, AX, BX, CX, 57) - ROUND3(CX, DX, BP, AX, BX, 58) - ROUND3(BX, CX, DX, BP, AX, 59) - - ROUND4(AX, BX, CX, DX, BP, 60) - ROUND4(BP, AX, BX, CX, DX, 61) - ROUND4(DX, BP, AX, BX, CX, 62) - ROUND4(CX, DX, BP, AX, BX, 63) - ROUND4(BX, CX, DX, BP, AX, 64) - ROUND4(AX, BX, CX, DX, BP, 65) - ROUND4(BP, AX, BX, CX, DX, 66) - ROUND4(DX, BP, AX, BX, CX, 67) - ROUND4(CX, DX, BP, AX, BX, 68) - ROUND4(BX, CX, DX, BP, AX, 69) - ROUND4(AX, BX, CX, DX, BP, 70) - ROUND4(BP, AX, BX, CX, DX, 71) - ROUND4(DX, BP, AX, BX, CX, 72) - ROUND4(CX, DX, BP, AX, BX, 73) - ROUND4(BX, CX, DX, BP, AX, 74) - ROUND4(AX, BX, CX, DX, BP, 75) - ROUND4(BP, AX, BX, CX, DX, 76) - ROUND4(DX, BP, AX, BX, CX, 77) - ROUND4(CX, DX, BP, AX, BX, 78) - ROUND4(BX, CX, DX, BP, AX, 79) - - ADDL R11, AX - ADDL R12, BX - ADDL R13, CX - ADDL R14, DX - ADDL R15, BP - - ADDQ $64, SI - CMPQ SI, DI - JB loop + MOVL AX, R11 + MOVL BX, R12 + MOVL CX, R13 + MOVL DX, R14 + MOVL BP, R15 + MOVL (SI), R10 + BSWAPL R10 + MOVL R10, (SP) + MOVL DX, R9 + XORL CX, R9 + ANDL BX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BP)(R10*1), BP + ADDL R8, BP + MOVL 4(SI), R10 + BSWAPL R10 + MOVL R10, 4(SP) + MOVL CX, R9 + XORL BX, R9 + ANDL AX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1518500249(DX)(R10*1), DX + ADDL R8, DX + MOVL 8(SI), R10 + BSWAPL R10 + MOVL R10, 8(SP) + MOVL BX, R9 + XORL AX, R9 + ANDL BP, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1518500249(CX)(R10*1), CX + ADDL R8, CX + MOVL 12(SI), R10 + BSWAPL R10 + MOVL R10, 12(SP) + MOVL AX, R9 + XORL BP, R9 + ANDL DX, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BX)(R10*1), BX + ADDL R8, BX + MOVL 16(SI), R10 + BSWAPL R10 + MOVL R10, 16(SP) + MOVL BP, R9 + XORL DX, R9 + ANDL CX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, 
AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1518500249(AX)(R10*1), AX + ADDL R8, AX + MOVL 20(SI), R10 + BSWAPL R10 + MOVL R10, 20(SP) + MOVL DX, R9 + XORL CX, R9 + ANDL BX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BP)(R10*1), BP + ADDL R8, BP + MOVL 24(SI), R10 + BSWAPL R10 + MOVL R10, 24(SP) + MOVL CX, R9 + XORL BX, R9 + ANDL AX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1518500249(DX)(R10*1), DX + ADDL R8, DX + MOVL 28(SI), R10 + BSWAPL R10 + MOVL R10, 28(SP) + MOVL BX, R9 + XORL AX, R9 + ANDL BP, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1518500249(CX)(R10*1), CX + ADDL R8, CX + MOVL 32(SI), R10 + BSWAPL R10 + MOVL R10, 32(SP) + MOVL AX, R9 + XORL BP, R9 + ANDL DX, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BX)(R10*1), BX + ADDL R8, BX + MOVL 36(SI), R10 + BSWAPL R10 + MOVL R10, 36(SP) + MOVL BP, R9 + XORL DX, R9 + ANDL CX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1518500249(AX)(R10*1), AX + ADDL R8, AX + MOVL 40(SI), R10 + BSWAPL R10 + MOVL R10, 40(SP) + MOVL DX, R9 + XORL CX, R9 + ANDL BX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BP)(R10*1), BP + ADDL R8, BP + MOVL 44(SI), R10 + BSWAPL R10 + MOVL R10, 44(SP) + MOVL CX, R9 + XORL BX, R9 + ANDL AX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1518500249(DX)(R10*1), DX + ADDL R8, DX + MOVL 48(SI), R10 + BSWAPL R10 + MOVL R10, 48(SP) + MOVL BX, R9 + XORL AX, R9 + ANDL BP, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1518500249(CX)(R10*1), CX + ADDL R8, CX + MOVL 52(SI), R10 + BSWAPL R10 + MOVL R10, 52(SP) + MOVL AX, R9 + XORL BP, R9 + ANDL DX, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BX)(R10*1), BX + ADDL R8, BX + MOVL 56(SI), R10 + BSWAPL R10 + MOVL R10, 56(SP) + MOVL BP, R9 + XORL DX, R9 + ANDL CX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1518500249(AX)(R10*1), AX + ADDL R8, AX + MOVL 60(SI), R10 + BSWAPL R10 + MOVL R10, 60(SP) + MOVL DX, R9 + XORL CX, R9 + ANDL BX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BP)(R10*1), BP + ADDL R8, BP + MOVL (SP), R10 + XORL 52(SP), R10 + XORL 32(SP), R10 + XORL 8(SP), R10 + ROLL $0x01, R10 + MOVL R10, (SP) + MOVL CX, R9 + XORL BX, R9 + ANDL AX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1518500249(DX)(R10*1), DX + ADDL R8, DX + MOVL 4(SP), R10 + XORL 56(SP), R10 + XORL 36(SP), R10 + XORL 12(SP), R10 + ROLL $0x01, R10 + MOVL R10, 4(SP) + MOVL BX, R9 + XORL AX, R9 + ANDL BP, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1518500249(CX)(R10*1), CX + ADDL R8, CX + MOVL 8(SP), R10 + XORL 60(SP), R10 + XORL 40(SP), R10 + XORL 16(SP), R10 + ROLL $0x01, R10 + MOVL R10, 8(SP) + MOVL AX, R9 + XORL BP, R9 + ANDL DX, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1518500249(BX)(R10*1), BX + ADDL R8, BX + MOVL 12(SP), R10 + XORL (SP), R10 + XORL 44(SP), R10 + XORL 20(SP), R10 + ROLL $0x01, R10 + MOVL R10, 12(SP) + MOVL BP, R9 + XORL DX, R9 + ANDL CX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1518500249(AX)(R10*1), AX + ADDL R8, 
AX + MOVL 16(SP), R10 + XORL 4(SP), R10 + XORL 48(SP), R10 + XORL 24(SP), R10 + ROLL $0x01, R10 + MOVL R10, 16(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BP)(R10*1), BP + ADDL R8, BP + MOVL 20(SP), R10 + XORL 8(SP), R10 + XORL 52(SP), R10 + XORL 28(SP), R10 + ROLL $0x01, R10 + MOVL R10, 20(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1859775393(DX)(R10*1), DX + ADDL R8, DX + MOVL 24(SP), R10 + XORL 12(SP), R10 + XORL 56(SP), R10 + XORL 32(SP), R10 + ROLL $0x01, R10 + MOVL R10, 24(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1859775393(CX)(R10*1), CX + ADDL R8, CX + MOVL 28(SP), R10 + XORL 16(SP), R10 + XORL 60(SP), R10 + XORL 36(SP), R10 + ROLL $0x01, R10 + MOVL R10, 28(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BX)(R10*1), BX + ADDL R8, BX + MOVL 32(SP), R10 + XORL 20(SP), R10 + XORL (SP), R10 + XORL 40(SP), R10 + ROLL $0x01, R10 + MOVL R10, 32(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1859775393(AX)(R10*1), AX + ADDL R8, AX + MOVL 36(SP), R10 + XORL 24(SP), R10 + XORL 4(SP), R10 + XORL 44(SP), R10 + ROLL $0x01, R10 + MOVL R10, 36(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BP)(R10*1), BP + ADDL R8, BP + MOVL 40(SP), R10 + XORL 28(SP), R10 + XORL 8(SP), R10 + XORL 48(SP), R10 + ROLL $0x01, R10 + MOVL R10, 40(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1859775393(DX)(R10*1), DX + ADDL R8, DX + MOVL 44(SP), R10 + XORL 32(SP), R10 + XORL 12(SP), R10 + XORL 52(SP), R10 + ROLL $0x01, R10 + MOVL R10, 44(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1859775393(CX)(R10*1), CX + ADDL R8, CX + MOVL 48(SP), R10 + XORL 36(SP), R10 + XORL 16(SP), R10 + XORL 56(SP), R10 + ROLL $0x01, R10 + MOVL R10, 48(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BX)(R10*1), BX + ADDL R8, BX + MOVL 52(SP), R10 + XORL 40(SP), R10 + XORL 20(SP), R10 + XORL 60(SP), R10 + ROLL $0x01, R10 + MOVL R10, 52(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1859775393(AX)(R10*1), AX + ADDL R8, AX + MOVL 56(SP), R10 + XORL 44(SP), R10 + XORL 24(SP), R10 + XORL (SP), R10 + ROLL $0x01, R10 + MOVL R10, 56(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BP)(R10*1), BP + ADDL R8, BP + MOVL 60(SP), R10 + XORL 48(SP), R10 + XORL 28(SP), R10 + XORL 4(SP), R10 + ROLL $0x01, R10 + MOVL R10, 60(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1859775393(DX)(R10*1), DX + ADDL R8, DX + MOVL (SP), R10 + XORL 52(SP), R10 + XORL 32(SP), R10 + XORL 8(SP), R10 + ROLL $0x01, R10 + MOVL R10, (SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1859775393(CX)(R10*1), CX + ADDL R8, CX + MOVL 4(SP), R10 + XORL 56(SP), R10 + XORL 36(SP), R10 + XORL 12(SP), R10 + ROLL $0x01, R10 + MOVL R10, 4(SP) + MOVL DX, R9 + XORL 
BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BX)(R10*1), BX + ADDL R8, BX + MOVL 8(SP), R10 + XORL 60(SP), R10 + XORL 40(SP), R10 + XORL 16(SP), R10 + ROLL $0x01, R10 + MOVL R10, 8(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1859775393(AX)(R10*1), AX + ADDL R8, AX + MOVL 12(SP), R10 + XORL (SP), R10 + XORL 44(SP), R10 + XORL 20(SP), R10 + ROLL $0x01, R10 + MOVL R10, 12(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BP)(R10*1), BP + ADDL R8, BP + MOVL 16(SP), R10 + XORL 4(SP), R10 + XORL 48(SP), R10 + XORL 24(SP), R10 + ROLL $0x01, R10 + MOVL R10, 16(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 1859775393(DX)(R10*1), DX + ADDL R8, DX + MOVL 20(SP), R10 + XORL 8(SP), R10 + XORL 52(SP), R10 + XORL 28(SP), R10 + ROLL $0x01, R10 + MOVL R10, 20(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 1859775393(CX)(R10*1), CX + ADDL R8, CX + MOVL 24(SP), R10 + XORL 12(SP), R10 + XORL 56(SP), R10 + XORL 32(SP), R10 + ROLL $0x01, R10 + MOVL R10, 24(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 1859775393(BX)(R10*1), BX + ADDL R8, BX + MOVL 28(SP), R10 + XORL 16(SP), R10 + XORL 60(SP), R10 + XORL 36(SP), R10 + ROLL $0x01, R10 + MOVL R10, 28(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 1859775393(AX)(R10*1), AX + ADDL R8, AX + MOVL 32(SP), R10 + XORL 20(SP), R10 + XORL (SP), R10 + XORL 40(SP), R10 + ROLL $0x01, R10 + MOVL R10, 32(SP) + MOVL BX, R8 + ORL CX, R8 + ANDL DX, R8 + MOVL BX, R9 + ANDL CX, R9 + ORL R8, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BP)(R10*1), BP + ADDL R8, BP + MOVL 36(SP), R10 + XORL 24(SP), R10 + XORL 4(SP), R10 + XORL 44(SP), R10 + ROLL $0x01, R10 + MOVL R10, 36(SP) + MOVL AX, R8 + ORL BX, R8 + ANDL CX, R8 + MOVL AX, R9 + ANDL BX, R9 + ORL R8, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 2400959708(DX)(R10*1), DX + ADDL R8, DX + MOVL 40(SP), R10 + XORL 28(SP), R10 + XORL 8(SP), R10 + XORL 48(SP), R10 + ROLL $0x01, R10 + MOVL R10, 40(SP) + MOVL BP, R8 + ORL AX, R8 + ANDL BX, R8 + MOVL BP, R9 + ANDL AX, R9 + ORL R8, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 2400959708(CX)(R10*1), CX + ADDL R8, CX + MOVL 44(SP), R10 + XORL 32(SP), R10 + XORL 12(SP), R10 + XORL 52(SP), R10 + ROLL $0x01, R10 + MOVL R10, 44(SP) + MOVL DX, R8 + ORL BP, R8 + ANDL AX, R8 + MOVL DX, R9 + ANDL BP, R9 + ORL R8, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BX)(R10*1), BX + ADDL R8, BX + MOVL 48(SP), R10 + XORL 36(SP), R10 + XORL 16(SP), R10 + XORL 56(SP), R10 + ROLL $0x01, R10 + MOVL R10, 48(SP) + MOVL CX, R8 + ORL DX, R8 + ANDL BP, R8 + MOVL CX, R9 + ANDL DX, R9 + ORL R8, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 2400959708(AX)(R10*1), AX + ADDL R8, AX + MOVL 52(SP), R10 + XORL 40(SP), R10 + XORL 20(SP), R10 + XORL 60(SP), R10 + ROLL $0x01, R10 + MOVL R10, 52(SP) + MOVL BX, R8 + ORL CX, R8 + ANDL DX, R8 + MOVL BX, R9 + ANDL CX, R9 + ORL R8, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BP)(R10*1), BP + ADDL R8, BP + MOVL 56(SP), R10 + XORL 
44(SP), R10 + XORL 24(SP), R10 + XORL (SP), R10 + ROLL $0x01, R10 + MOVL R10, 56(SP) + MOVL AX, R8 + ORL BX, R8 + ANDL CX, R8 + MOVL AX, R9 + ANDL BX, R9 + ORL R8, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 2400959708(DX)(R10*1), DX + ADDL R8, DX + MOVL 60(SP), R10 + XORL 48(SP), R10 + XORL 28(SP), R10 + XORL 4(SP), R10 + ROLL $0x01, R10 + MOVL R10, 60(SP) + MOVL BP, R8 + ORL AX, R8 + ANDL BX, R8 + MOVL BP, R9 + ANDL AX, R9 + ORL R8, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 2400959708(CX)(R10*1), CX + ADDL R8, CX + MOVL (SP), R10 + XORL 52(SP), R10 + XORL 32(SP), R10 + XORL 8(SP), R10 + ROLL $0x01, R10 + MOVL R10, (SP) + MOVL DX, R8 + ORL BP, R8 + ANDL AX, R8 + MOVL DX, R9 + ANDL BP, R9 + ORL R8, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BX)(R10*1), BX + ADDL R8, BX + MOVL 4(SP), R10 + XORL 56(SP), R10 + XORL 36(SP), R10 + XORL 12(SP), R10 + ROLL $0x01, R10 + MOVL R10, 4(SP) + MOVL CX, R8 + ORL DX, R8 + ANDL BP, R8 + MOVL CX, R9 + ANDL DX, R9 + ORL R8, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 2400959708(AX)(R10*1), AX + ADDL R8, AX + MOVL 8(SP), R10 + XORL 60(SP), R10 + XORL 40(SP), R10 + XORL 16(SP), R10 + ROLL $0x01, R10 + MOVL R10, 8(SP) + MOVL BX, R8 + ORL CX, R8 + ANDL DX, R8 + MOVL BX, R9 + ANDL CX, R9 + ORL R8, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BP)(R10*1), BP + ADDL R8, BP + MOVL 12(SP), R10 + XORL (SP), R10 + XORL 44(SP), R10 + XORL 20(SP), R10 + ROLL $0x01, R10 + MOVL R10, 12(SP) + MOVL AX, R8 + ORL BX, R8 + ANDL CX, R8 + MOVL AX, R9 + ANDL BX, R9 + ORL R8, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 2400959708(DX)(R10*1), DX + ADDL R8, DX + MOVL 16(SP), R10 + XORL 4(SP), R10 + XORL 48(SP), R10 + XORL 24(SP), R10 + ROLL $0x01, R10 + MOVL R10, 16(SP) + MOVL BP, R8 + ORL AX, R8 + ANDL BX, R8 + MOVL BP, R9 + ANDL AX, R9 + ORL R8, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 2400959708(CX)(R10*1), CX + ADDL R8, CX + MOVL 20(SP), R10 + XORL 8(SP), R10 + XORL 52(SP), R10 + XORL 28(SP), R10 + ROLL $0x01, R10 + MOVL R10, 20(SP) + MOVL DX, R8 + ORL BP, R8 + ANDL AX, R8 + MOVL DX, R9 + ANDL BP, R9 + ORL R8, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BX)(R10*1), BX + ADDL R8, BX + MOVL 24(SP), R10 + XORL 12(SP), R10 + XORL 56(SP), R10 + XORL 32(SP), R10 + ROLL $0x01, R10 + MOVL R10, 24(SP) + MOVL CX, R8 + ORL DX, R8 + ANDL BP, R8 + MOVL CX, R9 + ANDL DX, R9 + ORL R8, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 2400959708(AX)(R10*1), AX + ADDL R8, AX + MOVL 28(SP), R10 + XORL 16(SP), R10 + XORL 60(SP), R10 + XORL 36(SP), R10 + ROLL $0x01, R10 + MOVL R10, 28(SP) + MOVL BX, R8 + ORL CX, R8 + ANDL DX, R8 + MOVL BX, R9 + ANDL CX, R9 + ORL R8, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BP)(R10*1), BP + ADDL R8, BP + MOVL 32(SP), R10 + XORL 20(SP), R10 + XORL (SP), R10 + XORL 40(SP), R10 + ROLL $0x01, R10 + MOVL R10, 32(SP) + MOVL AX, R8 + ORL BX, R8 + ANDL CX, R8 + MOVL AX, R9 + ANDL BX, R9 + ORL R8, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 2400959708(DX)(R10*1), DX + ADDL R8, DX + MOVL 36(SP), R10 + XORL 24(SP), R10 + XORL 4(SP), R10 + XORL 44(SP), R10 + ROLL $0x01, R10 + MOVL R10, 36(SP) + MOVL BP, R8 + ORL AX, R8 + ANDL BX, R8 + MOVL BP, R9 + ANDL AX, R9 + ORL R8, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, 
R8 + LEAL 2400959708(CX)(R10*1), CX + ADDL R8, CX + MOVL 40(SP), R10 + XORL 28(SP), R10 + XORL 8(SP), R10 + XORL 48(SP), R10 + ROLL $0x01, R10 + MOVL R10, 40(SP) + MOVL DX, R8 + ORL BP, R8 + ANDL AX, R8 + MOVL DX, R9 + ANDL BP, R9 + ORL R8, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 2400959708(BX)(R10*1), BX + ADDL R8, BX + MOVL 44(SP), R10 + XORL 32(SP), R10 + XORL 12(SP), R10 + XORL 52(SP), R10 + ROLL $0x01, R10 + MOVL R10, 44(SP) + MOVL CX, R8 + ORL DX, R8 + ANDL BP, R8 + MOVL CX, R9 + ANDL DX, R9 + ORL R8, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 2400959708(AX)(R10*1), AX + ADDL R8, AX + MOVL 48(SP), R10 + XORL 36(SP), R10 + XORL 16(SP), R10 + XORL 56(SP), R10 + ROLL $0x01, R10 + MOVL R10, 48(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BP)(R10*1), BP + ADDL R8, BP + MOVL 52(SP), R10 + XORL 40(SP), R10 + XORL 20(SP), R10 + XORL 60(SP), R10 + ROLL $0x01, R10 + MOVL R10, 52(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 3395469782(DX)(R10*1), DX + ADDL R8, DX + MOVL 56(SP), R10 + XORL 44(SP), R10 + XORL 24(SP), R10 + XORL (SP), R10 + ROLL $0x01, R10 + MOVL R10, 56(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 3395469782(CX)(R10*1), CX + ADDL R8, CX + MOVL 60(SP), R10 + XORL 48(SP), R10 + XORL 28(SP), R10 + XORL 4(SP), R10 + ROLL $0x01, R10 + MOVL R10, 60(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BX)(R10*1), BX + ADDL R8, BX + MOVL (SP), R10 + XORL 52(SP), R10 + XORL 32(SP), R10 + XORL 8(SP), R10 + ROLL $0x01, R10 + MOVL R10, (SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 3395469782(AX)(R10*1), AX + ADDL R8, AX + MOVL 4(SP), R10 + XORL 56(SP), R10 + XORL 36(SP), R10 + XORL 12(SP), R10 + ROLL $0x01, R10 + MOVL R10, 4(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BP)(R10*1), BP + ADDL R8, BP + MOVL 8(SP), R10 + XORL 60(SP), R10 + XORL 40(SP), R10 + XORL 16(SP), R10 + ROLL $0x01, R10 + MOVL R10, 8(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 3395469782(DX)(R10*1), DX + ADDL R8, DX + MOVL 12(SP), R10 + XORL (SP), R10 + XORL 44(SP), R10 + XORL 20(SP), R10 + ROLL $0x01, R10 + MOVL R10, 12(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 3395469782(CX)(R10*1), CX + ADDL R8, CX + MOVL 16(SP), R10 + XORL 4(SP), R10 + XORL 48(SP), R10 + XORL 24(SP), R10 + ROLL $0x01, R10 + MOVL R10, 16(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BX)(R10*1), BX + ADDL R8, BX + MOVL 20(SP), R10 + XORL 8(SP), R10 + XORL 52(SP), R10 + XORL 28(SP), R10 + ROLL $0x01, R10 + MOVL R10, 20(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 3395469782(AX)(R10*1), AX + ADDL R8, AX + MOVL 24(SP), R10 + XORL 12(SP), R10 + XORL 56(SP), R10 + XORL 32(SP), R10 + ROLL $0x01, R10 + MOVL R10, 24(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BP)(R10*1), BP + ADDL R8, BP + MOVL 
28(SP), R10 + XORL 16(SP), R10 + XORL 60(SP), R10 + XORL 36(SP), R10 + ROLL $0x01, R10 + MOVL R10, 28(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 3395469782(DX)(R10*1), DX + ADDL R8, DX + MOVL 32(SP), R10 + XORL 20(SP), R10 + XORL (SP), R10 + XORL 40(SP), R10 + ROLL $0x01, R10 + MOVL R10, 32(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 3395469782(CX)(R10*1), CX + ADDL R8, CX + MOVL 36(SP), R10 + XORL 24(SP), R10 + XORL 4(SP), R10 + XORL 44(SP), R10 + ROLL $0x01, R10 + MOVL R10, 36(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BX)(R10*1), BX + ADDL R8, BX + MOVL 40(SP), R10 + XORL 28(SP), R10 + XORL 8(SP), R10 + XORL 48(SP), R10 + ROLL $0x01, R10 + MOVL R10, 40(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 3395469782(AX)(R10*1), AX + ADDL R8, AX + MOVL 44(SP), R10 + XORL 32(SP), R10 + XORL 12(SP), R10 + XORL 52(SP), R10 + ROLL $0x01, R10 + MOVL R10, 44(SP) + MOVL BX, R9 + XORL CX, R9 + XORL DX, R9 + ROLL $0x1e, BX + ADDL R9, BP + MOVL AX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BP)(R10*1), BP + ADDL R8, BP + MOVL 48(SP), R10 + XORL 36(SP), R10 + XORL 16(SP), R10 + XORL 56(SP), R10 + ROLL $0x01, R10 + MOVL R10, 48(SP) + MOVL AX, R9 + XORL BX, R9 + XORL CX, R9 + ROLL $0x1e, AX + ADDL R9, DX + MOVL BP, R8 + ROLL $0x05, R8 + LEAL 3395469782(DX)(R10*1), DX + ADDL R8, DX + MOVL 52(SP), R10 + XORL 40(SP), R10 + XORL 20(SP), R10 + XORL 60(SP), R10 + ROLL $0x01, R10 + MOVL R10, 52(SP) + MOVL BP, R9 + XORL AX, R9 + XORL BX, R9 + ROLL $0x1e, BP + ADDL R9, CX + MOVL DX, R8 + ROLL $0x05, R8 + LEAL 3395469782(CX)(R10*1), CX + ADDL R8, CX + MOVL 56(SP), R10 + XORL 44(SP), R10 + XORL 24(SP), R10 + XORL (SP), R10 + ROLL $0x01, R10 + MOVL R10, 56(SP) + MOVL DX, R9 + XORL BP, R9 + XORL AX, R9 + ROLL $0x1e, DX + ADDL R9, BX + MOVL CX, R8 + ROLL $0x05, R8 + LEAL 3395469782(BX)(R10*1), BX + ADDL R8, BX + MOVL 60(SP), R10 + XORL 48(SP), R10 + XORL 28(SP), R10 + XORL 4(SP), R10 + ROLL $0x01, R10 + MOVL R10, 60(SP) + MOVL CX, R9 + XORL DX, R9 + XORL BP, R9 + ROLL $0x1e, CX + ADDL R9, AX + MOVL BX, R8 + ROLL $0x05, R8 + LEAL 3395469782(AX)(R10*1), AX + ADDL R8, AX + ADDL R11, AX + ADDL R12, BX + ADDL R13, CX + ADDL R14, DX + ADDL R15, BP + ADDQ $0x40, SI + CMPQ SI, DI + JB loop end: - MOVQ dig+0(FP), DI - MOVL AX, (0*4)(DI) - MOVL BX, (1*4)(DI) - MOVL CX, (2*4)(DI) - MOVL DX, (3*4)(DI) - MOVL BP, (4*4)(DI) + MOVQ dig+0(FP), DI + MOVL AX, (DI) + MOVL BX, 4(DI) + MOVL CX, 8(DI) + MOVL DX, 12(DI) + MOVL BP, 16(DI) RET - -// This is the implementation using AVX2, BMI1 and BMI2. It is based on: -// "SHA-1 implementation with Intel(R) AVX2 instruction set extensions" -// From http://software.intel.com/en-us/articles -// (look for improving-the-performance-of-the-secure-hash-algorithm-1) -// This implementation is 2x unrolled, and interleaves vector instructions, -// used to precompute W, with scalar computation of current round -// for optimal scheduling. - -// Trivial helper macros. 
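For reference, both the scalar rounds above and the AVX2 macros below compute the standard SHA-1 compression function. A minimal plain-Go sketch of what one 64-byte block does (illustrative only; the function name sha1Block is hypothetical, and the round constants 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6 are the same values that appear in the LEAL instructions and in the K_XMM_AR table):

package main

import "math/bits"

// sha1Block sketches the scalar algorithm the assembly implements: the
// schedule w[i] = rol1(w[i-3]^w[i-8]^w[i-14]^w[i-16]) and the round update
// e += rol5(a) + f(b,c,d) + K + w[i], with the working registers rotated
// a,b,c,d,e -> T,a,rol30(b),c,d every round.
func sha1Block(h *[5]uint32, p []byte) {
	var w [16]uint32
	for len(p) >= 64 {
		for i := 0; i < 16; i++ {
			w[i] = uint32(p[4*i])<<24 | uint32(p[4*i+1])<<16 | uint32(p[4*i+2])<<8 | uint32(p[4*i+3])
		}
		a, b, c, d, e := h[0], h[1], h[2], h[3], h[4]
		for i := 0; i < 80; i++ {
			if i >= 16 {
				// Rolling 16-word window for the message schedule.
				w[i&0xf] = bits.RotateLeft32(w[(i-3)&0xf]^w[(i-8)&0xf]^w[(i-14)&0xf]^w[(i-16)&0xf], 1)
			}
			var f, k uint32
			switch {
			case i < 20:
				f, k = (b&c)|(^b&d), 0x5a827999 // Ch, the ANDN-based rounds
			case i < 40:
				f, k = b^c^d, 0x6ed9eba1 // Parity
			case i < 60:
				f, k = (b&c)|(b&d)|(c&d), 0x8f1bbcdc // Maj, the OR/AND pattern above
			default:
				f, k = b^c^d, 0xca62c1d6 // Parity again
			}
			a, b, c, d, e = bits.RotateLeft32(a, 5)+f+e+k+w[i&0xf], a, bits.RotateLeft32(b, 30), c, d
		}
		h[0] += a
		h[1] += b
		h[2] += c
		h[3] += d
		h[4] += e
		p = p[64:]
	}
}

The UPDATE_HASH, PRECALC_* and CALC_* macros that follow are the hand-written AVX2 counterparts of this loop: PRECALC_* vectorizes the w[i] schedule two blocks at a time, while CALC_* interleaves the scalar round updates with that precomputation.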
-#define UPDATE_HASH(A,TB,C,D,E) \ - ADDL (R9), A \ - MOVL A, (R9) \ - ADDL 4(R9), TB \ - MOVL TB, 4(R9) \ - ADDL 8(R9), C \ - MOVL C, 8(R9) \ - ADDL 12(R9), D \ - MOVL D, 12(R9) \ - ADDL 16(R9), E \ - MOVL E, 16(R9) - - - -// Helper macros for PRECALC, which does precomputations -#define PRECALC_0(OFFSET) \ - VMOVDQU OFFSET(R10),X0 - -#define PRECALC_1(OFFSET) \ - VINSERTI128 $1, OFFSET(R13), Y0, Y0 - -#define PRECALC_2(YREG) \ - VPSHUFB Y10, Y0, YREG - -#define PRECALC_4(YREG,K_OFFSET) \ - VPADDD K_OFFSET(R8), YREG, Y0 - -#define PRECALC_7(OFFSET) \ - VMOVDQU Y0, (OFFSET*2)(R14) - - -// Message scheduling pre-compute for rounds 0-15 -// R13 is a pointer to even 64-byte block -// R10 is a pointer to odd 64-byte block -// R14 is a pointer to temp buffer -// X0 is used as temp register -// YREG is clobbered as part of computation -// OFFSET chooses 16 byte chunk within a block -// R8 is a pointer to constants block -// K_OFFSET chooses K constants relevant to this round -// X10 holds swap mask -#define PRECALC_00_15(OFFSET,YREG) \ - PRECALC_0(OFFSET) \ - PRECALC_1(OFFSET) \ - PRECALC_2(YREG) \ - PRECALC_4(YREG,0x0) \ - PRECALC_7(OFFSET) - - -// Helper macros for PRECALC_16_31 -#define PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ - VPALIGNR $8, REG_SUB_16, REG_SUB_12, REG \ // w[i-14] - VPSRLDQ $4, REG_SUB_4, Y0 // w[i-3] - -#define PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ - VPXOR REG_SUB_8, REG, REG \ - VPXOR REG_SUB_16, Y0, Y0 - -#define PRECALC_18(REG) \ - VPXOR Y0, REG, REG \ - VPSLLDQ $12, REG, Y9 - -#define PRECALC_19(REG) \ - VPSLLD $1, REG, Y0 \ - VPSRLD $31, REG, REG - -#define PRECALC_20(REG) \ - VPOR REG, Y0, Y0 \ - VPSLLD $2, Y9, REG - -#define PRECALC_21(REG) \ - VPSRLD $30, Y9, Y9 \ - VPXOR REG, Y0, Y0 - -#define PRECALC_23(REG,K_OFFSET,OFFSET) \ - VPXOR Y9, Y0, REG \ - VPADDD K_OFFSET(R8), REG, Y0 \ - VMOVDQU Y0, (OFFSET)(R14) - -// Message scheduling pre-compute for rounds 16-31 -// calculating last 32 w[i] values in 8 XMM registers -// pre-calculate K+w[i] values and store to mem -// for later load by ALU add instruction. -// "brute force" vectorization for rounds 16-31 only -// due to w[i]->w[i-3] dependency. 
-// clobbers 5 input ymm registers REG_SUB* -// uses X0 and X9 as temp registers -// As always, R8 is a pointer to constants block -// and R14 is a pointer to temp buffer -#define PRECALC_16_31(REG,REG_SUB_4,REG_SUB_8,REG_SUB_12,REG_SUB_16,K_OFFSET,OFFSET) \ - PRECALC_16(REG_SUB_16,REG_SUB_12,REG_SUB_4,REG) \ - PRECALC_17(REG_SUB_16,REG_SUB_8,REG) \ - PRECALC_18(REG) \ - PRECALC_19(REG) \ - PRECALC_20(REG) \ - PRECALC_21(REG) \ - PRECALC_23(REG,K_OFFSET,OFFSET) - - -// Helper macros for PRECALC_32_79 -#define PRECALC_32(REG_SUB_8,REG_SUB_4) \ - VPALIGNR $8, REG_SUB_8, REG_SUB_4, Y0 - -#define PRECALC_33(REG_SUB_28,REG) \ - VPXOR REG_SUB_28, REG, REG - -#define PRECALC_34(REG_SUB_16) \ - VPXOR REG_SUB_16, Y0, Y0 - -#define PRECALC_35(REG) \ - VPXOR Y0, REG, REG - -#define PRECALC_36(REG) \ - VPSLLD $2, REG, Y0 - -#define PRECALC_37(REG) \ - VPSRLD $30, REG, REG \ - VPOR REG, Y0, REG - -#define PRECALC_39(REG,K_OFFSET,OFFSET) \ - VPADDD K_OFFSET(R8), REG, Y0 \ - VMOVDQU Y0, (OFFSET)(R14) - -// Message scheduling pre-compute for rounds 32-79 -// In SHA-1 specification we have: -// w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 -// Which is the same as: -// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 -// This allows for more efficient vectorization, -// since w[i]->w[i-3] dependency is broken -#define PRECALC_32_79(REG,REG_SUB_4,REG_SUB_8,REG_SUB_16,REG_SUB_28,K_OFFSET,OFFSET) \ - PRECALC_32(REG_SUB_8,REG_SUB_4) \ - PRECALC_33(REG_SUB_28,REG) \ - PRECALC_34(REG_SUB_16) \ - PRECALC_35(REG) \ - PRECALC_36(REG) \ - PRECALC_37(REG) \ - PRECALC_39(REG,K_OFFSET,OFFSET) - -#define PRECALC \ - PRECALC_00_15(0,Y15) \ - PRECALC_00_15(0x10,Y14) \ - PRECALC_00_15(0x20,Y13) \ - PRECALC_00_15(0x30,Y12) \ - PRECALC_16_31(Y8,Y12,Y13,Y14,Y15,0,0x80) \ - PRECALC_16_31(Y7,Y8,Y12,Y13,Y14,0x20,0xa0) \ - PRECALC_16_31(Y5,Y7,Y8,Y12,Y13,0x20,0xc0) \ - PRECALC_16_31(Y3,Y5,Y7,Y8,Y12,0x20,0xe0) \ - PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x20,0x100) \ - PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x20,0x120) \ - PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x40,0x140) \ - PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x40,0x160) \ - PRECALC_32_79(Y8,Y12,Y13,Y15,Y7,0x40,0x180) \ - PRECALC_32_79(Y7,Y8,Y12,Y14,Y5,0x40,0x1a0) \ - PRECALC_32_79(Y5,Y7,Y8,Y13,Y3,0x40,0x1c0) \ - PRECALC_32_79(Y3,Y5,Y7,Y12,Y15,0x60,0x1e0) \ - PRECALC_32_79(Y15,Y3,Y5,Y8,Y14,0x60,0x200) \ - PRECALC_32_79(Y14,Y15,Y3,Y7,Y13,0x60,0x220) \ - PRECALC_32_79(Y13,Y14,Y15,Y5,Y12,0x60,0x240) \ - PRECALC_32_79(Y12,Y13,Y14,Y3,Y8,0x60,0x260) - -// Macros calculating individual rounds have general form -// CALC_ROUND_PRE + PRECALC_ROUND + CALC_ROUND_POST -// CALC_ROUND_{PRE,POST} macros follow - -#define CALC_F1_PRE(OFFSET,REG_A,REG_B,REG_C,REG_E) \ - ADDL OFFSET(R15),REG_E \ - ANDNL REG_C,REG_A,BP \ - LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round - RORXL $0x1b, REG_A, R12 \ - RORXL $2, REG_A, REG_B // for next round - -// Calculate F for the next round -#define CALC_F1_POST(REG_A,REG_B,REG_E) \ - ANDL REG_B,REG_A \ // b&c - XORL BP, REG_A \ // F1 = (b&c) ^ (~b&d) - LEAL (REG_E)(R12*1), REG_E // E += A >>> 5 - - -// Registers are cyclically rotated DX -> AX -> DI -> SI -> BX -> CX -#define CALC_0 \ - MOVL SI, BX \ // Precalculating first round - RORXL $2, SI, SI \ - ANDNL AX, BX, BP \ - ANDL DI, BX \ - XORL BP, BX \ - CALC_F1_PRE(0x0,CX,BX,DI,DX) \ - PRECALC_0(0x80) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_1 \ - CALC_F1_PRE(0x4,DX,CX,SI,AX) \ - PRECALC_1(0x80) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_2 \ - CALC_F1_PRE(0x8,AX,DX,BX,DI) \ - PRECALC_2(Y15) \ - 
CALC_F1_POST(AX,CX,DI) - -#define CALC_3 \ - CALC_F1_PRE(0xc,DI,AX,CX,SI) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_4 \ - CALC_F1_PRE(0x20,SI,DI,DX,BX) \ - PRECALC_4(Y15,0x0) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_5 \ - CALC_F1_PRE(0x24,BX,SI,AX,CX) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_6 \ - CALC_F1_PRE(0x28,CX,BX,DI,DX) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_7 \ - CALC_F1_PRE(0x2c,DX,CX,SI,AX) \ - PRECALC_7(0x0) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_8 \ - CALC_F1_PRE(0x40,AX,DX,BX,DI) \ - PRECALC_0(0x90) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_9 \ - CALC_F1_PRE(0x44,DI,AX,CX,SI) \ - PRECALC_1(0x90) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_10 \ - CALC_F1_PRE(0x48,SI,DI,DX,BX) \ - PRECALC_2(Y14) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_11 \ - CALC_F1_PRE(0x4c,BX,SI,AX,CX) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_12 \ - CALC_F1_PRE(0x60,CX,BX,DI,DX) \ - PRECALC_4(Y14,0x0) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_13 \ - CALC_F1_PRE(0x64,DX,CX,SI,AX) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_14 \ - CALC_F1_PRE(0x68,AX,DX,BX,DI) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_15 \ - CALC_F1_PRE(0x6c,DI,AX,CX,SI) \ - PRECALC_7(0x10) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_16 \ - CALC_F1_PRE(0x80,SI,DI,DX,BX) \ - PRECALC_0(0xa0) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_17 \ - CALC_F1_PRE(0x84,BX,SI,AX,CX) \ - PRECALC_1(0xa0) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_18 \ - CALC_F1_PRE(0x88,CX,BX,DI,DX) \ - PRECALC_2(Y13) \ - CALC_F1_POST(CX,SI,DX) - - -#define CALC_F2_PRE(OFFSET,REG_A,REG_B,REG_E) \ - ADDL OFFSET(R15),REG_E \ - LEAL (REG_E)(REG_B*1), REG_E \ // Add F from the previous round - RORXL $0x1b, REG_A, R12 \ - RORXL $2, REG_A, REG_B // for next round - -#define CALC_F2_POST(REG_A,REG_B,REG_C,REG_E) \ - XORL REG_B, REG_A \ - ADDL R12, REG_E \ - XORL REG_C, REG_A - -#define CALC_19 \ - CALC_F2_PRE(0x8c,DX,CX,AX) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_20 \ - CALC_F2_PRE(0xa0,AX,DX,DI) \ - PRECALC_4(Y13,0x0) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_21 \ - CALC_F2_PRE(0xa4,DI,AX,SI) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_22 \ - CALC_F2_PRE(0xa8,SI,DI,BX) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_23 \ - CALC_F2_PRE(0xac,BX,SI,CX) \ - PRECALC_7(0x20) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_24 \ - CALC_F2_PRE(0xc0,CX,BX,DX) \ - PRECALC_0(0xb0) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_25 \ - CALC_F2_PRE(0xc4,DX,CX,AX) \ - PRECALC_1(0xb0) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_26 \ - CALC_F2_PRE(0xc8,AX,DX,DI) \ - PRECALC_2(Y12) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_27 \ - CALC_F2_PRE(0xcc,DI,AX,SI) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_28 \ - CALC_F2_PRE(0xe0,SI,DI,BX) \ - PRECALC_4(Y12,0x0) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_29 \ - CALC_F2_PRE(0xe4,BX,SI,CX) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_30 \ - CALC_F2_PRE(0xe8,CX,BX,DX) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_31 \ - CALC_F2_PRE(0xec,DX,CX,AX) \ - PRECALC_7(0x30) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_32 \ - CALC_F2_PRE(0x100,AX,DX,DI) \ - PRECALC_16(Y15,Y14,Y12,Y8) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_33 \ - CALC_F2_PRE(0x104,DI,AX,SI) \ - PRECALC_17(Y15,Y13,Y8) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_34 \ - CALC_F2_PRE(0x108,SI,DI,BX) \ - PRECALC_18(Y8) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_35 \ - CALC_F2_PRE(0x10c,BX,SI,CX) \ - PRECALC_19(Y8) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_36 \ - CALC_F2_PRE(0x120,CX,BX,DX) \ - PRECALC_20(Y8) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_37 \ 
- CALC_F2_PRE(0x124,DX,CX,AX) \ - PRECALC_21(Y8) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_38 \ - CALC_F2_PRE(0x128,AX,DX,DI) \ - CALC_F2_POST(AX,CX,BX,DI) - - -#define CALC_F3_PRE(OFFSET,REG_E) \ - ADDL OFFSET(R15),REG_E - -#define CALC_F3_POST(REG_A,REG_B,REG_C,REG_E,REG_TB) \ - LEAL (REG_E)(REG_TB*1), REG_E \ // Add F from the previous round - MOVL REG_B, BP \ - ORL REG_A, BP \ - RORXL $0x1b, REG_A, R12 \ - RORXL $2, REG_A, REG_TB \ - ANDL REG_C, BP \ // Calculate F for the next round - ANDL REG_B, REG_A \ - ORL BP, REG_A \ - ADDL R12, REG_E - -#define CALC_39 \ - CALC_F3_PRE(0x12c,SI) \ - PRECALC_23(Y8,0x0,0x80) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_40 \ - CALC_F3_PRE(0x140,BX) \ - PRECALC_16(Y14,Y13,Y8,Y7) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_41 \ - CALC_F3_PRE(0x144,CX) \ - PRECALC_17(Y14,Y12,Y7) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_42 \ - CALC_F3_PRE(0x148,DX) \ - PRECALC_18(Y7) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_43 \ - CALC_F3_PRE(0x14c,AX) \ - PRECALC_19(Y7) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_44 \ - CALC_F3_PRE(0x160,DI) \ - PRECALC_20(Y7) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_45 \ - CALC_F3_PRE(0x164,SI) \ - PRECALC_21(Y7) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_46 \ - CALC_F3_PRE(0x168,BX) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_47 \ - CALC_F3_PRE(0x16c,CX) \ - VPXOR Y9, Y0, Y7 \ - VPADDD 0x20(R8), Y7, Y0 \ - VMOVDQU Y0, 0xa0(R14) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_48 \ - CALC_F3_PRE(0x180,DX) \ - PRECALC_16(Y13,Y12,Y7,Y5) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_49 \ - CALC_F3_PRE(0x184,AX) \ - PRECALC_17(Y13,Y8,Y5) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_50 \ - CALC_F3_PRE(0x188,DI) \ - PRECALC_18(Y5) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_51 \ - CALC_F3_PRE(0x18c,SI) \ - PRECALC_19(Y5) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_52 \ - CALC_F3_PRE(0x1a0,BX) \ - PRECALC_20(Y5) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_53 \ - CALC_F3_PRE(0x1a4,CX) \ - PRECALC_21(Y5) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_54 \ - CALC_F3_PRE(0x1a8,DX) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_55 \ - CALC_F3_PRE(0x1ac,AX) \ - PRECALC_23(Y5,0x20,0xc0) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_56 \ - CALC_F3_PRE(0x1c0,DI) \ - PRECALC_16(Y12,Y8,Y5,Y3) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_57 \ - CALC_F3_PRE(0x1c4,SI) \ - PRECALC_17(Y12,Y7,Y3) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_58 \ - CALC_F3_PRE(0x1c8,BX) \ - PRECALC_18(Y3) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_59 \ - CALC_F2_PRE(0x1cc,BX,SI,CX) \ - PRECALC_19(Y3) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_60 \ - CALC_F2_PRE(0x1e0,CX,BX,DX) \ - PRECALC_20(Y3) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_61 \ - CALC_F2_PRE(0x1e4,DX,CX,AX) \ - PRECALC_21(Y3) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_62 \ - CALC_F2_PRE(0x1e8,AX,DX,DI) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_63 \ - CALC_F2_PRE(0x1ec,DI,AX,SI) \ - PRECALC_23(Y3,0x20,0xe0) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_64 \ - CALC_F2_PRE(0x200,SI,DI,BX) \ - PRECALC_32(Y5,Y3) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_65 \ - CALC_F2_PRE(0x204,BX,SI,CX) \ - PRECALC_33(Y14,Y15) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_66 \ - CALC_F2_PRE(0x208,CX,BX,DX) \ - PRECALC_34(Y8) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_67 \ - CALC_F2_PRE(0x20c,DX,CX,AX) \ - PRECALC_35(Y15) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_68 \ - CALC_F2_PRE(0x220,AX,DX,DI) \ - 
PRECALC_36(Y15) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_69 \ - CALC_F2_PRE(0x224,DI,AX,SI) \ - PRECALC_37(Y15) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_70 \ - CALC_F2_PRE(0x228,SI,DI,BX) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_71 \ - CALC_F2_PRE(0x22c,BX,SI,CX) \ - PRECALC_39(Y15,0x20,0x100) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_72 \ - CALC_F2_PRE(0x240,CX,BX,DX) \ - PRECALC_32(Y3,Y15) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_73 \ - CALC_F2_PRE(0x244,DX,CX,AX) \ - PRECALC_33(Y13,Y14) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_74 \ - CALC_F2_PRE(0x248,AX,DX,DI) \ - PRECALC_34(Y7) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_75 \ - CALC_F2_PRE(0x24c,DI,AX,SI) \ - PRECALC_35(Y14) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_76 \ - CALC_F2_PRE(0x260,SI,DI,BX) \ - PRECALC_36(Y14) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_77 \ - CALC_F2_PRE(0x264,BX,SI,CX) \ - PRECALC_37(Y14) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_78 \ - CALC_F2_PRE(0x268,CX,BX,DX) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_79 \ - ADDL 0x26c(R15), AX \ - LEAL (AX)(CX*1), AX \ - RORXL $0x1b, DX, R12 \ - PRECALC_39(Y14,0x20,0x120) \ - ADDL R12, AX - -// Similar to CALC_0 -#define CALC_80 \ - MOVL CX, DX \ - RORXL $2, CX, CX \ - ANDNL SI, DX, BP \ - ANDL BX, DX \ - XORL BP, DX \ - CALC_F1_PRE(0x10,AX,DX,BX,DI) \ - PRECALC_32(Y15,Y14) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_81 \ - CALC_F1_PRE(0x14,DI,AX,CX,SI) \ - PRECALC_33(Y12,Y13) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_82 \ - CALC_F1_PRE(0x18,SI,DI,DX,BX) \ - PRECALC_34(Y5) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_83 \ - CALC_F1_PRE(0x1c,BX,SI,AX,CX) \ - PRECALC_35(Y13) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_84 \ - CALC_F1_PRE(0x30,CX,BX,DI,DX) \ - PRECALC_36(Y13) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_85 \ - CALC_F1_PRE(0x34,DX,CX,SI,AX) \ - PRECALC_37(Y13) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_86 \ - CALC_F1_PRE(0x38,AX,DX,BX,DI) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_87 \ - CALC_F1_PRE(0x3c,DI,AX,CX,SI) \ - PRECALC_39(Y13,0x40,0x140) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_88 \ - CALC_F1_PRE(0x50,SI,DI,DX,BX) \ - PRECALC_32(Y14,Y13) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_89 \ - CALC_F1_PRE(0x54,BX,SI,AX,CX) \ - PRECALC_33(Y8,Y12) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_90 \ - CALC_F1_PRE(0x58,CX,BX,DI,DX) \ - PRECALC_34(Y3) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_91 \ - CALC_F1_PRE(0x5c,DX,CX,SI,AX) \ - PRECALC_35(Y12) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_92 \ - CALC_F1_PRE(0x70,AX,DX,BX,DI) \ - PRECALC_36(Y12) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_93 \ - CALC_F1_PRE(0x74,DI,AX,CX,SI) \ - PRECALC_37(Y12) \ - CALC_F1_POST(DI,DX,SI) - -#define CALC_94 \ - CALC_F1_PRE(0x78,SI,DI,DX,BX) \ - CALC_F1_POST(SI,AX,BX) - -#define CALC_95 \ - CALC_F1_PRE(0x7c,BX,SI,AX,CX) \ - PRECALC_39(Y12,0x40,0x160) \ - CALC_F1_POST(BX,DI,CX) - -#define CALC_96 \ - CALC_F1_PRE(0x90,CX,BX,DI,DX) \ - PRECALC_32(Y13,Y12) \ - CALC_F1_POST(CX,SI,DX) - -#define CALC_97 \ - CALC_F1_PRE(0x94,DX,CX,SI,AX) \ - PRECALC_33(Y7,Y8) \ - CALC_F1_POST(DX,BX,AX) - -#define CALC_98 \ - CALC_F1_PRE(0x98,AX,DX,BX,DI) \ - PRECALC_34(Y15) \ - CALC_F1_POST(AX,CX,DI) - -#define CALC_99 \ - CALC_F2_PRE(0x9c,DI,AX,SI) \ - PRECALC_35(Y8) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_100 \ - CALC_F2_PRE(0xb0,SI,DI,BX) \ - PRECALC_36(Y8) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_101 \ - CALC_F2_PRE(0xb4,BX,SI,CX) \ - PRECALC_37(Y8) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_102 \ - CALC_F2_PRE(0xb8,CX,BX,DX) \ - 
CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_103 \ - CALC_F2_PRE(0xbc,DX,CX,AX) \ - PRECALC_39(Y8,0x40,0x180) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_104 \ - CALC_F2_PRE(0xd0,AX,DX,DI) \ - PRECALC_32(Y12,Y8) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_105 \ - CALC_F2_PRE(0xd4,DI,AX,SI) \ - PRECALC_33(Y5,Y7) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_106 \ - CALC_F2_PRE(0xd8,SI,DI,BX) \ - PRECALC_34(Y14) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_107 \ - CALC_F2_PRE(0xdc,BX,SI,CX) \ - PRECALC_35(Y7) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_108 \ - CALC_F2_PRE(0xf0,CX,BX,DX) \ - PRECALC_36(Y7) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_109 \ - CALC_F2_PRE(0xf4,DX,CX,AX) \ - PRECALC_37(Y7) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_110 \ - CALC_F2_PRE(0xf8,AX,DX,DI) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_111 \ - CALC_F2_PRE(0xfc,DI,AX,SI) \ - PRECALC_39(Y7,0x40,0x1a0) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_112 \ - CALC_F2_PRE(0x110,SI,DI,BX) \ - PRECALC_32(Y8,Y7) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_113 \ - CALC_F2_PRE(0x114,BX,SI,CX) \ - PRECALC_33(Y3,Y5) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_114 \ - CALC_F2_PRE(0x118,CX,BX,DX) \ - PRECALC_34(Y13) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_115 \ - CALC_F2_PRE(0x11c,DX,CX,AX) \ - PRECALC_35(Y5) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_116 \ - CALC_F2_PRE(0x130,AX,DX,DI) \ - PRECALC_36(Y5) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_117 \ - CALC_F2_PRE(0x134,DI,AX,SI) \ - PRECALC_37(Y5) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_118 \ - CALC_F2_PRE(0x138,SI,DI,BX) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_119 \ - CALC_F3_PRE(0x13c,CX) \ - PRECALC_39(Y5,0x40,0x1c0) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_120 \ - CALC_F3_PRE(0x150,DX) \ - PRECALC_32(Y7,Y5) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_121 \ - CALC_F3_PRE(0x154,AX) \ - PRECALC_33(Y15,Y3) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_122 \ - CALC_F3_PRE(0x158,DI) \ - PRECALC_34(Y12) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_123 \ - CALC_F3_PRE(0x15c,SI) \ - PRECALC_35(Y3) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_124 \ - CALC_F3_PRE(0x170,BX) \ - PRECALC_36(Y3) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_125 \ - CALC_F3_PRE(0x174,CX) \ - PRECALC_37(Y3) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_126 \ - CALC_F3_PRE(0x178,DX) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_127 \ - CALC_F3_PRE(0x17c,AX) \ - PRECALC_39(Y3,0x60,0x1e0) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_128 \ - CALC_F3_PRE(0x190,DI) \ - PRECALC_32(Y5,Y3) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_129 \ - CALC_F3_PRE(0x194,SI) \ - PRECALC_33(Y14,Y15) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_130 \ - CALC_F3_PRE(0x198,BX) \ - PRECALC_34(Y8) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_131 \ - CALC_F3_PRE(0x19c,CX) \ - PRECALC_35(Y15) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_132 \ - CALC_F3_PRE(0x1b0,DX) \ - PRECALC_36(Y15) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_133 \ - CALC_F3_PRE(0x1b4,AX) \ - PRECALC_37(Y15) \ - CALC_F3_POST(DX,BX,SI,AX,CX) - -#define CALC_134 \ - CALC_F3_PRE(0x1b8,DI) \ - CALC_F3_POST(AX,CX,BX,DI,DX) - -#define CALC_135 \ - CALC_F3_PRE(0x1bc,SI) \ - PRECALC_39(Y15,0x60,0x200) \ - CALC_F3_POST(DI,DX,CX,SI,AX) - -#define CALC_136 \ - CALC_F3_PRE(0x1d0,BX) \ - PRECALC_32(Y3,Y15) \ - CALC_F3_POST(SI,AX,DX,BX,DI) - -#define CALC_137 \ - CALC_F3_PRE(0x1d4,CX) \ - PRECALC_33(Y13,Y14) \ - CALC_F3_POST(BX,DI,AX,CX,SI) - -#define CALC_138 \ - 
CALC_F3_PRE(0x1d8,DX) \ - PRECALC_34(Y7) \ - CALC_F3_POST(CX,SI,DI,DX,BX) - -#define CALC_139 \ - CALC_F2_PRE(0x1dc,DX,CX,AX) \ - PRECALC_35(Y14) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_140 \ - CALC_F2_PRE(0x1f0,AX,DX,DI) \ - PRECALC_36(Y14) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_141 \ - CALC_F2_PRE(0x1f4,DI,AX,SI) \ - PRECALC_37(Y14) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_142 \ - CALC_F2_PRE(0x1f8,SI,DI,BX) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_143 \ - CALC_F2_PRE(0x1fc,BX,SI,CX) \ - PRECALC_39(Y14,0x60,0x220) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_144 \ - CALC_F2_PRE(0x210,CX,BX,DX) \ - PRECALC_32(Y15,Y14) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_145 \ - CALC_F2_PRE(0x214,DX,CX,AX) \ - PRECALC_33(Y12,Y13) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_146 \ - CALC_F2_PRE(0x218,AX,DX,DI) \ - PRECALC_34(Y5) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_147 \ - CALC_F2_PRE(0x21c,DI,AX,SI) \ - PRECALC_35(Y13) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_148 \ - CALC_F2_PRE(0x230,SI,DI,BX) \ - PRECALC_36(Y13) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_149 \ - CALC_F2_PRE(0x234,BX,SI,CX) \ - PRECALC_37(Y13) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_150 \ - CALC_F2_PRE(0x238,CX,BX,DX) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_151 \ - CALC_F2_PRE(0x23c,DX,CX,AX) \ - PRECALC_39(Y13,0x60,0x240) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_152 \ - CALC_F2_PRE(0x250,AX,DX,DI) \ - PRECALC_32(Y14,Y13) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_153 \ - CALC_F2_PRE(0x254,DI,AX,SI) \ - PRECALC_33(Y8,Y12) \ - CALC_F2_POST(DI,DX,CX,SI) - -#define CALC_154 \ - CALC_F2_PRE(0x258,SI,DI,BX) \ - PRECALC_34(Y3) \ - CALC_F2_POST(SI,AX,DX,BX) - -#define CALC_155 \ - CALC_F2_PRE(0x25c,BX,SI,CX) \ - PRECALC_35(Y12) \ - CALC_F2_POST(BX,DI,AX,CX) - -#define CALC_156 \ - CALC_F2_PRE(0x270,CX,BX,DX) \ - PRECALC_36(Y12) \ - CALC_F2_POST(CX,SI,DI,DX) - -#define CALC_157 \ - CALC_F2_PRE(0x274,DX,CX,AX) \ - PRECALC_37(Y12) \ - CALC_F2_POST(DX,BX,SI,AX) - -#define CALC_158 \ - CALC_F2_PRE(0x278,AX,DX,DI) \ - CALC_F2_POST(AX,CX,BX,DI) - -#define CALC_159 \ - ADDL 0x27c(R15),SI \ - LEAL (SI)(AX*1), SI \ - RORXL $0x1b, DI, R12 \ - PRECALC_39(Y12,0x60,0x260) \ - ADDL R12, SI - - - -#define CALC \ - MOVL (R9), CX \ - MOVL 4(R9), SI \ - MOVL 8(R9), DI \ - MOVL 12(R9), AX \ - MOVL 16(R9), DX \ - MOVQ SP, R14 \ - LEAQ (2*4*80+32)(SP), R15 \ - PRECALC \ // Precalc WK for first 2 blocks - XCHGQ R15, R14 \ -loop: \ // this loops is unrolled - CMPQ R10, R8 \ // we use R8 value (set below) as a signal of a last block - JNE begin \ - VZEROUPPER \ - RET \ -begin: \ - CALC_0 \ - CALC_1 \ - CALC_2 \ - CALC_3 \ - CALC_4 \ - CALC_5 \ - CALC_6 \ - CALC_7 \ - CALC_8 \ - CALC_9 \ - CALC_10 \ - CALC_11 \ - CALC_12 \ - CALC_13 \ - CALC_14 \ - CALC_15 \ - CALC_16 \ - CALC_17 \ - CALC_18 \ - CALC_19 \ - CALC_20 \ - CALC_21 \ - CALC_22 \ - CALC_23 \ - CALC_24 \ - CALC_25 \ - CALC_26 \ - CALC_27 \ - CALC_28 \ - CALC_29 \ - CALC_30 \ - CALC_31 \ - CALC_32 \ - CALC_33 \ - CALC_34 \ - CALC_35 \ - CALC_36 \ - CALC_37 \ - CALC_38 \ - CALC_39 \ - CALC_40 \ - CALC_41 \ - CALC_42 \ - CALC_43 \ - CALC_44 \ - CALC_45 \ - CALC_46 \ - CALC_47 \ - CALC_48 \ - CALC_49 \ - CALC_50 \ - CALC_51 \ - CALC_52 \ - CALC_53 \ - CALC_54 \ - CALC_55 \ - CALC_56 \ - CALC_57 \ - CALC_58 \ - CALC_59 \ - ADDQ $128, R10 \ // move to next even-64-byte block - CMPQ R10, R11 \ // is current block the last one? 
- CMOVQCC R8, R10 \ // signal the last iteration smartly - CALC_60 \ - CALC_61 \ - CALC_62 \ - CALC_63 \ - CALC_64 \ - CALC_65 \ - CALC_66 \ - CALC_67 \ - CALC_68 \ - CALC_69 \ - CALC_70 \ - CALC_71 \ - CALC_72 \ - CALC_73 \ - CALC_74 \ - CALC_75 \ - CALC_76 \ - CALC_77 \ - CALC_78 \ - CALC_79 \ - UPDATE_HASH(AX,DX,BX,SI,DI) \ - CMPQ R10, R8 \ // is current block the last one? - JE loop\ - MOVL DX, CX \ - CALC_80 \ - CALC_81 \ - CALC_82 \ - CALC_83 \ - CALC_84 \ - CALC_85 \ - CALC_86 \ - CALC_87 \ - CALC_88 \ - CALC_89 \ - CALC_90 \ - CALC_91 \ - CALC_92 \ - CALC_93 \ - CALC_94 \ - CALC_95 \ - CALC_96 \ - CALC_97 \ - CALC_98 \ - CALC_99 \ - CALC_100 \ - CALC_101 \ - CALC_102 \ - CALC_103 \ - CALC_104 \ - CALC_105 \ - CALC_106 \ - CALC_107 \ - CALC_108 \ - CALC_109 \ - CALC_110 \ - CALC_111 \ - CALC_112 \ - CALC_113 \ - CALC_114 \ - CALC_115 \ - CALC_116 \ - CALC_117 \ - CALC_118 \ - CALC_119 \ - CALC_120 \ - CALC_121 \ - CALC_122 \ - CALC_123 \ - CALC_124 \ - CALC_125 \ - CALC_126 \ - CALC_127 \ - CALC_128 \ - CALC_129 \ - CALC_130 \ - CALC_131 \ - CALC_132 \ - CALC_133 \ - CALC_134 \ - CALC_135 \ - CALC_136 \ - CALC_137 \ - CALC_138 \ - CALC_139 \ - ADDQ $128, R13 \ //move to next even-64-byte block - CMPQ R13, R11 \ //is current block the last one? - CMOVQCC R8, R10 \ - CALC_140 \ - CALC_141 \ - CALC_142 \ - CALC_143 \ - CALC_144 \ - CALC_145 \ - CALC_146 \ - CALC_147 \ - CALC_148 \ - CALC_149 \ - CALC_150 \ - CALC_151 \ - CALC_152 \ - CALC_153 \ - CALC_154 \ - CALC_155 \ - CALC_156 \ - CALC_157 \ - CALC_158 \ - CALC_159 \ - UPDATE_HASH(SI,DI,DX,CX,BX) \ - MOVL SI, R12 \ //Reset state for AVX2 reg permutation - MOVL DI, SI \ - MOVL DX, DI \ - MOVL BX, DX \ - MOVL CX, AX \ - MOVL R12, CX \ - XCHGQ R15, R14 \ - JMP loop - - - -TEXT ·blockAVX2(SB),$1408-32 - - MOVQ dig+0(FP), DI - MOVQ p_base+8(FP), SI - MOVQ p_len+16(FP), DX - SHRQ $6, DX - SHLQ $6, DX - - MOVQ $K_XMM_AR<>(SB), R8 - - MOVQ DI, R9 - MOVQ SI, R10 - LEAQ 64(SI), R13 - - ADDQ SI, DX - ADDQ $64, DX - MOVQ DX, R11 - - CMPQ R13, R11 - CMOVQCC R8, R13 - - VMOVDQU BSWAP_SHUFB_CTL<>(SB), Y10 - - CALC // RET is inside macros - -DATA K_XMM_AR<>+0x00(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x04(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x08(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x0c(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x10(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x14(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x18(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x1c(SB)/4,$0x5a827999 -DATA K_XMM_AR<>+0x20(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x24(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x28(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x2c(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x30(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x34(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x38(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x3c(SB)/4,$0x6ed9eba1 -DATA K_XMM_AR<>+0x40(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x44(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x48(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x4c(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x50(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x54(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x58(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x5c(SB)/4,$0x8f1bbcdc -DATA K_XMM_AR<>+0x60(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x64(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x68(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x6c(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x70(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x74(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x78(SB)/4,$0xca62c1d6 -DATA K_XMM_AR<>+0x7c(SB)/4,$0xca62c1d6 -GLOBL K_XMM_AR<>(SB),RODATA,$128 - -DATA BSWAP_SHUFB_CTL<>+0x00(SB)/4,$0x00010203 -DATA BSWAP_SHUFB_CTL<>+0x04(SB)/4,$0x04050607 -DATA 
BSWAP_SHUFB_CTL<>+0x08(SB)/4,$0x08090a0b -DATA BSWAP_SHUFB_CTL<>+0x0c(SB)/4,$0x0c0d0e0f -DATA BSWAP_SHUFB_CTL<>+0x10(SB)/4,$0x00010203 -DATA BSWAP_SHUFB_CTL<>+0x14(SB)/4,$0x04050607 -DATA BSWAP_SHUFB_CTL<>+0x18(SB)/4,$0x08090a0b -DATA BSWAP_SHUFB_CTL<>+0x1c(SB)/4,$0x0c0d0e0f -GLOBL BSWAP_SHUFB_CTL<>(SB),RODATA,$32 +// func blockAVX2(dig *digest, p []byte) +// Requires: AVX, AVX2, BMI, BMI2, CMOV +TEXT ·blockAVX2(SB), $1408-32 + MOVQ dig+0(FP), DI + MOVQ p_base+8(FP), SI + MOVQ p_len+16(FP), DX + SHRQ $0x06, DX + SHLQ $0x06, DX + LEAQ K_XMM_AR<>+0(SB), R8 + MOVQ DI, R9 + MOVQ SI, R10 + LEAQ 64(SI), R13 + ADDQ SI, DX + ADDQ $0x40, DX + MOVQ DX, R11 + CMPQ R13, R11 + CMOVQCC R8, R13 + VMOVDQU BSWAP_SHUFB_CTL<>+0(SB), Y10 + MOVL (R9), CX + MOVL 4(R9), SI + MOVL 8(R9), DI + MOVL 12(R9), AX + MOVL 16(R9), DX + MOVQ SP, R14 + LEAQ 672(SP), R15 + VMOVDQU (R10), X0 + VINSERTI128 $0x01, (R13), Y0, Y0 + VPSHUFB Y10, Y0, Y15 + VPADDD (R8), Y15, Y0 + VMOVDQU Y0, (R14) + VMOVDQU 16(R10), X0 + VINSERTI128 $0x01, 16(R13), Y0, Y0 + VPSHUFB Y10, Y0, Y14 + VPADDD (R8), Y14, Y0 + VMOVDQU Y0, 32(R14) + VMOVDQU 32(R10), X0 + VINSERTI128 $0x01, 32(R13), Y0, Y0 + VPSHUFB Y10, Y0, Y13 + VPADDD (R8), Y13, Y0 + VMOVDQU Y0, 64(R14) + VMOVDQU 48(R10), X0 + VINSERTI128 $0x01, 48(R13), Y0, Y0 + VPSHUFB Y10, Y0, Y12 + VPADDD (R8), Y12, Y0 + VMOVDQU Y0, 96(R14) + VPALIGNR $0x08, Y15, Y14, Y8 + VPSRLDQ $0x04, Y12, Y0 + VPXOR Y13, Y8, Y8 + VPXOR Y15, Y0, Y0 + VPXOR Y0, Y8, Y8 + VPSLLDQ $0x0c, Y8, Y9 + VPSLLD $0x01, Y8, Y0 + VPSRLD $0x1f, Y8, Y8 + VPOR Y8, Y0, Y0 + VPSLLD $0x02, Y9, Y8 + VPSRLD $0x1e, Y9, Y9 + VPXOR Y8, Y0, Y0 + VPXOR Y9, Y0, Y8 + VPADDD (R8), Y8, Y0 + VMOVDQU Y0, 128(R14) + VPALIGNR $0x08, Y14, Y13, Y7 + VPSRLDQ $0x04, Y8, Y0 + VPXOR Y12, Y7, Y7 + VPXOR Y14, Y0, Y0 + VPXOR Y0, Y7, Y7 + VPSLLDQ $0x0c, Y7, Y9 + VPSLLD $0x01, Y7, Y0 + VPSRLD $0x1f, Y7, Y7 + VPOR Y7, Y0, Y0 + VPSLLD $0x02, Y9, Y7 + VPSRLD $0x1e, Y9, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y0, Y7 + VPADDD 32(R8), Y7, Y0 + VMOVDQU Y0, 160(R14) + VPALIGNR $0x08, Y13, Y12, Y5 + VPSRLDQ $0x04, Y7, Y0 + VPXOR Y8, Y5, Y5 + VPXOR Y13, Y0, Y0 + VPXOR Y0, Y5, Y5 + VPSLLDQ $0x0c, Y5, Y9 + VPSLLD $0x01, Y5, Y0 + VPSRLD $0x1f, Y5, Y5 + VPOR Y5, Y0, Y0 + VPSLLD $0x02, Y9, Y5 + VPSRLD $0x1e, Y9, Y9 + VPXOR Y5, Y0, Y0 + VPXOR Y9, Y0, Y5 + VPADDD 32(R8), Y5, Y0 + VMOVDQU Y0, 192(R14) + VPALIGNR $0x08, Y12, Y8, Y3 + VPSRLDQ $0x04, Y5, Y0 + VPXOR Y7, Y3, Y3 + VPXOR Y12, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSLLDQ $0x0c, Y3, Y9 + VPSLLD $0x01, Y3, Y0 + VPSRLD $0x1f, Y3, Y3 + VPOR Y3, Y0, Y0 + VPSLLD $0x02, Y9, Y3 + VPSRLD $0x1e, Y9, Y9 + VPXOR Y3, Y0, Y0 + VPXOR Y9, Y0, Y3 + VPADDD 32(R8), Y3, Y0 + VMOVDQU Y0, 224(R14) + VPALIGNR $0x08, Y5, Y3, Y0 + VPXOR Y14, Y15, Y15 + VPXOR Y8, Y0, Y0 + VPXOR Y0, Y15, Y15 + VPSLLD $0x02, Y15, Y0 + VPSRLD $0x1e, Y15, Y15 + VPOR Y15, Y0, Y15 + VPADDD 32(R8), Y15, Y0 + VMOVDQU Y0, 256(R14) + VPALIGNR $0x08, Y3, Y15, Y0 + VPXOR Y13, Y14, Y14 + VPXOR Y7, Y0, Y0 + VPXOR Y0, Y14, Y14 + VPSLLD $0x02, Y14, Y0 + VPSRLD $0x1e, Y14, Y14 + VPOR Y14, Y0, Y14 + VPADDD 32(R8), Y14, Y0 + VMOVDQU Y0, 288(R14) + VPALIGNR $0x08, Y15, Y14, Y0 + VPXOR Y12, Y13, Y13 + VPXOR Y5, Y0, Y0 + VPXOR Y0, Y13, Y13 + VPSLLD $0x02, Y13, Y0 + VPSRLD $0x1e, Y13, Y13 + VPOR Y13, Y0, Y13 + VPADDD 64(R8), Y13, Y0 + VMOVDQU Y0, 320(R14) + VPALIGNR $0x08, Y14, Y13, Y0 + VPXOR Y8, Y12, Y12 + VPXOR Y3, Y0, Y0 + VPXOR Y0, Y12, Y12 + VPSLLD $0x02, Y12, Y0 + VPSRLD $0x1e, Y12, Y12 + VPOR Y12, Y0, Y12 + VPADDD 64(R8), Y12, Y0 + VMOVDQU Y0, 352(R14) + VPALIGNR $0x08, Y13, Y12, Y0 + VPXOR 
Y7, Y8, Y8 + VPXOR Y15, Y0, Y0 + VPXOR Y0, Y8, Y8 + VPSLLD $0x02, Y8, Y0 + VPSRLD $0x1e, Y8, Y8 + VPOR Y8, Y0, Y8 + VPADDD 64(R8), Y8, Y0 + VMOVDQU Y0, 384(R14) + VPALIGNR $0x08, Y12, Y8, Y0 + VPXOR Y5, Y7, Y7 + VPXOR Y14, Y0, Y0 + VPXOR Y0, Y7, Y7 + VPSLLD $0x02, Y7, Y0 + VPSRLD $0x1e, Y7, Y7 + VPOR Y7, Y0, Y7 + VPADDD 64(R8), Y7, Y0 + VMOVDQU Y0, 416(R14) + VPALIGNR $0x08, Y8, Y7, Y0 + VPXOR Y3, Y5, Y5 + VPXOR Y13, Y0, Y0 + VPXOR Y0, Y5, Y5 + VPSLLD $0x02, Y5, Y0 + VPSRLD $0x1e, Y5, Y5 + VPOR Y5, Y0, Y5 + VPADDD 64(R8), Y5, Y0 + VMOVDQU Y0, 448(R14) + VPALIGNR $0x08, Y7, Y5, Y0 + VPXOR Y15, Y3, Y3 + VPXOR Y12, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSLLD $0x02, Y3, Y0 + VPSRLD $0x1e, Y3, Y3 + VPOR Y3, Y0, Y3 + VPADDD 96(R8), Y3, Y0 + VMOVDQU Y0, 480(R14) + VPALIGNR $0x08, Y5, Y3, Y0 + VPXOR Y14, Y15, Y15 + VPXOR Y8, Y0, Y0 + VPXOR Y0, Y15, Y15 + VPSLLD $0x02, Y15, Y0 + VPSRLD $0x1e, Y15, Y15 + VPOR Y15, Y0, Y15 + VPADDD 96(R8), Y15, Y0 + VMOVDQU Y0, 512(R14) + VPALIGNR $0x08, Y3, Y15, Y0 + VPXOR Y13, Y14, Y14 + VPXOR Y7, Y0, Y0 + VPXOR Y0, Y14, Y14 + VPSLLD $0x02, Y14, Y0 + VPSRLD $0x1e, Y14, Y14 + VPOR Y14, Y0, Y14 + VPADDD 96(R8), Y14, Y0 + VMOVDQU Y0, 544(R14) + VPALIGNR $0x08, Y15, Y14, Y0 + VPXOR Y12, Y13, Y13 + VPXOR Y5, Y0, Y0 + VPXOR Y0, Y13, Y13 + VPSLLD $0x02, Y13, Y0 + VPSRLD $0x1e, Y13, Y13 + VPOR Y13, Y0, Y13 + VPADDD 96(R8), Y13, Y0 + VMOVDQU Y0, 576(R14) + VPALIGNR $0x08, Y14, Y13, Y0 + VPXOR Y8, Y12, Y12 + VPXOR Y3, Y0, Y0 + VPXOR Y0, Y12, Y12 + VPSLLD $0x02, Y12, Y0 + VPSRLD $0x1e, Y12, Y12 + VPOR Y12, Y0, Y12 + VPADDD 96(R8), Y12, Y0 + VMOVDQU Y0, 608(R14) + XCHGQ R15, R14 + +loop: + CMPQ R10, R8 + JNE begin + VZEROUPPER + RET + +begin: + MOVL SI, BX + RORXL $0x02, SI, SI + ANDNL AX, BX, BP + ANDL DI, BX + XORL BP, BX + ADDL (R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VMOVDQU 128(R10), X0 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 4(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VINSERTI128 $0x01, 128(R13), Y0, Y0 + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 8(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSHUFB Y10, Y0, Y15 + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 12(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 32(R15), BX + ANDNL DX, SI, BP + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPADDD (R8), Y15, Y0 + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 36(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 40(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 44(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VMOVDQU Y0, (R14) + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 64(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VMOVDQU 144(R10), X0 + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 68(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VINSERTI128 $0x01, 144(R13), Y0, Y0 + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 72(R15), BX + ANDNL DX, SI, BP + LEAL 
(BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPSHUFB Y10, Y0, Y14 + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 76(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 96(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPADDD (R8), Y14, Y0 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 100(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 104(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 108(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VMOVDQU Y0, 32(R14) + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 128(R15), BX + ANDNL DX, SI, BP + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VMOVDQU 160(R10), X0 + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 132(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VINSERTI128 $0x01, 160(R13), Y0, Y0 + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 136(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPSHUFB Y10, Y0, Y13 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 140(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 160(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPADDD (R8), Y13, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 164(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 168(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 172(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VMOVDQU Y0, 64(R14) + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 192(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VMOVDQU 176(R10), X0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 196(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VINSERTI128 $0x01, 176(R13), Y0, Y0 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 200(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSHUFB Y10, Y0, Y12 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 204(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 224(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPADDD (R8), Y12, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 228(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 232(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 236(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VMOVDQU Y0, 96(R14) + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 256(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPALIGNR $0x08, Y15, Y14, Y8 + VPSRLDQ $0x04, Y12, Y0 + XORL CX, AX + ADDL R12, DI + 
XORL BX, AX + ADDL 260(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y13, Y8, Y8 + VPXOR Y15, Y0, Y0 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 264(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPXOR Y0, Y8, Y8 + VPSLLDQ $0x0c, Y8, Y9 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 268(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPSLLD $0x01, Y8, Y0 + VPSRLD $0x1f, Y8, Y8 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 288(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPOR Y8, Y0, Y0 + VPSLLD $0x02, Y9, Y8 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 292(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPSRLD $0x1e, Y9, Y9 + VPXOR Y8, Y0, Y0 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 296(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 300(R15), SI + VPXOR Y9, Y0, Y8 + VPADDD (R8), Y8, Y0 + VMOVDQU Y0, 128(R14) + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 320(R15), BX + VPALIGNR $0x08, Y14, Y13, Y7 + VPSRLDQ $0x04, Y8, Y0 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 324(R15), CX + VPXOR Y12, Y7, Y7 + VPXOR Y14, Y0, Y0 + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 328(R15), DX + VPXOR Y0, Y7, Y7 + VPSLLDQ $0x0c, Y7, Y9 + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 332(R15), AX + VPSLLD $0x01, Y7, Y0 + VPSRLD $0x1f, Y7, Y7 + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 352(R15), DI + VPOR Y7, Y0, Y0 + VPSLLD $0x02, Y9, Y7 + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + ORL BP, AX + ADDL R12, DI + ADDL 356(R15), SI + VPSRLD $0x1e, Y9, Y9 + VPXOR Y7, Y0, Y0 + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 360(R15), BX + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 364(R15), CX + VPXOR Y9, Y0, Y7 + VPADDD 32(R8), Y7, Y0 + VMOVDQU Y0, 160(R14) + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 384(R15), DX + VPALIGNR $0x08, Y13, Y12, Y5 + VPSRLDQ $0x04, Y7, Y0 + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 388(R15), AX + VPXOR Y8, Y5, Y5 + VPXOR Y13, Y0, Y0 + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 392(R15), DI + VPXOR Y0, Y5, Y5 + VPSLLDQ $0x0c, Y5, Y9 + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + 
ORL BP, AX + ADDL R12, DI + ADDL 396(R15), SI + VPSLLD $0x01, Y5, Y0 + VPSRLD $0x1f, Y5, Y5 + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 416(R15), BX + VPOR Y5, Y0, Y0 + VPSLLD $0x02, Y9, Y5 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 420(R15), CX + VPSRLD $0x1e, Y9, Y9 + VPXOR Y5, Y0, Y0 + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 424(R15), DX + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 428(R15), AX + VPXOR Y9, Y0, Y5 + VPADDD 32(R8), Y5, Y0 + VMOVDQU Y0, 192(R14) + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 448(R15), DI + VPALIGNR $0x08, Y12, Y8, Y3 + VPSRLDQ $0x04, Y5, Y0 + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + ORL BP, AX + ADDL R12, DI + ADDL 452(R15), SI + VPXOR Y7, Y3, Y3 + VPXOR Y12, Y0, Y0 + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 456(R15), BX + VPXOR Y0, Y3, Y3 + VPSLLDQ $0x0c, Y3, Y9 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 460(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPSLLD $0x01, Y3, Y0 + VPSRLD $0x1f, Y3, Y3 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDQ $0x80, R10 + CMPQ R10, R11 + CMOVQCC R8, R10 + ADDL 480(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPOR Y3, Y0, Y0 + VPSLLD $0x02, Y9, Y3 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 484(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPSRLD $0x1e, Y9, Y9 + VPXOR Y3, Y0, Y0 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 488(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 492(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y9, Y0, Y3 + VPADDD 32(R8), Y3, Y0 + VMOVDQU Y0, 224(R14) + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 512(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPALIGNR $0x08, Y5, Y3, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 516(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPXOR Y14, Y15, Y15 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 520(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPXOR Y8, Y0, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 524(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y0, Y15, Y15 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 544(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSLLD $0x02, Y15, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 548(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPSRLD $0x1e, Y15, Y15 + VPOR Y15, Y0, Y15 + XORL DX, DI + ADDL R12, SI + XORL CX, DI 
+ ADDL 552(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 556(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPADDD 32(R8), Y15, Y0 + VMOVDQU Y0, 256(R14) + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 576(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPALIGNR $0x08, Y3, Y15, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 580(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y13, Y14, Y14 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 584(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPXOR Y7, Y0, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 588(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y0, Y14, Y14 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 608(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPSLLD $0x02, Y14, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 612(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPSRLD $0x1e, Y14, Y14 + VPOR Y14, Y0, Y14 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 616(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 620(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + VPADDD 32(R8), Y14, Y0 + VMOVDQU Y0, 288(R14) + ADDL R12, AX + ADDL (R9), AX + MOVL AX, (R9) + ADDL 4(R9), DX + MOVL DX, 4(R9) + ADDL 8(R9), BX + MOVL BX, 8(R9) + ADDL 12(R9), SI + MOVL SI, 12(R9) + ADDL 16(R9), DI + MOVL DI, 16(R9) + CMPQ R10, R8 + JE loop + MOVL DX, CX + MOVL CX, DX + RORXL $0x02, CX, CX + ANDNL SI, DX, BP + ANDL BX, DX + XORL BP, DX + ADDL 16(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPALIGNR $0x08, Y15, Y14, Y0 + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 20(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y12, Y13, Y13 + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 24(R15), BX + ANDNL DX, SI, BP + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPXOR Y5, Y0, Y0 + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 28(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPXOR Y0, Y13, Y13 + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 48(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPSLLD $0x02, Y13, Y0 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 52(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPSRLD $0x1e, Y13, Y13 + VPOR Y13, Y0, Y13 + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 56(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 60(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPADDD 64(R8), Y13, Y0 + VMOVDQU Y0, 320(R14) + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 80(R15), BX + ANDNL DX, SI, BP + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPALIGNR $0x08, Y14, Y13, Y0 + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 84(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + 
VPXOR Y8, Y12, Y12 + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 88(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPXOR Y3, Y0, Y0 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 92(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y0, Y12, Y12 + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 112(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSLLD $0x02, Y12, Y0 + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 116(R15), SI + ANDNL CX, DI, BP + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPSRLD $0x1e, Y12, Y12 + VPOR Y12, Y0, Y12 + ANDL DX, DI + XORL BP, DI + LEAL (SI)(R12*1), SI + ADDL 120(R15), BX + ANDNL DX, SI, BP + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL AX, SI + XORL BP, SI + LEAL (BX)(R12*1), BX + ADDL 124(R15), CX + ANDNL AX, BX, BP + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPADDD 64(R8), Y12, Y0 + VMOVDQU Y0, 352(R14) + ANDL DI, BX + XORL BP, BX + LEAL (CX)(R12*1), CX + ADDL 144(R15), DX + ANDNL DI, CX, BP + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPALIGNR $0x08, Y13, Y12, Y0 + ANDL SI, CX + XORL BP, CX + LEAL (DX)(R12*1), DX + ADDL 148(R15), AX + ANDNL SI, DX, BP + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y7, Y8, Y8 + ANDL BX, DX + XORL BP, DX + LEAL (AX)(R12*1), AX + ADDL 152(R15), DI + ANDNL BX, AX, BP + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPXOR Y15, Y0, Y0 + ANDL CX, AX + XORL BP, AX + LEAL (DI)(R12*1), DI + ADDL 156(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y0, Y8, Y8 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 176(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPSLLD $0x02, Y8, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 180(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPSRLD $0x1e, Y8, Y8 + VPOR Y8, Y0, Y8 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 184(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 188(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPADDD 64(R8), Y8, Y0 + VMOVDQU Y0, 384(R14) + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 208(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPALIGNR $0x08, Y12, Y8, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 212(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y5, Y7, Y7 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 216(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPXOR Y14, Y0, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 220(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPXOR Y0, Y7, Y7 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 240(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPSLLD $0x02, Y7, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 244(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPSRLD $0x1e, Y7, Y7 + VPOR Y7, Y0, Y7 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 248(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + XORL CX, AX + ADDL R12, DI + XORL BX, AX + 
ADDL 252(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPADDD 64(R8), Y7, Y0 + VMOVDQU Y0, 416(R14) + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 272(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPALIGNR $0x08, Y8, Y7, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 276(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPXOR Y3, Y5, Y5 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 280(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPXOR Y13, Y0, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 284(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y0, Y5, Y5 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 304(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSLLD $0x02, Y5, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 308(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPSRLD $0x1e, Y5, Y5 + VPOR Y5, Y0, Y5 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 312(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 316(R15), CX + VPADDD 64(R8), Y5, Y0 + VMOVDQU Y0, 448(R14) + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 336(R15), DX + VPALIGNR $0x08, Y7, Y5, Y0 + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 340(R15), AX + VPXOR Y15, Y3, Y3 + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 344(R15), DI + VPXOR Y12, Y0, Y0 + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + ORL BP, AX + ADDL R12, DI + ADDL 348(R15), SI + VPXOR Y0, Y3, Y3 + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 368(R15), BX + VPSLLD $0x02, Y3, Y0 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 372(R15), CX + VPSRLD $0x1e, Y3, Y3 + VPOR Y3, Y0, Y3 + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 376(R15), DX + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 380(R15), AX + VPADDD 96(R8), Y3, Y0 + VMOVDQU Y0, 480(R14) + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 400(R15), DI + VPALIGNR $0x08, Y5, Y3, Y0 + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + ORL BP, AX + ADDL R12, DI + ADDL 404(R15), SI + VPXOR Y14, Y15, Y15 + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 408(R15), BX + VPXOR Y8, Y0, Y0 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + 
ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 412(R15), CX + VPXOR Y0, Y15, Y15 + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 432(R15), DX + VPSLLD $0x02, Y15, Y0 + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 436(R15), AX + VPSRLD $0x1e, Y15, Y15 + VPOR Y15, Y0, Y15 + LEAL (AX)(CX*1), AX + MOVL BX, BP + ORL DX, BP + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + ANDL SI, BP + ANDL BX, DX + ORL BP, DX + ADDL R12, AX + ADDL 440(R15), DI + LEAL (DI)(DX*1), DI + MOVL CX, BP + ORL AX, BP + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + ANDL BX, BP + ANDL CX, AX + ORL BP, AX + ADDL R12, DI + ADDL 444(R15), SI + VPADDD 96(R8), Y15, Y0 + VMOVDQU Y0, 512(R14) + LEAL (SI)(AX*1), SI + MOVL DX, BP + ORL DI, BP + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + ANDL CX, BP + ANDL DX, DI + ORL BP, DI + ADDL R12, SI + ADDL 464(R15), BX + VPALIGNR $0x08, Y3, Y15, Y0 + LEAL (BX)(DI*1), BX + MOVL AX, BP + ORL SI, BP + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + ANDL DX, BP + ANDL AX, SI + ORL BP, SI + ADDL R12, BX + ADDL 468(R15), CX + VPXOR Y13, Y14, Y14 + LEAL (CX)(SI*1), CX + MOVL DI, BP + ORL BX, BP + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + ANDL AX, BP + ANDL DI, BX + ORL BP, BX + ADDL R12, CX + ADDL 472(R15), DX + VPXOR Y7, Y0, Y0 + LEAL (DX)(BX*1), DX + MOVL SI, BP + ORL CX, BP + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + ANDL DI, BP + ANDL SI, CX + ORL BP, CX + ADDL R12, DX + ADDL 476(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y0, Y14, Y14 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDQ $0x80, R13 + CMPQ R13, R11 + CMOVQCC R8, R10 + ADDL 496(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPSLLD $0x02, Y14, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 500(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPSRLD $0x1e, Y14, Y14 + VPOR Y14, Y0, Y14 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 504(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 508(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPADDD 96(R8), Y14, Y0 + VMOVDQU Y0, 544(R14) + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 528(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPALIGNR $0x08, Y15, Y14, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 532(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPXOR Y12, Y13, Y13 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 536(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPXOR Y5, Y0, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 540(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y0, Y13, Y13 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 560(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPSLLD $0x02, Y13, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 564(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPSRLD $0x1e, Y13, Y13 + VPOR Y13, Y0, Y13 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 568(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 572(R15), AX + LEAL 
(AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPADDD 96(R8), Y13, Y0 + VMOVDQU Y0, 576(R14) + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 592(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + VPALIGNR $0x08, Y14, Y13, Y0 + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 596(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + RORXL $0x02, DI, AX + VPXOR Y8, Y12, Y12 + XORL DX, DI + ADDL R12, SI + XORL CX, DI + ADDL 600(R15), BX + LEAL (BX)(DI*1), BX + RORXL $0x1b, SI, R12 + RORXL $0x02, SI, DI + VPXOR Y3, Y0, Y0 + XORL AX, SI + ADDL R12, BX + XORL DX, SI + ADDL 604(R15), CX + LEAL (CX)(SI*1), CX + RORXL $0x1b, BX, R12 + RORXL $0x02, BX, SI + VPXOR Y0, Y12, Y12 + XORL DI, BX + ADDL R12, CX + XORL AX, BX + ADDL 624(R15), DX + LEAL (DX)(BX*1), DX + RORXL $0x1b, CX, R12 + RORXL $0x02, CX, BX + VPSLLD $0x02, Y12, Y0 + XORL SI, CX + ADDL R12, DX + XORL DI, CX + ADDL 628(R15), AX + LEAL (AX)(CX*1), AX + RORXL $0x1b, DX, R12 + RORXL $0x02, DX, CX + VPSRLD $0x1e, Y12, Y12 + VPOR Y12, Y0, Y12 + XORL BX, DX + ADDL R12, AX + XORL SI, DX + ADDL 632(R15), DI + LEAL (DI)(DX*1), DI + RORXL $0x1b, AX, R12 + RORXL $0x02, AX, DX + XORL CX, AX + ADDL R12, DI + XORL BX, AX + ADDL 636(R15), SI + LEAL (SI)(AX*1), SI + RORXL $0x1b, DI, R12 + VPADDD 96(R8), Y12, Y0 + VMOVDQU Y0, 608(R14) + ADDL R12, SI + ADDL (R9), SI + MOVL SI, (R9) + ADDL 4(R9), DI + MOVL DI, 4(R9) + ADDL 8(R9), DX + MOVL DX, 8(R9) + ADDL 12(R9), CX + MOVL CX, 12(R9) + ADDL 16(R9), BX + MOVL BX, 16(R9) + MOVL SI, R12 + MOVL DI, SI + MOVL DX, DI + MOVL BX, DX + MOVL CX, AX + MOVL R12, CX + XCHGQ R15, R14 + JMP loop + +DATA K_XMM_AR<>+0(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+4(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+8(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+12(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+16(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+20(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+24(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+28(SB)/4, $0x5a827999 +DATA K_XMM_AR<>+32(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+36(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+40(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+44(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+48(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+52(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+56(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+60(SB)/4, $0x6ed9eba1 +DATA K_XMM_AR<>+64(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+68(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+72(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+76(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+80(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+84(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+88(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+92(SB)/4, $0x8f1bbcdc +DATA K_XMM_AR<>+96(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+100(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+104(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+108(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+112(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+116(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+120(SB)/4, $0xca62c1d6 +DATA K_XMM_AR<>+124(SB)/4, $0xca62c1d6 +GLOBL K_XMM_AR<>(SB), RODATA, $128 + +DATA BSWAP_SHUFB_CTL<>+0(SB)/4, $0x00010203 +DATA BSWAP_SHUFB_CTL<>+4(SB)/4, $0x04050607 +DATA BSWAP_SHUFB_CTL<>+8(SB)/4, $0x08090a0b +DATA BSWAP_SHUFB_CTL<>+12(SB)/4, $0x0c0d0e0f +DATA BSWAP_SHUFB_CTL<>+16(SB)/4, $0x00010203 +DATA BSWAP_SHUFB_CTL<>+20(SB)/4, $0x04050607 +DATA BSWAP_SHUFB_CTL<>+24(SB)/4, $0x08090a0b +DATA BSWAP_SHUFB_CTL<>+28(SB)/4, $0x0c0d0e0f +GLOBL BSWAP_SHUFB_CTL<>(SB), RODATA, $32 diff --git a/src/go/types/stdlib_test.go b/src/go/types/stdlib_test.go index d0ed66a2e1c..ac38826ecb6 100644 --- a/src/go/types/stdlib_test.go +++ b/src/go/types/stdlib_test.go @@ -359,6 +359,7 @@ var excluded = 
map[string]bool{
 	// See go.dev/issue/46027: some imports are missing for this submodule.
 	"crypto/internal/bigmod/_asm": true,
 	"crypto/internal/edwards25519/field/_asm": true,
+	"crypto/sha1/_asm": true,
 	"crypto/sha256/_asm": true,
 }
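
For readers following the generated assembly above: the scalar instruction groups are the standard SHA-1 rounds. RORXL $0x1b and RORXL $0x02 are rotate-left-by-5 and rotate-left-by-30, the ANDNL/ANDL/XORL groups compute the "choice" function, the MOVL/ORL/ANDL/ANDL/ORL groups compute "majority", the XORL-only groups compute "parity", and the round constants match the K_XMM_AR table. The sketch below is illustrative only; it is not part of the patch or of the Avo generator. It restates that round structure in plain Go using the FIPS 180-4 definitions, with hypothetical names expandSketch and sha1BlockSketch:

	package sketch

	import "math/bits"

	// expandSketch computes w[16..79] with the usual SHA-1 message-schedule
	// recurrence: w[i] = ROTL1(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]).
	func expandSketch(w *[80]uint32) {
		for i := 16; i < 80; i++ {
			w[i] = bits.RotateLeft32(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1)
		}
	}

	// sha1BlockSketch applies the 80 SHA-1 rounds for one block to the
	// state h, using the same f/K schedule as K_XMM_AR.
	func sha1BlockSketch(h *[5]uint32, w *[80]uint32) {
		a, b, c, d, e := h[0], h[1], h[2], h[3], h[4]
		for i := 0; i < 80; i++ {
			var f, k uint32
			switch {
			case i < 20:
				f, k = (b&c)|(^b&d), 0x5a827999 // choice: ANDNL/ANDL/XORL
			case i < 40:
				f, k = b^c^d, 0x6ed9eba1 // parity: XORL/XORL
			case i < 60:
				f, k = (b&c)|((b|c)&d), 0x8f1bbcdc // majority: MOVL/ORL/ANDL/ANDL/ORL
			default:
				f, k = b^c^d, 0xca62c1d6 // parity
			}
			t := bits.RotateLeft32(a, 5) + f + e + k + w[i] // RORXL $0x1b
			e, d, c, b, a = d, c, bits.RotateLeft32(b, 30), a, t // RORXL $0x02
		}
		h[0], h[1], h[2], h[3], h[4] = h[0]+a, h[1]+b, h[2]+c, h[3]+d, h[4]+e
	}

The vectorized VPALIGNR/VPXOR/VPSLLD/VPSRLD/VPOR sequences compute the same message-schedule recurrence a vector of words at a time, and each VPADDD off(R8)/VMOVDQU off(R14) pair appears to pre-add the round constant from K_XMM_AR and spill W+K into the buffer that the ADDL off(R15) loads later consume, with R14 and R15 exchanged per block by the XCHGQ before JMP loop.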