From 2f32138aba23467201c6106ce5e4d63d530d972b Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Thu, 21 Mar 2013 11:32:02 -0400 Subject: [PATCH] crypto/sha1: faster amd64, 386 implementations -- amd64 -- On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 785 592 -24.59% BenchmarkHash1K 8727 3014 -65.46% BenchmarkHash8K 64926 20723 -68.08% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 10.19 13.50 1.32x BenchmarkHash1K 117.34 339.71 2.90x BenchmarkHash8K 126.17 395.31 3.13x For comparison, on the same machine, openssl 0.9.8r reports its sha1 speed as 341 MB/s for 1K and 404 MB/s for 8K. On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 984 707 -28.15% BenchmarkHash1K 11141 3466 -68.89% BenchmarkHash8K 82435 23411 -71.60% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 8.13 11.31 1.39x BenchmarkHash1K 91.91 295.36 3.21x BenchmarkHash8K 99.37 349.91 3.52x For comparison, on the same machine, openssl 1.0.1 reports its sha1 speed as 286 MB/s for 1K and 394 MB/s for 8K. -- 386 -- On a MacBookPro10,2 (Core i5): benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1041 713 -31.51% BenchmarkHash1K 15612 3382 -78.34% BenchmarkHash8K 110152 22733 -79.36% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 7.68 11.21 1.46x BenchmarkHash1K 65.59 302.76 4.62x BenchmarkHash8K 74.37 360.36 4.85x On an Intel Xeon E5520: benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1221 842 -31.04% BenchmarkHash1K 14643 4137 -71.75% BenchmarkHash8K 108722 27394 -74.80% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 6.55 9.49 1.45x BenchmarkHash1K 69.93 247.51 3.54x BenchmarkHash8K 75.35 299.04 3.97x R=agl, dave CC=golang-dev https://golang.org/cl/7763049 --- src/pkg/crypto/sha1/sha1block.go | 2 + src/pkg/crypto/sha1/sha1block_386.s | 233 ++++++++++++++++++++++++++ src/pkg/crypto/sha1/sha1block_amd64.s | 216 ++++++++++++++++++++++++ src/pkg/crypto/sha1/sha1block_decl.go | 9 + 4 files changed, 460 insertions(+) create mode 100644 src/pkg/crypto/sha1/sha1block_386.s create mode 100644 src/pkg/crypto/sha1/sha1block_amd64.s create mode 100644 src/pkg/crypto/sha1/sha1block_decl.go diff --git a/src/pkg/crypto/sha1/sha1block.go b/src/pkg/crypto/sha1/sha1block.go index 1c9507c68e5..92224fc0ef8 100644 --- a/src/pkg/crypto/sha1/sha1block.go +++ b/src/pkg/crypto/sha1/sha1block.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +// +build !amd64,!386 + // SHA1 block step. // In its own file so that a faster assembly or C version // can be substituted easily. diff --git a/src/pkg/crypto/sha1/sha1block_386.s b/src/pkg/crypto/sha1/sha1block_386.s new file mode 100644 index 00000000000..fbf237b3fe0 --- /dev/null +++ b/src/pkg/crypto/sha1/sha1block_386.s @@ -0,0 +1,233 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// SHA1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. + +// Like sha1block_amd64.s, but we keep the data and limit pointers on the stack. +// To free up the word pointer (R10 on amd64, DI here), we add it to e during +// LOAD/SHUFFLE instead of during MIX. +// +// The stack holds the intermediate word array - 16 uint32s - at 0(SP) up to 64(SP). +// The saved a, b, c, d, e (R11 through R15 on amd64) are at 64(SP) up to 84(SP). +// The saved limit pointer (DI on amd64) is at 84(SP). +// The saved data pointer (SI on amd64) is at 88(SP). + +#define LOAD(index, e) \ + MOVL 88(SP), SI; \ + MOVL (index*4)(SI), DI; \ + BSWAPL DI; \ + MOVL DI, (index*4)(SP); \ + ADDL DI, e + +#define SHUFFLE(index, e) \ + MOVL (((index)&0xf)*4)(SP), DI; \ + XORL (((index-3)&0xf)*4)(SP), DI; \ + XORL (((index-8)&0xf)*4)(SP), DI; \ + XORL (((index-14)&0xf)*4)(SP), DI; \ + ROLL $1, DI; \ + MOVL DI, (((index)&0xf)*4)(SP); \ + ADDL DI, e + +#define FUNC1(a, b, c, d, e) \ + MOVL b, SI; \ + ANDL c, SI; \ + MOVL b, DI; \ + NOTL DI; \ + ANDL d, DI; \ + ORL SI, DI + +#define FUNC2(a, b, c, d, e) \ + MOVL b, DI; \ + XORL c, DI; \ + XORL d, DI + +#define FUNC3(a, b, c, d, e) \ + MOVL b, SI; \ + ORL c, SI; \ + ANDL d, SI; \ + MOVL b, DI; \ + ANDL c, DI; \ + ORL SI, DI + +#define FUNC4 FUNC2 + +#define MIX(a, b, c, d, e, const) \ + ROLL $30, b; \ + ADDL DI, e; \ + MOVL a, SI; \ + ROLL $5, SI; \ + LEAL const(e)(SI*1), e + +#define ROUND1(a, b, c, d, e, index) \ + LOAD(index, e); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND1x(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND2(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC2(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x6ED9EBA1) + +#define ROUND3(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC3(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x8F1BBCDC) + +#define ROUND4(a, b, c, d, e, index) \ + SHUFFLE(index, e); \ + FUNC4(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0xCA62C1D6) + +// func block(dig *digest, p []byte) +TEXT ·block(SB),7,$92-16 + MOVL dig+0(FP), BP + MOVL p+4(FP), SI + MOVL n+8(FP), DX + SHRL $6, DX + SHLL $6, DX + + LEAL (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + MOVL (4*4)(BP), BP + + CMPL SI, DI + JEQ end + + MOVL DI, 84(SP) + +loop: + MOVL SI, 88(SP) + + MOVL AX, 64(SP) + MOVL BX, 68(SP) + MOVL CX, 72(SP) + MOVL DX, 76(SP) + MOVL BP, 80(SP) + + ROUND1(AX, BX, CX, DX, BP, 0) + ROUND1(BP, AX, BX, CX, DX, 1) + ROUND1(DX, BP, AX, BX, CX, 2) + ROUND1(CX, DX, BP, AX, BX, 3) + ROUND1(BX, CX, DX, BP, AX, 4) + ROUND1(AX, BX, CX, DX, BP, 5) + ROUND1(BP, AX, BX, CX, DX, 6) + ROUND1(DX, BP, AX, BX, CX, 7) + ROUND1(CX, DX, BP, AX, BX, 8) + ROUND1(BX, CX, DX, BP, AX, 9) + ROUND1(AX, BX, CX, DX, BP, 10) + ROUND1(BP, AX, BX, CX, DX, 11) + ROUND1(DX, BP, AX, BX, CX, 12) + ROUND1(CX, DX, BP, AX, BX, 13) + ROUND1(BX, CX, DX, BP, AX, 14) + ROUND1(AX, BX, CX, DX, BP, 15) + + ROUND1x(BP, AX, BX, CX, DX, 16) + ROUND1x(DX, BP, AX, BX, CX, 17) + ROUND1x(CX, DX, BP, AX, BX, 18) + ROUND1x(BX, CX, DX, BP, AX, 19) + + ROUND2(AX, BX, CX, DX, BP, 20) + ROUND2(BP, AX, BX, CX, DX, 21) + ROUND2(DX, BP, AX, BX, CX, 22) + ROUND2(CX, DX, BP, AX, BX, 23) + ROUND2(BX, CX, DX, BP, AX, 24) + ROUND2(AX, BX, CX, DX, BP, 25) + ROUND2(BP, AX, BX, CX, DX, 26) + ROUND2(DX, BP, AX, BX, CX, 27) + ROUND2(CX, DX, BP, AX, BX, 28) + ROUND2(BX, CX, DX, BP, AX, 29) + ROUND2(AX, BX, CX, DX, BP, 30) + ROUND2(BP, AX, BX, CX, DX, 31) + ROUND2(DX, BP, AX, BX, CX, 32) + ROUND2(CX, DX, BP, AX, BX, 33) + ROUND2(BX, CX, DX, BP, AX, 34) + ROUND2(AX, BX, CX, DX, BP, 35) + ROUND2(BP, AX, BX, CX, DX, 36) + ROUND2(DX, BP, AX, BX, CX, 37) + ROUND2(CX, DX, BP, AX, BX, 38) + ROUND2(BX, CX, DX, BP, AX, 39) + + ROUND3(AX, BX, CX, DX, BP, 40) + ROUND3(BP, AX, BX, CX, DX, 41) + ROUND3(DX, BP, AX, BX, CX, 42) + ROUND3(CX, DX, BP, AX, BX, 43) + ROUND3(BX, CX, DX, BP, AX, 44) + ROUND3(AX, BX, CX, DX, BP, 45) + ROUND3(BP, AX, BX, CX, DX, 46) + ROUND3(DX, BP, AX, BX, CX, 47) + ROUND3(CX, DX, BP, AX, BX, 48) + ROUND3(BX, CX, DX, BP, AX, 49) + ROUND3(AX, BX, CX, DX, BP, 50) + ROUND3(BP, AX, BX, CX, DX, 51) + ROUND3(DX, BP, AX, BX, CX, 52) + ROUND3(CX, DX, BP, AX, BX, 53) + ROUND3(BX, CX, DX, BP, AX, 54) + ROUND3(AX, BX, CX, DX, BP, 55) + ROUND3(BP, AX, BX, CX, DX, 56) + ROUND3(DX, BP, AX, BX, CX, 57) + ROUND3(CX, DX, BP, AX, BX, 58) + ROUND3(BX, CX, DX, BP, AX, 59) + + ROUND4(AX, BX, CX, DX, BP, 60) + ROUND4(BP, AX, BX, CX, DX, 61) + ROUND4(DX, BP, AX, BX, CX, 62) + ROUND4(CX, DX, BP, AX, BX, 63) + ROUND4(BX, CX, DX, BP, AX, 64) + ROUND4(AX, BX, CX, DX, BP, 65) + ROUND4(BP, AX, BX, CX, DX, 66) + ROUND4(DX, BP, AX, BX, CX, 67) + ROUND4(CX, DX, BP, AX, BX, 68) + ROUND4(BX, CX, DX, BP, AX, 69) + ROUND4(AX, BX, CX, DX, BP, 70) + ROUND4(BP, AX, BX, CX, DX, 71) + ROUND4(DX, BP, AX, BX, CX, 72) + ROUND4(CX, DX, BP, AX, BX, 73) + ROUND4(BX, CX, DX, BP, AX, 74) + ROUND4(AX, BX, CX, DX, BP, 75) + ROUND4(BP, AX, BX, CX, DX, 76) + ROUND4(DX, BP, AX, BX, CX, 77) + ROUND4(CX, DX, BP, AX, BX, 78) + ROUND4(BX, CX, DX, BP, AX, 79) + + ADDL 64(SP), AX + ADDL 68(SP), BX + ADDL 72(SP), CX + ADDL 76(SP), DX + ADDL 80(SP), BP + + MOVL 88(SP), SI + ADDL $64, SI + CMPL SI, 84(SP) + JB loop + +end: + MOVL dig+0(FP), DI + MOVL AX, (0*4)(DI) + MOVL BX, (1*4)(DI) + MOVL CX, (2*4)(DI) + MOVL DX, (3*4)(DI) + MOVL BP, (4*4)(DI) + RET diff --git a/src/pkg/crypto/sha1/sha1block_amd64.s b/src/pkg/crypto/sha1/sha1block_amd64.s new file mode 100644 index 00000000000..e2b286a91c0 --- /dev/null +++ b/src/pkg/crypto/sha1/sha1block_amd64.s @@ -0,0 +1,216 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// SHA1 block routine. See sha1block.go for Go equivalent. +// +// There are 80 rounds of 4 types: +// - rounds 0-15 are type 1 and load data (ROUND1 macro). +// - rounds 16-19 are type 1 and do not load data (ROUND1x macro). +// - rounds 20-39 are type 2 and do not load data (ROUND2 macro). +// - rounds 40-59 are type 3 and do not load data (ROUND3 macro). +// - rounds 60-79 are type 4 and do not load data (ROUND4 macro). +// +// Each round loads or shuffles the data, then computes a per-round +// function of b, c, d, and then mixes the result into and rotates the +// five registers a, b, c, d, e holding the intermediate results. +// +// The register rotation is implemented by rotating the arguments to +// the round macros instead of by explicit move instructions. + +#define LOAD(index) \ + MOVL (index*4)(SI), R10; \ + BSWAPL R10; \ + MOVL R10, (index*4)(SP) + +#define SHUFFLE(index) \ + MOVL (((index)&0xf)*4)(SP), R10; \ + XORL (((index-3)&0xf)*4)(SP), R10; \ + XORL (((index-8)&0xf)*4)(SP), R10; \ + XORL (((index-14)&0xf)*4)(SP), R10; \ + ROLL $1, R10; \ + MOVL R10, (((index)&0xf)*4)(SP) + +#define FUNC1(a, b, c, d, e) \ + MOVL b, R8; \ + ANDL c, R8; \ + MOVL b, R9; \ + NOTL R9; \ + ANDL d, R9; \ + ORL R8, R9 + +#define FUNC2(a, b, c, d, e) \ + MOVL b, R9; \ + XORL c, R9; \ + XORL d, R9 + +#define FUNC3(a, b, c, d, e) \ + MOVL b, R8; \ + ORL c, R8; \ + ANDL d, R8; \ + MOVL b, R9; \ + ANDL c, R9; \ + ORL R8, R9 + +#define FUNC4 FUNC2 + +#define MIX(a, b, c, d, e, const) \ + ROLL $30, b; \ + ADDL R9, e; \ + MOVL a, R8; \ + ROLL $5, R8; \ + LEAL const(e)(R10*1), e; \ + ADDL R8, e + +#define ROUND1(a, b, c, d, e, index) \ + LOAD(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND1x(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC1(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x5A827999) + +#define ROUND2(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC2(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x6ED9EBA1) + +#define ROUND3(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC3(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0x8F1BBCDC) + +#define ROUND4(a, b, c, d, e, index) \ + SHUFFLE(index); \ + FUNC4(a, b, c, d, e); \ + MIX(a, b, c, d, e, 0xCA62C1D6) + +TEXT ·block(SB),7,$64-32 + MOVQ dig+0(FP), BP + MOVQ p+8(FP), SI + MOVQ n+16(FP), DX + SHRQ $6, DX + SHLQ $6, DX + + LEAQ (SI)(DX*1), DI + MOVL (0*4)(BP), AX + MOVL (1*4)(BP), BX + MOVL (2*4)(BP), CX + MOVL (3*4)(BP), DX + MOVL (4*4)(BP), BP + + CMPQ SI, DI + JEQ end + +loop: + MOVL AX, R11 + MOVL BX, R12 + MOVL CX, R13 + MOVL DX, R14 + MOVL BP, R15 + + ROUND1(AX, BX, CX, DX, BP, 0) + ROUND1(BP, AX, BX, CX, DX, 1) + ROUND1(DX, BP, AX, BX, CX, 2) + ROUND1(CX, DX, BP, AX, BX, 3) + ROUND1(BX, CX, DX, BP, AX, 4) + ROUND1(AX, BX, CX, DX, BP, 5) + ROUND1(BP, AX, BX, CX, DX, 6) + ROUND1(DX, BP, AX, BX, CX, 7) + ROUND1(CX, DX, BP, AX, BX, 8) + ROUND1(BX, CX, DX, BP, AX, 9) + ROUND1(AX, BX, CX, DX, BP, 10) + ROUND1(BP, AX, BX, CX, DX, 11) + ROUND1(DX, BP, AX, BX, CX, 12) + ROUND1(CX, DX, BP, AX, BX, 13) + ROUND1(BX, CX, DX, BP, AX, 14) + ROUND1(AX, BX, CX, DX, BP, 15) + + ROUND1x(BP, AX, BX, CX, DX, 16) + ROUND1x(DX, BP, AX, BX, CX, 17) + ROUND1x(CX, DX, BP, AX, BX, 18) + ROUND1x(BX, CX, DX, BP, AX, 19) + + ROUND2(AX, BX, CX, DX, BP, 20) + ROUND2(BP, AX, BX, CX, DX, 21) + ROUND2(DX, BP, AX, BX, CX, 22) + ROUND2(CX, DX, BP, AX, BX, 23) + ROUND2(BX, CX, DX, BP, AX, 24) + ROUND2(AX, BX, CX, DX, BP, 25) + ROUND2(BP, AX, BX, CX, DX, 26) + ROUND2(DX, BP, AX, BX, CX, 27) + ROUND2(CX, DX, BP, AX, BX, 28) + ROUND2(BX, CX, DX, BP, AX, 29) + ROUND2(AX, BX, CX, DX, BP, 30) + ROUND2(BP, AX, BX, CX, DX, 31) + ROUND2(DX, BP, AX, BX, CX, 32) + ROUND2(CX, DX, BP, AX, BX, 33) + ROUND2(BX, CX, DX, BP, AX, 34) + ROUND2(AX, BX, CX, DX, BP, 35) + ROUND2(BP, AX, BX, CX, DX, 36) + ROUND2(DX, BP, AX, BX, CX, 37) + ROUND2(CX, DX, BP, AX, BX, 38) + ROUND2(BX, CX, DX, BP, AX, 39) + + ROUND3(AX, BX, CX, DX, BP, 40) + ROUND3(BP, AX, BX, CX, DX, 41) + ROUND3(DX, BP, AX, BX, CX, 42) + ROUND3(CX, DX, BP, AX, BX, 43) + ROUND3(BX, CX, DX, BP, AX, 44) + ROUND3(AX, BX, CX, DX, BP, 45) + ROUND3(BP, AX, BX, CX, DX, 46) + ROUND3(DX, BP, AX, BX, CX, 47) + ROUND3(CX, DX, BP, AX, BX, 48) + ROUND3(BX, CX, DX, BP, AX, 49) + ROUND3(AX, BX, CX, DX, BP, 50) + ROUND3(BP, AX, BX, CX, DX, 51) + ROUND3(DX, BP, AX, BX, CX, 52) + ROUND3(CX, DX, BP, AX, BX, 53) + ROUND3(BX, CX, DX, BP, AX, 54) + ROUND3(AX, BX, CX, DX, BP, 55) + ROUND3(BP, AX, BX, CX, DX, 56) + ROUND3(DX, BP, AX, BX, CX, 57) + ROUND3(CX, DX, BP, AX, BX, 58) + ROUND3(BX, CX, DX, BP, AX, 59) + + ROUND4(AX, BX, CX, DX, BP, 60) + ROUND4(BP, AX, BX, CX, DX, 61) + ROUND4(DX, BP, AX, BX, CX, 62) + ROUND4(CX, DX, BP, AX, BX, 63) + ROUND4(BX, CX, DX, BP, AX, 64) + ROUND4(AX, BX, CX, DX, BP, 65) + ROUND4(BP, AX, BX, CX, DX, 66) + ROUND4(DX, BP, AX, BX, CX, 67) + ROUND4(CX, DX, BP, AX, BX, 68) + ROUND4(BX, CX, DX, BP, AX, 69) + ROUND4(AX, BX, CX, DX, BP, 70) + ROUND4(BP, AX, BX, CX, DX, 71) + ROUND4(DX, BP, AX, BX, CX, 72) + ROUND4(CX, DX, BP, AX, BX, 73) + ROUND4(BX, CX, DX, BP, AX, 74) + ROUND4(AX, BX, CX, DX, BP, 75) + ROUND4(BP, AX, BX, CX, DX, 76) + ROUND4(DX, BP, AX, BX, CX, 77) + ROUND4(CX, DX, BP, AX, BX, 78) + ROUND4(BX, CX, DX, BP, AX, 79) + + ADDL R11, AX + ADDL R12, BX + ADDL R13, CX + ADDL R14, DX + ADDL R15, BP + + ADDQ $64, SI + CMPQ SI, DI + JB loop + +end: + MOVQ dig+0(FP), DI + MOVL AX, (0*4)(DI) + MOVL BX, (1*4)(DI) + MOVL CX, (2*4)(DI) + MOVL DX, (3*4)(DI) + MOVL BP, (4*4)(DI) + RET diff --git a/src/pkg/crypto/sha1/sha1block_decl.go b/src/pkg/crypto/sha1/sha1block_decl.go new file mode 100644 index 00000000000..348a6aaaa3f --- /dev/null +++ b/src/pkg/crypto/sha1/sha1block_decl.go @@ -0,0 +1,9 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build amd64 386 + +package sha1 + +func block(*digest, []byte)