mirror of
https://github.com/golang/go
synced 2024-11-17 14:14:56 -07:00
crypto/md5: optimize amd64 assembly
* Use two ADDL instead of LEAL
* Keep ones in R11
* Use XORL with lower latency instead of NOTL
* Remove loads and load the correct value in the previous round
* Reduce dependency chain in round 2.
* Remove MOVL in round 3.
name old time/op new time/op delta
Hash8Bytes-32 104ns ± 0% 96ns ± 1% -7.83% (p=0.000 n=9+10)
Hash64-32 169ns ± 0% 155ns ± 0% -7.97% (p=0.000 n=10+10)
Hash128-32 244ns ± 0% 224ns ± 0% -8.16% (p=0.000 n=9+10)
Hash256-32 396ns ± 0% 360ns ± 1% -9.01% (p=0.000 n=10+10)
Hash512-32 700ns ± 1% 634ns ± 1% -9.43% (p=0.000 n=10+10)
Hash1K-32 1.30µs ± 0% 1.18µs ± 1% -9.32% (p=0.000 n=9+10)
Hash8K-32 9.77µs ± 0% 8.81µs ± 0% -9.78% (p=0.000 n=9+10)
Hash1M-32 1.24ms ± 1% 1.12ms ± 1% -9.54% (p=0.000 n=10+10)
Hash8M-32 10.0ms ± 1% 9.0ms ± 1% -10.04% (p=0.000 n=10+10)
Hash8BytesUnaligned-32 104ns ± 0% 96ns ± 0% -7.50% (p=0.000 n=10+10)
Hash1KUnaligned-32 1.32µs ± 1% 1.18µs ± 1% -10.42% (p=0.000 n=10+10)
Hash8KUnaligned-32 9.80µs ± 0% 8.79µs ± 1% -10.29% (p=0.000 n=10+10)
name old speed new speed delta
Hash8Bytes-32 77.1MB/s ± 0% 83.6MB/s ± 1% +8.49% (p=0.000 n=9+10)
Hash64-32 379MB/s ± 0% 412MB/s ± 0% +8.66% (p=0.000 n=10+10)
Hash128-32 525MB/s ± 0% 572MB/s ± 0% +8.89% (p=0.000 n=9+10)
Hash256-32 646MB/s ± 0% 710MB/s ± 1% +9.90% (p=0.000 n=10+10)
Hash512-32 732MB/s ± 1% 808MB/s ± 1% +10.41% (p=0.000 n=10+10)
Hash1K-32 786MB/s ± 0% 866MB/s ± 1% +10.30% (p=0.000 n=9+10)
Hash8K-32 839MB/s ± 0% 930MB/s ± 0% +10.79% (p=0.000 n=10+10)
Hash1M-32 849MB/s ± 1% 938MB/s ± 1% +10.54% (p=0.000 n=10+10)
Hash8M-32 841MB/s ± 1% 935MB/s ± 1% +11.16% (p=0.000 n=10+10)
Hash8BytesUnaligned-32 77.1MB/s ± 0% 83.4MB/s ± 0% +8.12% (p=0.000 n=10+10)
Hash1KUnaligned-32 778MB/s ± 1% 869MB/s ± 1% +11.64% (p=0.000 n=10+10)
Hash8KUnaligned-32 836MB/s ± 0% 932MB/s ± 1% +11.47% (p=0.000 n=10+10)
Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
This PR will be imported into Gerrit with the title and first
comment (this text) used to generate the subject and body of
the Gerrit change.
Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
GitHub-Last-Rev: ec8b15d789
GitHub-Pull-Request: golang/go#43690
Reviewed-on: https://go-review.googlesource.com/c/go/+/283538
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
This commit is contained in:
parent
f7b4f02ba0
commit
55d08e5010
@ -25,6 +25,7 @@ TEXT ·block(SB),NOSPLIT,$8-32
|
||||
MOVL (1*4)(BP), BX
|
||||
MOVL (2*4)(BP), CX
|
||||
MOVL (3*4)(BP), DX
|
||||
MOVL $0xffffffff, R11
|
||||
|
||||
CMPQ SI, DI
|
||||
JEQ end
|
||||
@ -40,14 +41,15 @@ loop:
|
||||
|
||||
#define ROUND1(a, b, c, d, index, const, shift) \
|
||||
XORL c, R9; \
|
||||
LEAL const(a)(R8*1), a; \
|
||||
ADDL $const, a; \
|
||||
ADDL R8, a; \
|
||||
ANDL b, R9; \
|
||||
XORL d, R9; \
|
||||
MOVL (index*4)(SI), R8; \
|
||||
ADDL R9, a; \
|
||||
ROLL $shift, a; \
|
||||
MOVL c, R9; \
|
||||
ADDL b, a
|
||||
XORL d, R9; \
|
||||
MOVL (index*4)(SI), R8; \
|
||||
ADDL R9, a; \
|
||||
ROLL $shift, a; \
|
||||
MOVL c, R9; \
|
||||
ADDL b, a
|
||||
|
||||
ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
|
||||
ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
|
||||
@ -64,21 +66,23 @@ loop:
|
||||
ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
|
||||
ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
|
||||
ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
|
||||
ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
|
||||
ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
|
||||
|
||||
MOVL (1*4)(SI), R8
|
||||
MOVL DX, R9
|
||||
MOVL DX, R10
|
||||
|
||||
// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
|
||||
|
||||
#define ROUND2(a, b, c, d, index, const, shift) \
|
||||
NOTL R9; \
|
||||
LEAL const(a)(R8*1),a; \
|
||||
XORL R11, R9; \
|
||||
ADDL $const, a; \
|
||||
ADDL R8, a; \
|
||||
ANDL b, R10; \
|
||||
ANDL c, R9; \
|
||||
MOVL (index*4)(SI),R8; \
|
||||
ORL R9, R10; \
|
||||
ADDL R9, a; \
|
||||
ADDL R10, a; \
|
||||
MOVL c, R9; \
|
||||
ADDL R10, a; \
|
||||
MOVL c, R10; \
|
||||
ROLL $shift, a; \
|
||||
ADDL b, a
|
||||
@ -98,22 +102,34 @@ loop:
|
||||
ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
|
||||
ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
|
||||
ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
|
||||
ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
|
||||
ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
|
||||
|
||||
MOVL (5*4)(SI), R8
|
||||
MOVL CX, R9
|
||||
|
||||
#define ROUND3(a, b, c, d, index, const, shift) \
|
||||
LEAL const(a)(R8*1),a; \
|
||||
MOVL (index*4)(SI),R8; \
|
||||
XORL d, R9; \
|
||||
// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
|
||||
|
||||
#define ROUND3FIRST(a, b, c, d, index, const, shift) \
|
||||
MOVL d, R9; \
|
||||
XORL c, R9; \
|
||||
XORL b, R9; \
|
||||
ADDL $const, a; \
|
||||
ADDL R8, a; \
|
||||
MOVL (index*4)(SI),R8; \
|
||||
ADDL R9, a; \
|
||||
ROLL $shift, a; \
|
||||
MOVL b, R9; \
|
||||
ADDL b, a
|
||||
|
||||
ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
|
||||
#define ROUND3(a, b, c, d, index, const, shift) \
|
||||
XORL a, R9; \
|
||||
XORL b, R9; \
|
||||
ADDL $const, a; \
|
||||
ADDL R8, a; \
|
||||
MOVL (index*4)(SI),R8; \
|
||||
ADDL R9, a; \
|
||||
ROLL $shift, a; \
|
||||
ADDL b, a
|
||||
|
||||
ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4);
|
||||
ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
|
||||
ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
|
||||
ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
|
||||
@ -130,13 +146,13 @@ loop:
|
||||
ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
|
||||
ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
|
||||
|
||||
MOVL (0*4)(SI), R8
|
||||
MOVL $0xffffffff, R9
|
||||
MOVL R11, R9
|
||||
XORL DX, R9
|
||||
|
||||
#define ROUND4(a, b, c, d, index, const, shift) \
|
||||
LEAL const(a)(R8*1),a; \
|
||||
ORL b, R9; \
|
||||
ADDL $const, a; \
|
||||
ADDL R8, a; \
|
||||
ORL b, R9; \
|
||||
XORL c, R9; \
|
||||
ADDL R9, a; \
|
||||
MOVL (index*4)(SI),R8; \
|
||||
|
Loading…
Reference in New Issue
Block a user