1
0
mirror of https://github.com/golang/go synced 2024-11-17 14:14:56 -07:00

crypto/md5: optimize amd64 assembly

* Use two ADDL instructions instead of one LEAL
* Keep all ones (0xffffffff) in R11
* Use XORL against R11, which has lower latency, instead of NOTL
* Remove the separate loads between rounds by loading the correct value in the last step of the previous round
* Reduce the dependency chain in round 2
* Remove a MOVL in round 3

name                    old time/op    new time/op    delta
Hash8Bytes-32              104ns ± 0%      96ns ± 1%   -7.83%   (p=0.000 n=9+10)
Hash64-32                  169ns ± 0%     155ns ± 0%   -7.97%  (p=0.000 n=10+10)
Hash128-32                 244ns ± 0%     224ns ± 0%   -8.16%   (p=0.000 n=9+10)
Hash256-32                 396ns ± 0%     360ns ± 1%   -9.01%  (p=0.000 n=10+10)
Hash512-32                 700ns ± 1%     634ns ± 1%   -9.43%  (p=0.000 n=10+10)
Hash1K-32                 1.30µs ± 0%    1.18µs ± 1%   -9.32%   (p=0.000 n=9+10)
Hash8K-32                 9.77µs ± 0%    8.81µs ± 0%   -9.78%   (p=0.000 n=9+10)
Hash1M-32                 1.24ms ± 1%    1.12ms ± 1%   -9.54%  (p=0.000 n=10+10)
Hash8M-32                 10.0ms ± 1%     9.0ms ± 1%  -10.04%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32     104ns ± 0%      96ns ± 0%   -7.50%  (p=0.000 n=10+10)
Hash1KUnaligned-32        1.32µs ± 1%    1.18µs ± 1%  -10.42%  (p=0.000 n=10+10)
Hash8KUnaligned-32        9.80µs ± 0%    8.79µs ± 1%  -10.29%  (p=0.000 n=10+10)

name                    old speed      new speed      delta
Hash8Bytes-32           77.1MB/s ± 0%  83.6MB/s ± 1%   +8.49%   (p=0.000 n=9+10)
Hash64-32                379MB/s ± 0%   412MB/s ± 0%   +8.66%  (p=0.000 n=10+10)
Hash128-32               525MB/s ± 0%   572MB/s ± 0%   +8.89%   (p=0.000 n=9+10)
Hash256-32               646MB/s ± 0%   710MB/s ± 1%   +9.90%  (p=0.000 n=10+10)
Hash512-32               732MB/s ± 1%   808MB/s ± 1%  +10.41%  (p=0.000 n=10+10)
Hash1K-32                786MB/s ± 0%   866MB/s ± 1%  +10.30%   (p=0.000 n=9+10)
Hash8K-32                839MB/s ± 0%   930MB/s ± 0%  +10.79%  (p=0.000 n=10+10)
Hash1M-32                849MB/s ± 1%   938MB/s ± 1%  +10.54%  (p=0.000 n=10+10)
Hash8M-32                841MB/s ± 1%   935MB/s ± 1%  +11.16%  (p=0.000 n=10+10)
Hash8BytesUnaligned-32  77.1MB/s ± 0%  83.4MB/s ± 0%   +8.12%  (p=0.000 n=10+10)
Hash1KUnaligned-32       778MB/s ± 1%   869MB/s ± 1%  +11.64%  (p=0.000 n=10+10)
Hash8KUnaligned-32       836MB/s ± 0%   932MB/s ± 1%  +11.47%  (p=0.000 n=10+10)

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993

This PR will be imported into Gerrit with the title and first
comment (this text) used to generate the subject and body of
the Gerrit change.

Change-Id: I02b31229b857e9257dc9d36538883eb3af4ad993
GitHub-Last-Rev: ec8b15d789
GitHub-Pull-Request: golang/go#43690
Reviewed-on: https://go-review.googlesource.com/c/go/+/283538
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: David Chase <drchase@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Joel Sing <joel@sing.id.au>
This commit is contained in:
Klaus Post 2021-10-06 16:11:40 +00:00 committed by Joel Sing
parent f7b4f02ba0
commit 55d08e5010

View File

@ -25,6 +25,7 @@ TEXT ·block(SB),NOSPLIT,$8-32
MOVL (1*4)(BP), BX
MOVL (2*4)(BP), CX
MOVL (3*4)(BP), DX
MOVL $0xffffffff, R11
CMPQ SI, DI
JEQ end
@ -40,14 +41,15 @@ loop:
#define ROUND1(a, b, c, d, index, const, shift) \
XORL c, R9; \
LEAL const(a)(R8*1), a; \
ADDL $const, a; \
ADDL R8, a; \
ANDL b, R9; \
XORL d, R9; \
MOVL (index*4)(SI), R8; \
ADDL R9, a; \
ROLL $shift, a; \
MOVL c, R9; \
ADDL b, a
XORL d, R9; \
MOVL (index*4)(SI), R8; \
ADDL R9, a; \
ROLL $shift, a; \
MOVL c, R9; \
ADDL b, a
ROUND1(AX,BX,CX,DX, 1,0xd76aa478, 7);
ROUND1(DX,AX,BX,CX, 2,0xe8c7b756,12);
@ -64,21 +66,23 @@ loop:
ROUND1(AX,BX,CX,DX,13,0x6b901122, 7);
ROUND1(DX,AX,BX,CX,14,0xfd987193,12);
ROUND1(CX,DX,AX,BX,15,0xa679438e,17);
ROUND1(BX,CX,DX,AX, 0,0x49b40821,22);
ROUND1(BX,CX,DX,AX, 1,0x49b40821,22);
MOVL (1*4)(SI), R8
MOVL DX, R9
MOVL DX, R10
// Uses https://github.com/animetosho/md5-optimisation#dependency-shortcut-in-g-function
#define ROUND2(a, b, c, d, index, const, shift) \
NOTL R9; \
LEAL const(a)(R8*1),a; \
XORL R11, R9; \
ADDL $const, a; \
ADDL R8, a; \
ANDL b, R10; \
ANDL c, R9; \
MOVL (index*4)(SI),R8; \
ORL R9, R10; \
ADDL R9, a; \
ADDL R10, a; \
MOVL c, R9; \
ADDL R10, a; \
MOVL c, R10; \
ROLL $shift, a; \
ADDL b, a
@ -98,22 +102,34 @@ loop:
ROUND2(AX,BX,CX,DX, 2,0xa9e3e905, 5);
ROUND2(DX,AX,BX,CX, 7,0xfcefa3f8, 9);
ROUND2(CX,DX,AX,BX,12,0x676f02d9,14);
ROUND2(BX,CX,DX,AX, 0,0x8d2a4c8a,20);
ROUND2(BX,CX,DX,AX, 5,0x8d2a4c8a,20);
MOVL (5*4)(SI), R8
MOVL CX, R9
#define ROUND3(a, b, c, d, index, const, shift) \
LEAL const(a)(R8*1),a; \
MOVL (index*4)(SI),R8; \
XORL d, R9; \
// Uses https://github.com/animetosho/md5-optimisation#h-function-re-use
#define ROUND3FIRST(a, b, c, d, index, const, shift) \
MOVL d, R9; \
XORL c, R9; \
XORL b, R9; \
ADDL $const, a; \
ADDL R8, a; \
MOVL (index*4)(SI),R8; \
ADDL R9, a; \
ROLL $shift, a; \
MOVL b, R9; \
ADDL b, a
ROUND3(AX,BX,CX,DX, 8,0xfffa3942, 4);
#define ROUND3(a, b, c, d, index, const, shift) \
XORL a, R9; \
XORL b, R9; \
ADDL $const, a; \
ADDL R8, a; \
MOVL (index*4)(SI),R8; \
ADDL R9, a; \
ROLL $shift, a; \
ADDL b, a
ROUND3FIRST(AX,BX,CX,DX, 8,0xfffa3942, 4);
ROUND3(DX,AX,BX,CX,11,0x8771f681,11);
ROUND3(CX,DX,AX,BX,14,0x6d9d6122,16);
ROUND3(BX,CX,DX,AX, 1,0xfde5380c,23);
@ -130,13 +146,13 @@ loop:
ROUND3(CX,DX,AX,BX, 2,0x1fa27cf8,16);
ROUND3(BX,CX,DX,AX, 0,0xc4ac5665,23);
MOVL (0*4)(SI), R8
MOVL $0xffffffff, R9
MOVL R11, R9
XORL DX, R9
#define ROUND4(a, b, c, d, index, const, shift) \
LEAL const(a)(R8*1),a; \
ORL b, R9; \
ADDL $const, a; \
ADDL R8, a; \
ORL b, R9; \
XORL c, R9; \
ADDL R9, a; \
MOVL (index*4)(SI),R8; \