1
0
mirror of https://github.com/golang/go synced 2024-11-13 13:00:32 -07:00

crypto/md5: implement md5block in hardware on loong64

goos: linux
goarch: loong64
pkg: crypto/md5
cpu: Loongson-3A6000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes             276.6n ± 0%   219.7n ± 0%  -20.57% (p=0.000 n=20)
Hash64                 445.8n ± 0%   339.9n ± 0%  -23.76% (p=0.000 n=20)
Hash128                632.0n ± 0%   468.1n ± 0%  -25.93% (p=0.000 n=20)
Hash256               1005.0n ± 0%   723.8n ± 0%  -27.98% (p=0.000 n=20)
Hash512                1.749µ ± 0%   1.238µ ± 0%  -29.22% (p=0.000 n=20)
Hash1K                 3.238µ ± 0%   2.265µ ± 0%  -30.05% (p=0.000 n=20)
Hash8K                 24.09µ ± 0%   16.66µ ± 0%  -30.83% (p=0.000 n=20)
Hash1M                 3.049m ± 0%   2.105m ± 0%  -30.97% (p=0.000 n=20)
Hash8M                 24.39m ± 0%   16.84m ± 0%  -30.97% (p=0.000 n=20)
Hash8BytesUnaligned    284.1n ± 0%   227.2n ± 0%  -20.03% (p=0.000 n=20)
Hash1KUnaligned        3.238µ ± 0%   2.265µ ± 0%  -30.05% (p=0.000 n=20)
Hash8KUnaligned        24.09µ ± 0%   16.66µ ± 0%  -30.82% (p=0.000 n=20)
geomean                7.142µ        5.164µ       -27.70%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
Hash8Bytes            27.58Mi ± 0%   34.73Mi ± 0%  +25.93% (p=0.000 n=20)
Hash64                136.9Mi ± 0%   179.6Mi ± 0%  +31.15% (p=0.000 n=20)
Hash128               193.1Mi ± 0%   260.8Mi ± 0%  +35.03% (p=0.000 n=20)
Hash256               243.0Mi ± 0%   337.3Mi ± 0%  +38.82% (p=0.000 n=20)
Hash512               279.1Mi ± 0%   394.3Mi ± 0%  +41.25% (p=0.000 n=20)
Hash1K                301.6Mi ± 0%   431.1Mi ± 0%  +42.94% (p=0.000 n=20)
Hash8K                324.3Mi ± 0%   468.9Mi ± 0%  +44.56% (p=0.000 n=20)
Hash1M                327.9Mi ± 0%   475.0Mi ± 0%  +44.86% (p=0.000 n=20)
Hash8M                328.0Mi ± 0%   475.1Mi ± 0%  +44.86% (p=0.000 n=20)
Hash8BytesUnaligned   26.86Mi ± 0%   33.58Mi ± 0%  +25.04% (p=0.000 n=20)
Hash1KUnaligned       301.6Mi ± 0%   431.1Mi ± 0%  +42.95% (p=0.000 n=20)
Hash8KUnaligned       324.3Mi ± 0%   468.9Mi ± 0%  +44.56% (p=0.000 n=20)
geomean               182.5Mi        252.4Mi       +38.31%

goos: linux
goarch: loong64
pkg: crypto/md5
cpu: Loongson-3A5000 @ 2500.00MHz
                    │  bench.old   │              bench.new              │
                    │    sec/op    │   sec/op     vs base                │
Hash8Bytes             346.0n ± 0%   289.1n ± 0%  -16.45% (p=0.000 n=20)
Hash64                 521.2n ± 0%   409.3n ± 0%  -21.47% (p=0.000 n=20)
Hash128                707.1n ± 0%   537.8n ± 0%  -23.94% (p=0.000 n=20)
Hash256               1080.0n ± 0%   795.8n ± 0%  -26.31% (p=0.000 n=20)
Hash512                1.826µ ± 0%   1.311µ ± 0%  -28.20% (p=0.000 n=20)
Hash1K                 3.315µ ± 0%   2.342µ ± 0%  -29.35% (p=0.000 n=20)
Hash8K                 24.19µ ± 0%   16.78µ ± 0%  -30.65% (p=0.000 n=20)
Hash1M                 3.052m ± 0%   2.110m ± 0%  -30.86% (p=0.000 n=20)
Hash8M                 24.41m ± 0%   16.88m ± 0%  -30.85% (p=0.000 n=20)
Hash8BytesUnaligned    345.9n ± 0%   289.0n ± 0%  -16.45% (p=0.000 n=20)
Hash1KUnaligned        3.316µ ± 0%   2.342µ ± 0%  -29.37% (p=0.000 n=20)
Hash8KUnaligned        24.19µ ± 0%   16.78µ ± 0%  -30.66% (p=0.000 n=20)
geomean                7.673µ        5.648µ       -26.39%

                    │  bench.old   │              bench.new               │
                    │     B/s      │     B/s       vs base                │
Hash8Bytes            22.05Mi ± 0%   26.39Mi ± 0%  +19.68% (p=0.000 n=20)
Hash64                117.1Mi ± 0%   149.1Mi ± 0%  +27.32% (p=0.000 n=20)
Hash128               172.6Mi ± 0%   227.0Mi ± 0%  +31.49% (p=0.000 n=20)
Hash256               226.0Mi ± 0%   306.8Mi ± 0%  +35.77% (p=0.000 n=20)
Hash512               267.4Mi ± 0%   372.5Mi ± 0%  +39.26% (p=0.000 n=20)
Hash1K                294.6Mi ± 0%   417.0Mi ± 0%  +41.53% (p=0.000 n=20)
Hash8K                322.9Mi ± 0%   465.7Mi ± 0%  +44.20% (p=0.000 n=20)
Hash1M                327.7Mi ± 0%   474.0Mi ± 0%  +44.64% (p=0.000 n=20)
Hash8M                327.8Mi ± 0%   474.1Mi ± 0%  +44.62% (p=0.000 n=20)
Hash8BytesUnaligned   22.06Mi ± 0%   26.40Mi ± 0%  +19.67% (p=0.000 n=20)
Hash1KUnaligned       294.5Mi ± 0%   417.0Mi ± 0%  +41.60% (p=0.000 n=20)
Hash8KUnaligned       322.9Mi ± 0%   465.7Mi ± 0%  +44.21% (p=0.000 n=20)
geomean               169.9Mi        230.8Mi       +35.85%

Change-Id: Iffddd60e3fc0b3bb265289f836a2d875f0805f64
Reviewed-on: https://go-review.googlesource.com/c/go/+/589540
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn>
This commit is contained in:
Xiaolin Zhao 2024-06-03 17:04:53 +08:00 committed by abner chenc
parent 4a9d98d49a
commit 6edc1c23ed
3 changed files with 182 additions and 2 deletions

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (amd64 || 386 || arm || ppc64le || ppc64 || s390x || arm64) && !purego
//go:build (386 || amd64 || arm || arm64 || loong64 || ppc64 || ppc64le || s390x) && !purego
package md5

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (!amd64 && !386 && !arm && !ppc64le && !ppc64 && !s390x && !arm64) || purego
//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !ppc64 && !ppc64le && !s390x) || purego
package md5

View File

@ -0,0 +1,180 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Loong64 version of md5block.go
// derived from crypto/md5/md5block_amd64.s
//go:build !purego
#define REGTMP R30
#define REGTMP1 R12
#define REGTMP2 R18
#include "textflag.h"
// func block(dig *digest, p []byte)
TEXT ·block(SB),NOSPLIT,$0-32
MOVV dig+0(FP), R4
MOVV p+8(FP), R5
MOVV p_len+16(FP), R6
AND $~63, R6
BEQ R6, zero
// p_len >= 64
ADDV R5, R6, R24
MOVW (0*4)(R4), R7
MOVW (1*4)(R4), R8
MOVW (2*4)(R4), R9
MOVW (3*4)(R4), R10
loop:
MOVW R7, R14
MOVW R8, R15
MOVW R9, R16
MOVW R10, R17
MOVW (0*4)(R5), R11
MOVW R10, REGTMP1
// F = ((c ^ d) & b) ^ d
#define ROUND1(a, b, c, d, index, const, shift) \
ADDV $const, a; \
ADD R11, a; \
MOVW (index*4)(R5), R11; \
XOR c, REGTMP1; \
AND b, REGTMP1; \
XOR d, REGTMP1; \
ADD REGTMP1, a; \
ROTR $(32-shift), a; \
MOVW c, REGTMP1; \
ADD b, a
ROUND1(R7, R8, R9, R10, 1, 0xd76aa478, 7);
ROUND1(R10, R7, R8, R9, 2, 0xe8c7b756, 12);
ROUND1(R9, R10, R7, R8, 3, 0x242070db, 17);
ROUND1(R8, R9, R10, R7, 4, 0xc1bdceee, 22);
ROUND1(R7, R8, R9, R10, 5, 0xf57c0faf, 7);
ROUND1(R10, R7, R8, R9, 6, 0x4787c62a, 12);
ROUND1(R9, R10, R7, R8, 7, 0xa8304613, 17);
ROUND1(R8, R9, R10, R7, 8, 0xfd469501, 22);
ROUND1(R7, R8, R9, R10, 9, 0x698098d8, 7);
ROUND1(R10, R7, R8, R9, 10, 0x8b44f7af, 12);
ROUND1(R9, R10, R7, R8, 11, 0xffff5bb1, 17);
ROUND1(R8, R9, R10, R7, 12, 0x895cd7be, 22);
ROUND1(R7, R8, R9, R10, 13, 0x6b901122, 7);
ROUND1(R10, R7, R8, R9, 14, 0xfd987193, 12);
ROUND1(R9, R10, R7, R8, 15, 0xa679438e, 17);
ROUND1(R8, R9, R10, R7, 1, 0x49b40821, 22);
MOVW (1*4)(R5), R11
// F = ((b ^ c) & d) ^ c
#define ROUND2(a, b, c, d, index, const, shift) \
ADDV $const, a; \
ADD R11, a; \
MOVW (index*4)(R5), R11; \
XOR b, c, REGTMP; \
AND REGTMP, d, REGTMP; \
XOR REGTMP, c, REGTMP; \
ADD REGTMP, a; \
ROTR $(32-shift), a; \
ADD b, a
ROUND2(R7, R8, R9, R10, 6, 0xf61e2562, 5);
ROUND2(R10, R7, R8, R9, 11, 0xc040b340, 9);
ROUND2(R9, R10, R7, R8, 0, 0x265e5a51, 14);
ROUND2(R8, R9, R10, R7, 5, 0xe9b6c7aa, 20);
ROUND2(R7, R8, R9, R10, 10, 0xd62f105d, 5);
ROUND2(R10, R7, R8, R9, 15, 0x2441453, 9);
ROUND2(R9, R10, R7, R8, 4, 0xd8a1e681, 14);
ROUND2(R8, R9, R10, R7, 9, 0xe7d3fbc8, 20);
ROUND2(R7, R8, R9, R10, 14, 0x21e1cde6, 5);
ROUND2(R10, R7, R8, R9, 3, 0xc33707d6, 9);
ROUND2(R9, R10, R7, R8, 8, 0xf4d50d87, 14);
ROUND2(R8, R9, R10, R7, 13, 0x455a14ed, 20);
ROUND2(R7, R8, R9, R10, 2, 0xa9e3e905, 5);
ROUND2(R10, R7, R8, R9, 7, 0xfcefa3f8, 9);
ROUND2(R9, R10, R7, R8, 12, 0x676f02d9, 14);
ROUND2(R8, R9, R10, R7, 5, 0x8d2a4c8a, 20);
MOVW (5*4)(R5), R11
MOVW R9, REGTMP1
// F = b ^ c ^ d
#define ROUND3(a, b, c, d, index, const, shift) \
ADDV $const, a; \
ADD R11, a; \
MOVW (index*4)(R5), R11; \
XOR d, REGTMP1; \
XOR b, REGTMP1; \
ADD REGTMP1, a; \
ROTR $(32-shift), a; \
MOVW b, REGTMP1; \
ADD b, a
ROUND3(R7, R8, R9, R10, 8, 0xfffa3942, 4);
ROUND3(R10, R7, R8, R9, 11, 0x8771f681, 11);
ROUND3(R9, R10, R7, R8, 14, 0x6d9d6122, 16);
ROUND3(R8, R9, R10, R7, 1, 0xfde5380c, 23);
ROUND3(R7, R8, R9, R10, 4, 0xa4beea44, 4);
ROUND3(R10, R7, R8, R9, 7, 0x4bdecfa9, 11);
ROUND3(R9, R10, R7, R8, 10, 0xf6bb4b60, 16);
ROUND3(R8, R9, R10, R7, 13, 0xbebfbc70, 23);
ROUND3(R7, R8, R9, R10, 0, 0x289b7ec6, 4);
ROUND3(R10, R7, R8, R9, 3, 0xeaa127fa, 11);
ROUND3(R9, R10, R7, R8, 6, 0xd4ef3085, 16);
ROUND3(R8, R9, R10, R7, 9, 0x4881d05, 23);
ROUND3(R7, R8, R9, R10, 12, 0xd9d4d039, 4);
ROUND3(R10, R7, R8, R9, 15, 0xe6db99e5, 11);
ROUND3(R9, R10, R7, R8, 2, 0x1fa27cf8, 16);
ROUND3(R8, R9, R10, R7, 0, 0xc4ac5665, 23);
MOVW (0*4)(R5), R11
MOVV $0xffffffff, REGTMP2
XOR R10, REGTMP2, REGTMP1 // REGTMP1 = ~d
// F = c ^ (b | (~d))
#define ROUND4(a, b, c, d, index, const, shift) \
ADDV $const, a; \
ADD R11, a; \
MOVW (index*4)(R5), R11; \
OR b, REGTMP1; \
XOR c, REGTMP1; \
ADD REGTMP1, a; \
ROTR $(32-shift), a; \
MOVV $0xffffffff, REGTMP2; \
XOR c, REGTMP2, REGTMP1; \
ADD b, a
ROUND4(R7, R8, R9, R10, 7, 0xf4292244, 6);
ROUND4(R10, R7, R8, R9, 14, 0x432aff97, 10);
ROUND4(R9, R10, R7, R8, 5, 0xab9423a7, 15);
ROUND4(R8, R9, R10, R7, 12, 0xfc93a039, 21);
ROUND4(R7, R8, R9, R10, 3, 0x655b59c3, 6);
ROUND4(R10, R7, R8, R9, 10, 0x8f0ccc92, 10);
ROUND4(R9, R10, R7, R8, 1, 0xffeff47d, 15);
ROUND4(R8, R9, R10, R7, 8, 0x85845dd1, 21);
ROUND4(R7, R8, R9, R10, 15, 0x6fa87e4f, 6);
ROUND4(R10, R7, R8, R9, 6, 0xfe2ce6e0, 10);
ROUND4(R9, R10, R7, R8, 13, 0xa3014314, 15);
ROUND4(R8, R9, R10, R7, 4, 0x4e0811a1, 21);
ROUND4(R7, R8, R9, R10, 11, 0xf7537e82, 6);
ROUND4(R10, R7, R8, R9, 2, 0xbd3af235, 10);
ROUND4(R9, R10, R7, R8, 9, 0x2ad7d2bb, 15);
ROUND4(R8, R9, R10, R7, 0, 0xeb86d391, 21);
ADD R14, R7
ADD R15, R8
ADD R16, R9
ADD R17, R10
ADDV $64, R5
BNE R5, R24, loop
MOVW R7, (0*4)(R4)
MOVW R8, (1*4)(R4)
MOVW R9, (2*4)(R4)
MOVW R10, (3*4)(R4)
zero:
RET