mirror of
https://github.com/golang/go
synced 2024-11-25 08:47:56 -07:00
crypto/sha1: implement sha1block in hardware on loong64
goos: linux goarch: loong64 pkg: crypto/sha1 cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes/New 489.8n ± 0% 280.6n ± 0% -42.71% (p=0.000 n=20) Hash8Bytes/Sum 496.6n ± 0% 288.9n ± 0% -41.82% (p=0.000 n=20) Hash320Bytes/New 2251.0n ± 0% 992.0n ± 0% -55.93% (p=0.000 n=20) Hash320Bytes/Sum 2258.0n ± 0% 998.0n ± 0% -55.80% (p=0.000 n=20) Hash1K/New 6.113µ ± 0% 2.583µ ± 0% -57.75% (p=0.000 n=20) Hash1K/Sum 6.117µ ± 0% 2.588µ ± 0% -57.69% (p=0.000 n=20) Hash8K/New 45.42µ ± 0% 18.79µ ± 0% -58.63% (p=0.000 n=20) Hash8K/Sum 45.43µ ± 0% 18.80µ ± 0% -58.62% (p=0.000 n=20) geomean 4.192µ 1.926µ -54.05% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes/New 15.57Mi ± 0% 27.19Mi ± 0% +74.59% (p=0.000 n=20) Hash8Bytes/Sum 15.36Mi ± 0% 26.41Mi ± 0% +71.88% (p=0.000 n=20) Hash320Bytes/New 135.6Mi ± 0% 307.6Mi ± 0% +126.90% (p=0.000 n=20) Hash320Bytes/Sum 135.2Mi ± 0% 305.8Mi ± 0% +126.22% (p=0.000 n=20) Hash1K/New 159.8Mi ± 0% 378.1Mi ± 0% +136.69% (p=0.000 n=20) Hash1K/Sum 159.7Mi ± 0% 377.4Mi ± 0% +136.38% (p=0.000 n=20) Hash8K/New 172.0Mi ± 0% 415.8Mi ± 0% +141.75% (p=0.000 n=20) Hash8K/Sum 172.0Mi ± 0% 415.6Mi ± 0% +141.65% (p=0.000 n=20) geomean 87.09Mi 189.5Mi +117.64% goos: linux goarch: loong64 pkg: crypto/sha1 cpu: Loongson-3A5000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes/New 565.9n ± 0% 374.5n ± 1% -33.82% (p=0.000 n=20) Hash8Bytes/Sum 571.3n ± 0% 366.7n ± 1% -35.81% (p=0.000 n=20) Hash320Bytes/New 2.662µ ± 0% 1.201µ ± 0% -54.88% (p=0.000 n=20) Hash320Bytes/Sum 2.662µ ± 0% 1.194µ ± 0% -55.15% (p=0.000 n=20) Hash1K/New 7.171µ ± 0% 3.084µ ± 0% -56.99% (p=0.000 n=20) Hash1K/Sum 7.171µ ± 0% 3.076µ ± 0% -57.11% (p=0.000 n=20) Hash8K/New 53.10µ ± 0% 22.24µ ± 0% -58.12% (p=0.000 n=20) Hash8K/Sum 53.09µ ± 0% 22.23µ ± 0% -58.12% (p=0.000 n=20) geomean 4.900µ 2.348µ -52.08% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes/New 13.48Mi ± 0% 20.38Mi ± 1% +51.10% (p=0.000 n=20) Hash8Bytes/Sum 13.35Mi ± 0% 20.80Mi ± 1% +55.82% (p=0.000 n=20) Hash320Bytes/New 114.6Mi ± 0% 254.0Mi ± 1% +121.61% (p=0.000 n=20) Hash320Bytes/Sum 114.6Mi ± 0% 255.6Mi ± 0% +123.00% (p=0.000 n=20) Hash1K/New 136.2Mi ± 0% 316.7Mi ± 0% +132.54% (p=0.000 n=20) Hash1K/Sum 136.2Mi ± 0% 317.5Mi ± 0% +133.19% (p=0.000 n=20) Hash8K/New 147.1Mi ± 0% 351.3Mi ± 0% +138.79% (p=0.000 n=20) Hash8K/Sum 147.2Mi ± 0% 351.4Mi ± 0% +138.78% (p=0.000 n=20) geomean 74.51Mi 155.5Mi +108.69% Change-Id: I716babd19c18dc2c3314d972ced9d83de2d93cb2 Reviewed-on: https://go-review.googlesource.com/c/go/+/589775 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
parent
400e6b68b5
commit
64a5d1d7de
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (arm || 386 || s390x) && !purego
|
||||
//go:build (386 || arm || loong64 || s390x) && !purego
|
||||
|
||||
package sha1
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build (!amd64 && !386 && !arm && !s390x && !arm64) || purego
|
||||
//go:build (!386 && !amd64 && !arm && !arm64 && !loong64 && !s390x) || purego
|
||||
|
||||
package sha1
|
||||
|
||||
|
226
src/crypto/sha1/sha1block_loong64.s
Normal file
226
src/crypto/sha1/sha1block_loong64.s
Normal file
@ -0,0 +1,226 @@
|
||||
// Copyright 2024 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !purego
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// SHA-1 block routine. See sha1block.go for Go equivalent.
|
||||
//
|
||||
// There are 80 rounds of 4 types:
|
||||
// - rounds 0-15 are type 1 and load data (ROUND1 macro).
|
||||
// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
|
||||
// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
|
||||
// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
|
||||
// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
|
||||
//
|
||||
// Each round loads or shuffles the data, then computes a per-round
|
||||
// function of b, c, d, and then mixes the result into and rotates the
|
||||
// five registers a, b, c, d, e holding the intermediate results.
|
||||
//
|
||||
// The register rotation is implemented by rotating the arguments to
|
||||
// the round macros instead of by explicit move instructions.
|
||||
|
||||
#define REGTMP R30
|
||||
#define REGTMP1 R17
|
||||
#define REGTMP2 R18
|
||||
#define REGTMP3 R19
|
||||
|
||||
#define LOAD1(index) \
|
||||
MOVW (index*4)(R5), REGTMP3; \
|
||||
WORD $0x3a73; \ // REVB2W REGTMP3, REGTMP3 to big-endian
|
||||
MOVW REGTMP3, (index*4)(R3)
|
||||
|
||||
#define LOAD(index) \
|
||||
MOVW (((index)&0xf)*4)(R3), REGTMP3; \
|
||||
MOVW (((index-3)&0xf)*4)(R3), REGTMP; \
|
||||
MOVW (((index-8)&0xf)*4)(R3), REGTMP1; \
|
||||
MOVW (((index-14)&0xf)*4)(R3), REGTMP2; \
|
||||
XOR REGTMP, REGTMP3; \
|
||||
XOR REGTMP1, REGTMP3; \
|
||||
XOR REGTMP2, REGTMP3; \
|
||||
ROTR $31, REGTMP3; \
|
||||
MOVW REGTMP3, (((index)&0xf)*4)(R3)
|
||||
|
||||
// f = d ^ (b & (c ^ d))
|
||||
#define FUNC1(a, b, c, d, e) \
|
||||
XOR c, d, REGTMP1; \
|
||||
AND b, REGTMP1; \
|
||||
XOR d, REGTMP1
|
||||
|
||||
// f = b ^ c ^ d
|
||||
#define FUNC2(a, b, c, d, e) \
|
||||
XOR b, c, REGTMP1; \
|
||||
XOR d, REGTMP1
|
||||
|
||||
// f = (b & c) | ((b | c) & d)
|
||||
#define FUNC3(a, b, c, d, e) \
|
||||
OR b, c, REGTMP2; \
|
||||
AND b, c, REGTMP; \
|
||||
AND d, REGTMP2; \
|
||||
OR REGTMP, REGTMP2, REGTMP1
|
||||
|
||||
#define FUNC4 FUNC2
|
||||
|
||||
#define MIX(a, b, c, d, e, const) \
|
||||
ROTR $2, b; \ // b << 30
|
||||
ADD REGTMP1, e; \ // e = e + f
|
||||
ROTR $27, a, REGTMP2; \ // a << 5
|
||||
ADD REGTMP3, e; \ // e = e + w[i]
|
||||
ADDV $const, e; \ // e = e + k
|
||||
ADD REGTMP2, e // e = e + a<<5
|
||||
|
||||
#define ROUND1(a, b, c, d, e, index) \
|
||||
LOAD1(index); \
|
||||
FUNC1(a, b, c, d, e); \
|
||||
MIX(a, b, c, d, e, 0x5A827999)
|
||||
|
||||
#define ROUND1x(a, b, c, d, e, index) \
|
||||
LOAD(index); \
|
||||
FUNC1(a, b, c, d, e); \
|
||||
MIX(a, b, c, d, e, 0x5A827999)
|
||||
|
||||
#define ROUND2(a, b, c, d, e, index) \
|
||||
LOAD(index); \
|
||||
FUNC2(a, b, c, d, e); \
|
||||
MIX(a, b, c, d, e, 0x6ED9EBA1)
|
||||
|
||||
#define ROUND3(a, b, c, d, e, index) \
|
||||
LOAD(index); \
|
||||
FUNC3(a, b, c, d, e); \
|
||||
MIX(a, b, c, d, e, 0x8F1BBCDC)
|
||||
|
||||
#define ROUND4(a, b, c, d, e, index) \
|
||||
LOAD(index); \
|
||||
FUNC4(a, b, c, d, e); \
|
||||
MIX(a, b, c, d, e, 0xCA62C1D6)
|
||||
|
||||
// A stack frame size of 64 bytes is required here, because
|
||||
// the frame size used for data expansion is 64 bytes.
|
||||
// See the definition of the macro LOAD above, and the definition
|
||||
// of the local variable w in the general implementation (sha1block.go).
|
||||
TEXT ·block(SB),NOSPLIT,$64-32
|
||||
MOVV dig+0(FP), R4
|
||||
MOVV p_base+8(FP), R5
|
||||
MOVV p_len+16(FP), R6
|
||||
AND $~63, R6
|
||||
BEQ R6, zero
|
||||
|
||||
// p_len >= 64
|
||||
ADDV R5, R6, R24
|
||||
MOVW (0*4)(R4), R7
|
||||
MOVW (1*4)(R4), R8
|
||||
MOVW (2*4)(R4), R9
|
||||
MOVW (3*4)(R4), R10
|
||||
MOVW (4*4)(R4), R11
|
||||
|
||||
loop:
|
||||
MOVW R7, R12
|
||||
MOVW R8, R13
|
||||
MOVW R9, R14
|
||||
MOVW R10, R15
|
||||
MOVW R11, R16
|
||||
|
||||
ROUND1(R7, R8, R9, R10, R11, 0)
|
||||
ROUND1(R11, R7, R8, R9, R10, 1)
|
||||
ROUND1(R10, R11, R7, R8, R9, 2)
|
||||
ROUND1(R9, R10, R11, R7, R8, 3)
|
||||
ROUND1(R8, R9, R10, R11, R7, 4)
|
||||
ROUND1(R7, R8, R9, R10, R11, 5)
|
||||
ROUND1(R11, R7, R8, R9, R10, 6)
|
||||
ROUND1(R10, R11, R7, R8, R9, 7)
|
||||
ROUND1(R9, R10, R11, R7, R8, 8)
|
||||
ROUND1(R8, R9, R10, R11, R7, 9)
|
||||
ROUND1(R7, R8, R9, R10, R11, 10)
|
||||
ROUND1(R11, R7, R8, R9, R10, 11)
|
||||
ROUND1(R10, R11, R7, R8, R9, 12)
|
||||
ROUND1(R9, R10, R11, R7, R8, 13)
|
||||
ROUND1(R8, R9, R10, R11, R7, 14)
|
||||
ROUND1(R7, R8, R9, R10, R11, 15)
|
||||
|
||||
ROUND1x(R11, R7, R8, R9, R10, 16)
|
||||
ROUND1x(R10, R11, R7, R8, R9, 17)
|
||||
ROUND1x(R9, R10, R11, R7, R8, 18)
|
||||
ROUND1x(R8, R9, R10, R11, R7, 19)
|
||||
|
||||
ROUND2(R7, R8, R9, R10, R11, 20)
|
||||
ROUND2(R11, R7, R8, R9, R10, 21)
|
||||
ROUND2(R10, R11, R7, R8, R9, 22)
|
||||
ROUND2(R9, R10, R11, R7, R8, 23)
|
||||
ROUND2(R8, R9, R10, R11, R7, 24)
|
||||
ROUND2(R7, R8, R9, R10, R11, 25)
|
||||
ROUND2(R11, R7, R8, R9, R10, 26)
|
||||
ROUND2(R10, R11, R7, R8, R9, 27)
|
||||
ROUND2(R9, R10, R11, R7, R8, 28)
|
||||
ROUND2(R8, R9, R10, R11, R7, 29)
|
||||
ROUND2(R7, R8, R9, R10, R11, 30)
|
||||
ROUND2(R11, R7, R8, R9, R10, 31)
|
||||
ROUND2(R10, R11, R7, R8, R9, 32)
|
||||
ROUND2(R9, R10, R11, R7, R8, 33)
|
||||
ROUND2(R8, R9, R10, R11, R7, 34)
|
||||
ROUND2(R7, R8, R9, R10, R11, 35)
|
||||
ROUND2(R11, R7, R8, R9, R10, 36)
|
||||
ROUND2(R10, R11, R7, R8, R9, 37)
|
||||
ROUND2(R9, R10, R11, R7, R8, 38)
|
||||
ROUND2(R8, R9, R10, R11, R7, 39)
|
||||
|
||||
ROUND3(R7, R8, R9, R10, R11, 40)
|
||||
ROUND3(R11, R7, R8, R9, R10, 41)
|
||||
ROUND3(R10, R11, R7, R8, R9, 42)
|
||||
ROUND3(R9, R10, R11, R7, R8, 43)
|
||||
ROUND3(R8, R9, R10, R11, R7, 44)
|
||||
ROUND3(R7, R8, R9, R10, R11, 45)
|
||||
ROUND3(R11, R7, R8, R9, R10, 46)
|
||||
ROUND3(R10, R11, R7, R8, R9, 47)
|
||||
ROUND3(R9, R10, R11, R7, R8, 48)
|
||||
ROUND3(R8, R9, R10, R11, R7, 49)
|
||||
ROUND3(R7, R8, R9, R10, R11, 50)
|
||||
ROUND3(R11, R7, R8, R9, R10, 51)
|
||||
ROUND3(R10, R11, R7, R8, R9, 52)
|
||||
ROUND3(R9, R10, R11, R7, R8, 53)
|
||||
ROUND3(R8, R9, R10, R11, R7, 54)
|
||||
ROUND3(R7, R8, R9, R10, R11, 55)
|
||||
ROUND3(R11, R7, R8, R9, R10, 56)
|
||||
ROUND3(R10, R11, R7, R8, R9, 57)
|
||||
ROUND3(R9, R10, R11, R7, R8, 58)
|
||||
ROUND3(R8, R9, R10, R11, R7, 59)
|
||||
|
||||
ROUND4(R7, R8, R9, R10, R11, 60)
|
||||
ROUND4(R11, R7, R8, R9, R10, 61)
|
||||
ROUND4(R10, R11, R7, R8, R9, 62)
|
||||
ROUND4(R9, R10, R11, R7, R8, 63)
|
||||
ROUND4(R8, R9, R10, R11, R7, 64)
|
||||
ROUND4(R7, R8, R9, R10, R11, 65)
|
||||
ROUND4(R11, R7, R8, R9, R10, 66)
|
||||
ROUND4(R10, R11, R7, R8, R9, 67)
|
||||
ROUND4(R9, R10, R11, R7, R8, 68)
|
||||
ROUND4(R8, R9, R10, R11, R7, 69)
|
||||
ROUND4(R7, R8, R9, R10, R11, 70)
|
||||
ROUND4(R11, R7, R8, R9, R10, 71)
|
||||
ROUND4(R10, R11, R7, R8, R9, 72)
|
||||
ROUND4(R9, R10, R11, R7, R8, 73)
|
||||
ROUND4(R8, R9, R10, R11, R7, 74)
|
||||
ROUND4(R7, R8, R9, R10, R11, 75)
|
||||
ROUND4(R11, R7, R8, R9, R10, 76)
|
||||
ROUND4(R10, R11, R7, R8, R9, 77)
|
||||
ROUND4(R9, R10, R11, R7, R8, 78)
|
||||
ROUND4(R8, R9, R10, R11, R7, 79)
|
||||
|
||||
ADD R12, R7
|
||||
ADD R13, R8
|
||||
ADD R14, R9
|
||||
ADD R15, R10
|
||||
ADD R16, R11
|
||||
|
||||
ADDV $64, R5
|
||||
BNE R5, R24, loop
|
||||
|
||||
end:
|
||||
MOVW R7, (0*4)(R4)
|
||||
MOVW R8, (1*4)(R4)
|
||||
MOVW R9, (2*4)(R4)
|
||||
MOVW R10, (3*4)(R4)
|
||||
MOVW R11, (4*4)(R4)
|
||||
zero:
|
||||
RET
|
Loading…
Reference in New Issue
Block a user