From a274b3003b8d03a0384a5bbbf6bfe459fd2c4787 Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Tue, 8 Aug 2023 01:18:17 +1000 Subject: [PATCH] crypto/internal/bigmod: provide assembly addMulVVW* for riscv64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This provides an assembly implementation of addMulVVW* for riscv64, processing four words per loop, resulting in a performance gain of 23%+ for RSA decryption/signing on a StarFive VisionFive 2: │ rsa1 │ rsa2 │ │ sec/op │ sec/op vs base │ DecryptPKCS1v15/2048-4 24.29m ± 0% 18.65m ± 0% -23.24% (p=0.000 n=10) DecryptPKCS1v15/3072-4 73.28m ± 0% 54.08m ± 0% -26.20% (p=0.000 n=10) DecryptPKCS1v15/4096-4 163.5m ± 0% 119.1m ± 0% -27.17% (p=0.000 n=10) EncryptPKCS1v15/2048-4 1.505m ± 0% 1.446m ± 0% -3.93% (p=0.000 n=10) DecryptOAEP/2048-4 24.37m ± 0% 18.72m ± 0% -23.17% (p=0.000 n=10) EncryptOAEP/2048-4 1.570m ± 0% 1.510m ± 0% -3.84% (p=0.000 n=10) SignPKCS1v15/2048-4 24.52m ± 0% 18.80m ± 0% -23.36% (p=0.000 n=10) VerifyPKCS1v15/2048-4 1.491m ± 0% 1.431m ± 0% -4.00% (p=0.000 n=10) SignPSS/2048-4 24.60m ± 0% 18.89m ± 0% -23.21% (p=0.000 n=10) VerifyPSS/2048-4 1.565m ± 0% 1.504m ± 0% -3.87% (p=0.000 n=10) geomean 10.90m 9.066m -16.79% Change-Id: I8414ba0028b0781a945610abe02c285d2387aef3 Reviewed-on: https://go-review.googlesource.com/c/go/+/516536 Reviewed-by: Mark Ryan Reviewed-by: Filippo Valsorda Reviewed-by: Dmitri Shuralyov Reviewed-by: M Zhuo Reviewed-by: Michael Knyszek Run-TryBot: Joel Sing TryBot-Result: Gopher Robot --- src/crypto/internal/bigmod/nat_asm.go | 2 +- src/crypto/internal/bigmod/nat_noasm.go | 2 +- src/crypto/internal/bigmod/nat_riscv64.s | 91 ++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 src/crypto/internal/bigmod/nat_riscv64.s diff --git a/src/crypto/internal/bigmod/nat_asm.go b/src/crypto/internal/bigmod/nat_asm.go index 5eb91e1c6c2..0283b07e682 100644 --- a/src/crypto/internal/bigmod/nat_asm.go +++ b/src/crypto/internal/bigmod/nat_asm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) +//go:build !purego && (386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) package bigmod diff --git a/src/crypto/internal/bigmod/nat_noasm.go b/src/crypto/internal/bigmod/nat_noasm.go index eff12536f91..71f38da754b 100644 --- a/src/crypto/internal/bigmod/nat_noasm.go +++ b/src/crypto/internal/bigmod/nat_noasm.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || s390x) +//go:build purego || !(386 || amd64 || arm || arm64 || ppc64 || ppc64le || riscv64 || s390x) package bigmod diff --git a/src/crypto/internal/bigmod/nat_riscv64.s b/src/crypto/internal/bigmod/nat_riscv64.s new file mode 100644 index 00000000000..1d8c8c8900c --- /dev/null +++ b/src/crypto/internal/bigmod/nat_riscv64.s @@ -0,0 +1,91 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego + +#include "textflag.h" + +// func addMulVVW1024(z, x *uint, y uint) (c uint) +TEXT ·addMulVVW1024(SB),$0-32 + MOV $16, X30 + JMP addMulVVWx(SB) + +// func addMulVVW1536(z, x *uint, y uint) (c uint) +TEXT ·addMulVVW1536(SB),$0-32 + MOV $24, X30 + JMP addMulVVWx(SB) + +// func addMulVVW2048(z, x *uint, y uint) (c uint) +TEXT ·addMulVVW2048(SB),$0-32 + MOV $32, X30 + JMP addMulVVWx(SB) + +TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0 + MOV z+0(FP), X5 + MOV x+8(FP), X7 + MOV y+16(FP), X6 + MOV $0, X29 + + BEQZ X30, done +loop: + MOV 0*8(X5), X10 // z[0] + MOV 1*8(X5), X13 // z[1] + MOV 2*8(X5), X16 // z[2] + MOV 3*8(X5), X19 // z[3] + + MOV 0*8(X7), X8 // x[0] + MOV 1*8(X7), X11 // x[1] + MOV 2*8(X7), X14 // x[2] + MOV 3*8(X7), X17 // x[3] + + MULHU X8, X6, X9 // z_hi[0] = x[0] * y + MUL X8, X6, X8 // z_lo[0] = x[0] * y + ADD X8, X10, X21 // z_lo[0] = x[0] * y + z[0] + SLTU X8, X21, X22 + ADD X9, X22, X9 // z_hi[0] = x[0] * y + z[0] + ADD X21, X29, X10 // z_lo[0] = x[0] * y + z[0] + c + SLTU X21, X10, X22 + ADD X9, X22, X29 // next c + + MULHU X11, X6, X12 // z_hi[1] = x[1] * y + MUL X11, X6, X11 // z_lo[1] = x[1] * y + ADD X11, X13, X21 // z_lo[1] = x[1] * y + z[1] + SLTU X11, X21, X22 + ADD X12, X22, X12 // z_hi[1] = x[1] * y + z[1] + ADD X21, X29, X13 // z_lo[1] = x[1] * y + z[1] + c + SLTU X21, X13, X22 + ADD X12, X22, X29 // next c + + MULHU X14, X6, X15 // z_hi[2] = x[2] * y + MUL X14, X6, X14 // z_lo[2] = x[2] * y + ADD X14, X16, X21 // z_lo[2] = x[2] * y + z[2] + SLTU X14, X21, X22 + ADD X15, X22, X15 // z_hi[2] = x[2] * y + z[2] + ADD X21, X29, X16 // z_lo[2] = x[2] * y + z[2] + c + SLTU X21, X16, X22 + ADD X15, X22, X29 // next c + + MULHU X17, X6, X18 // z_hi[3] = x[3] * y + MUL X17, X6, X17 // z_lo[3] = x[3] * y + ADD X17, X19, X21 // z_lo[3] = x[3] * y + z[3] + SLTU X17, X21, X22 + ADD X18, X22, X18 // z_hi[3] = x[3] * y + z[3] + ADD X21, X29, X19 // z_lo[3] = x[3] * y + z[3] + c + SLTU X21, X19, X22 + ADD X18, X22, X29 // next c + + MOV X10, 0*8(X5) // z[0] + MOV X13, 1*8(X5) // z[1] + MOV X16, 2*8(X5) // z[2] + MOV X19, 3*8(X5) // z[3] + + ADDI $32, X5 + ADDI $32, X7 + + ADDI $-4, X30 + BNEZ X30, loop + +done: + MOV X29, c+24(FP) + RET