
crypto/sha512: reduce add usage on PPC64

Similar to sha256, minimize add usage by preloading the kcon table
offsets into registers, so each round can issue an indexed load
without incrementing a pointer. This results in a small performance
uplift.

Likewise, clean up some unused macros and registers to make room for
the constants.
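
For illustration, the per-round constant load goes from a
post-incremented index register to a fixed, preloaded offset (a
simplified sketch of the round-macro change in the diff below; idx
stands for one of the preloaded R_x0?? offset registers):

    // Before: advance the kcon index with an ADD every round.
    LVX  (TBL)(IDX), KI
    ADD  $16, IDX

    // After: offsets 0x000, 0x010, ... are loaded into registers once
    // outside the loop, so each round issues only the indexed load.
    LVX  (TBL)(idx), KI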

On ppc64le/power9:

name               old speed      new speed      delta
Hash8Bytes/New     22.7MB/s ± 0%  24.1MB/s ± 0%  +6.49%
Hash8Bytes/Sum384  23.4MB/s ± 0%  24.9MB/s ± 0%  +6.32%
Hash8Bytes/Sum512  23.5MB/s ± 0%  24.9MB/s ± 0%  +6.18%
Hash1K/New          422MB/s ± 0%   455MB/s ± 0%  +7.92%
Hash1K/Sum384       424MB/s ± 0%   457MB/s ± 0%  +7.78%
Hash1K/Sum512       424MB/s ± 0%   457MB/s ± 0%  +7.77%
Hash8K/New          488MB/s ± 0%   528MB/s ± 0%  +8.18%
Hash8K/Sum384       481MB/s ± 0%   528MB/s ± 0%  +9.76%
Hash8K/Sum512       488MB/s ± 0%   515MB/s ± 0%  +5.60%

Change-Id: Ic604b482e3f6a9680b89c71399f85442f06fef3f
Reviewed-on: https://go-review.googlesource.com/c/go/+/460459
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
Run-TryBot: Carlos Eduardo Seo <carlos.seo@linaro.org>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <carlos.seo@linaro.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
Author:    Paul E. Murphy (2023-01-04 12:06:34 -06:00)
Committer: Carlos Eduardo Seo
Parent:    932d0ae83e
Commit:    504f0d0419


@@ -58,16 +58,31 @@
#define INP R4
#define END R5
#define TBL R6
#define IDX R7
#define CNT R8
#define LEN R9
#define OFFLOAD R11
#define TEMP R12
#define HEX00 R0
#define HEX10 R10
#define HEX20 R25
#define HEX30 R26
#define TBL_STRT R7 // Pointer to start of kcon table.
#define R_x000 R0
#define R_x010 R10
#define R_x020 R25
#define R_x030 R26
#define R_x040 R14
#define R_x050 R15
#define R_x060 R16
#define R_x070 R17
#define R_x080 R18
#define R_x090 R19
#define R_x0a0 R20
#define R_x0b0 R21
#define R_x0c0 R22
#define R_x0d0 R23
#define R_x0e0 R24
#define R_x0f0 R28
#define R_x100 R29
#define R_x110 R27
// V0-V7 are A-H
// V8-V23 are used for the message schedule
@@ -254,7 +269,7 @@ DATA ·kcon+0x510(SB)/8, $0x1011121314151617
DATA ·kcon+0x518(SB)/8, $0x0001020304050607
GLOBL ·kcon(SB), RODATA, $1312
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
VSEL g, f, e, FUNC; \
VSHASIGMAD $15, e, $1, S1; \
VADDUDM xi, h, h; \
@@ -266,11 +281,10 @@ GLOBL ·kcon(SB), RODATA, $1312
VADDUDM KI, g, g; \
VADDUDM h, d, d; \
VADDUDM FUNC, S0, S0; \
LVX (TBL)(IDX), KI; \
ADD $16, IDX; \
LVX (TBL)(idx), KI; \
VADDUDM S0, h, h
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
VSHASIGMAD $0, xj_1, $0, s0; \
VSEL g, f, e, FUNC; \
VSHASIGMAD $15, e, $1, S1; \
@@ -286,8 +300,7 @@ GLOBL ·kcon(SB), RODATA, $1312
VADDUDM h, d, d; \
VADDUDM FUNC, S0, S0; \
VADDUDM s0, xj, xj; \
LVX (TBL)(IDX), KI; \
ADD $16, IDX; \
LVX (TBL)(idx), KI; \
VADDUDM S0, h, h; \
VADDUDM s1, xj, xj
@@ -305,36 +318,50 @@ TEXT ·block(SB),0,$0-32
CMP INP, END
BEQ end
MOVD $·kcon(SB), TBL
MOVD R1, OFFLOAD
MOVD $·kcon(SB), TBL_STRT
MOVD R0, CNT
MOVWZ $0x10, HEX10
MOVWZ $0x20, HEX20
MOVWZ $0x30, HEX30
MOVWZ $0x010, R_x010
MOVWZ $0x020, R_x020
MOVWZ $0x030, R_x030
MOVD $0x040, R_x040
MOVD $0x050, R_x050
MOVD $0x060, R_x060
MOVD $0x070, R_x070
MOVD $0x080, R_x080
MOVD $0x090, R_x090
MOVD $0x0a0, R_x0a0
MOVD $0x0b0, R_x0b0
MOVD $0x0c0, R_x0c0
MOVD $0x0d0, R_x0d0
MOVD $0x0e0, R_x0e0
MOVD $0x0f0, R_x0f0
MOVD $0x100, R_x100
MOVD $0x110, R_x110
// Generate the mask used with VPERM for LE
#ifdef GOARCH_ppc64le
MOVWZ $8, IDX
LVSL (IDX)(R0), LEMASK
// Generate the mask used with VPERM for LE
MOVWZ $8, TEMP
LVSL (TEMP)(R0), LEMASK
VSPLTISB $0x0F, KI
VXOR KI, LEMASK, LEMASK
#endif
LXVD2X (CTX)(HEX00), VS32 // v0 = vs32
LXVD2X (CTX)(HEX10), VS34 // v2 = vs34
LXVD2X (CTX)(HEX20), VS36 // v4 = vs36
LXVD2X (CTX)(R_x000), VS32 // v0 = vs32
LXVD2X (CTX)(R_x010), VS34 // v2 = vs34
LXVD2X (CTX)(R_x020), VS36 // v4 = vs36
// unpack the input values into vector registers
VSLDOI $8, V0, V0, V1
LXVD2X (CTX)(HEX30), VS38 // v6 = vs38
LXVD2X (CTX)(R_x030), VS38 // v6 = vs38
VSLDOI $8, V2, V2, V3
VSLDOI $8, V4, V4, V5
VSLDOI $8, V6, V6, V7
loop:
LVX (TBL)(HEX00), KI
MOVWZ $16, IDX
MOVD TBL_STRT, TBL
LVX (TBL)(R_x000), KI
LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
ADD $16, INP
@@ -351,78 +378,73 @@ loop:
XXLOR V7, V7, VS31
VADDUDM KI, V7, V7 // h+K[i]
LVX (TBL)(IDX), KI
ADD $16, IDX
LVX (TBL)(R_x010), KI
VPERMLE(V8,V8,LEMASK,V8)
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
LXVD2X (INP)(R0), VS42 // load v10 (=vs42) in advance
ADD $16, INP, INP
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
LXVD2X (INP)(R_x000), VS42 // load v10 (=vs42) in advance
VSLDOI $8, V8, V8, V9
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
VPERMLE(V10,V10,LEMASK,V10)
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
ADD $16, INP, INP
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
LXVD2X (INP)(R_x010), VS44 // load v12 (=vs44) in advance
VSLDOI $8, V10, V10, V11
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
VPERMLE(V12,V12,LEMASK,V12)
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
LXVD2X (INP)(R0), VS46 // load v14 (=vs46) in advance
ADD $16, INP, INP
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
LXVD2X (INP)(R_x020), VS46 // load v14 (=vs46) in advance
VSLDOI $8, V12, V12, V13
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
VPERMLE(V14,V14,LEMASK,V14)
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
ADD $16, INP, INP
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
LXVD2X (INP)(R_x030), VS48 // load v16 (=vs48) in advance
VSLDOI $8, V14, V14, V15
SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
VPERMLE(V16,V16,LEMASK,V16)
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
LXVD2X (INP)(R0), VS50 // load v18 (=vs50) in advance
ADD $16, INP, INP
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
LXVD2X (INP)(R_x040), VS50 // load v18 (=vs50) in advance
VSLDOI $8, V16, V16, V17
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
VPERMLE(V18,V18,LEMASK,V18)
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
ADD $16, INP, INP
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
LXVD2X (INP)(R_x050), VS52 // load v20 (=vs52) in advance
VSLDOI $8, V18, V18, V19
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
VPERMLE(V20,V20,LEMASK,V20)
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
LXVD2X (INP)(R0), VS54 // load v22 (=vs54) in advance
ADD $16, INP, INP
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
LXVD2X (INP)(R_x060), VS54 // load v22 (=vs54) in advance
VSLDOI $8, V20, V20, V21
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
VPERMLE(V22,V22,LEMASK,V22)
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
VSLDOI $8, V22, V22, V23
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
MOVWZ $4, TEMP
MOVWZ TEMP, CTR
ADD $0x120, TBL
ADD $0x70, INP
L16_xx:
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
ADD $0x100, TBL
BC 0x10, 0, L16_xx // bdnz
BDNZ L16_xx
XXLOR VS24, VS24, V10
XXLOR VS25, VS25, V11
@@ -455,10 +477,10 @@ L16_xx:
VPERM V5, V4, KI, V4
VPERM V7, V6, KI, V6
#endif
STXVD2X VS32, (CTX+HEX00) // v0 = vs32
STXVD2X VS34, (CTX+HEX10) // v2 = vs34
STXVD2X VS36, (CTX+HEX20) // v4 = vs36
STXVD2X VS38, (CTX+HEX30) // v6 = vs38
STXVD2X VS32, (CTX+R_x000) // v0 = vs32
STXVD2X VS34, (CTX+R_x010) // v2 = vs34
STXVD2X VS36, (CTX+R_x020) // v4 = vs36
STXVD2X VS38, (CTX+R_x030) // v6 = vs38
end:
RET