From 504f0d04191943dbdbf6b8f3022585f75ca0bc83 Mon Sep 17 00:00:00 2001
From: "Paul E. Murphy"
Date: Wed, 4 Jan 2023 12:06:34 -0600
Subject: [PATCH] crypto/sha512: reduce add usage on PPC64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similar to sha256, minimize add usage by preloading the constant-table
offsets into free registers. This results in a small performance uplift.

Likewise, clean up some unused macros and registers to make room for
the constants.

On ppc64le/power9:

name               old speed      new speed      delta
Hash8Bytes/New     22.7MB/s ± 0%  24.1MB/s ± 0%  +6.49%
Hash8Bytes/Sum384  23.4MB/s ± 0%  24.9MB/s ± 0%  +6.32%
Hash8Bytes/Sum512  23.5MB/s ± 0%  24.9MB/s ± 0%  +6.18%
Hash1K/New          422MB/s ± 0%   455MB/s ± 0%  +7.92%
Hash1K/Sum384       424MB/s ± 0%   457MB/s ± 0%  +7.78%
Hash1K/Sum512       424MB/s ± 0%   457MB/s ± 0%  +7.77%
Hash8K/New          488MB/s ± 0%   528MB/s ± 0%  +8.18%
Hash8K/Sum384       481MB/s ± 0%   528MB/s ± 0%  +9.76%
Hash8K/Sum512       488MB/s ± 0%   515MB/s ± 0%  +5.60%

Change-Id: Ic604b482e3f6a9680b89c71399f85442f06fef3f
Reviewed-on: https://go-review.googlesource.com/c/go/+/460459
Reviewed-by: Archana Ravindar
Run-TryBot: Carlos Eduardo Seo
Reviewed-by: Dmitri Shuralyov
TryBot-Result: Gopher Robot
Reviewed-by: Carlos Eduardo Seo
Reviewed-by: Lynn Boger
Reviewed-by: Bryan Mills
---
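The change below replaces the single running table index (IDX, bumped
with a trailing ADD after every constant load) with a set of offsets
preloaded into free registers once per call. A minimal sketch of the
per-round pattern, using only names that appear in the diff:

	// Before: one index register, incremented after every load,
	// costing an extra integer ADD in every round.
	LVX	(TBL)(IDX), KI	// KI = next K[i] constant
	ADD	$16, IDX

	// After: the offsets 0x000-0x110 are materialized once before
	// the rounds (MOVD $0x040, R_x040, ...), so each round needs
	// only a single indexed load.
	LVX	(TBL)(R_x040), KI	// KI = next K[i] constant

The adds are not removed entirely: TBL itself still advances once per
16-round group (ADD $0x100, TBL) instead of once per round.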
 src/crypto/sha512/sha512block_ppc64x.s | 180 ++++++++++++++-----------
 1 file changed, 101 insertions(+), 79 deletions(-)

diff --git a/src/crypto/sha512/sha512block_ppc64x.s b/src/crypto/sha512/sha512block_ppc64x.s
index 968183dde9..90dbf0f02b 100644
--- a/src/crypto/sha512/sha512block_ppc64x.s
+++ b/src/crypto/sha512/sha512block_ppc64x.s
@@ -58,16 +58,31 @@
 #define INP R4
 #define END R5
 #define TBL R6
-#define IDX R7
 #define CNT R8
 #define LEN R9
-#define OFFLOAD R11
 #define TEMP R12
 
-#define HEX00 R0
-#define HEX10 R10
-#define HEX20 R25
-#define HEX30 R26
+#define TBL_STRT R7 // Pointer to start of kcon table.
+
+#define R_x000 R0
+#define R_x010 R10
+#define R_x020 R25
+#define R_x030 R26
+#define R_x040 R14
+#define R_x050 R15
+#define R_x060 R16
+#define R_x070 R17
+#define R_x080 R18
+#define R_x090 R19
+#define R_x0a0 R20
+#define R_x0b0 R21
+#define R_x0c0 R22
+#define R_x0d0 R23
+#define R_x0e0 R24
+#define R_x0f0 R28
+#define R_x100 R29
+#define R_x110 R27
 
 // V0-V7 are A-H
 // V8-V23 are used for the message schedule
@@ -254,7 +269,7 @@ DATA ·kcon+0x510(SB)/8, $0x1011121314151617
 DATA ·kcon+0x518(SB)/8, $0x0001020304050607
 GLOBL ·kcon(SB), RODATA, $1312
 
-#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
+#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAD $15, e, $1, S1; \
 	VADDUDM xi, h, h; \
@@ -266,11 +281,10 @@ GLOBL ·kcon(SB), RODATA, $1312
 	VADDUDM KI, g, g; \
 	VADDUDM h, d, d; \
 	VADDUDM FUNC, S0, S0; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUDM S0, h, h
 
-#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
+#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
 	VSHASIGMAD $0, xj_1, $0, s0; \
 	VSEL g, f, e, FUNC; \
 	VSHASIGMAD $15, e, $1, S1; \
@@ -286,8 +300,7 @@ GLOBL ·kcon(SB), RODATA, $1312
 	VADDUDM h, d, d; \
 	VADDUDM FUNC, S0, S0; \
 	VADDUDM s0, xj, xj; \
-	LVX (TBL)(IDX), KI; \
-	ADD $16, IDX; \
+	LVX (TBL)(idx), KI; \
 	VADDUDM S0, h, h; \
 	VADDUDM s1, xj, xj
 
@@ -305,36 +318,50 @@ TEXT ·block(SB),0,$0-32
 	CMP INP, END
 	BEQ end
 
-	MOVD $·kcon(SB), TBL
-	MOVD R1, OFFLOAD
+	MOVD $·kcon(SB), TBL_STRT
 	MOVD R0, CNT
-	MOVWZ $0x10, HEX10
-	MOVWZ $0x20, HEX20
-	MOVWZ $0x30, HEX30
+	MOVWZ $0x010, R_x010
+	MOVWZ $0x020, R_x020
+	MOVWZ $0x030, R_x030
+	MOVD $0x040, R_x040
+	MOVD $0x050, R_x050
+	MOVD $0x060, R_x060
+	MOVD $0x070, R_x070
+	MOVD $0x080, R_x080
+	MOVD $0x090, R_x090
+	MOVD $0x0a0, R_x0a0
+	MOVD $0x0b0, R_x0b0
+	MOVD $0x0c0, R_x0c0
+	MOVD $0x0d0, R_x0d0
+	MOVD $0x0e0, R_x0e0
+	MOVD $0x0f0, R_x0f0
+	MOVD $0x100, R_x100
+	MOVD $0x110, R_x110
 
-// Generate the mask used with VPERM for LE
 #ifdef GOARCH_ppc64le
-	MOVWZ $8, IDX
-	LVSL (IDX)(R0), LEMASK
+	// Generate the mask used with VPERM for LE
+	MOVWZ $8, TEMP
+	LVSL (TEMP)(R0), LEMASK
 	VSPLTISB $0x0F, KI
 	VXOR KI, LEMASK, LEMASK
 #endif
 
-	LXVD2X (CTX)(HEX00), VS32 // v0 = vs32
-	LXVD2X (CTX)(HEX10), VS34 // v2 = vs34
-	LXVD2X (CTX)(HEX20), VS36 // v4 = vs36
+	LXVD2X (CTX)(R_x000), VS32 // v0 = vs32
+	LXVD2X (CTX)(R_x010), VS34 // v2 = vs34
+	LXVD2X (CTX)(R_x020), VS36 // v4 = vs36
 
+	// unpack the input values into vector registers
 	VSLDOI $8, V0, V0, V1
-	LXVD2X (CTX)(HEX30), VS38 // v6 = vs38
+	LXVD2X (CTX)(R_x030), VS38 // v6 = vs38
 	VSLDOI $8, V2, V2, V3
 	VSLDOI $8, V4, V4, V5
 	VSLDOI $8, V6, V6, V7
 
 loop:
-	LVX (TBL)(HEX00), KI
-	MOVWZ $16, IDX
+	MOVD TBL_STRT, TBL
+	LVX (TBL)(R_x000), KI
 
 	LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
 	ADD $16, INP
@@ -351,78 +378,73 @@ loop:
 	XXLOR V7, V7, VS31
 
 	VADDUDM KI, V7, V7 // h+K[i]
-	LVX (TBL)(IDX), KI
-	ADD $16, IDX
+	LVX (TBL)(R_x010), KI
 
 	VPERMLE(V8,V8,LEMASK,V8)
-	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
-	LXVD2X (INP)(R0), VS42 // load v10 (=vs42) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
+	LXVD2X (INP)(R_x000), VS42 // load v10 (=vs42) in advance
 	VSLDOI $8, V8, V8, V9
-	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
+	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
 	VPERMLE(V10,V10,LEMASK,V10)
-	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
-	LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
+	LXVD2X (INP)(R_x010), VS44 // load v12 (=vs44) in advance
 	VSLDOI $8, V10, V10, V11
-	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
+	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
 	VPERMLE(V12,V12,LEMASK,V12)
-	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
-	LXVD2X (INP)(R0), VS46 // load v14 (=vs46) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
+	LXVD2X (INP)(R_x020), VS46 // load v14 (=vs46) in advance
 	VSLDOI $8, V12, V12, V13
-	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
+	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
 	VPERMLE(V14,V14,LEMASK,V14)
-	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
-	LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
+	LXVD2X (INP)(R_x030), VS48 // load v16 (=vs48) in advance
 	VSLDOI $8, V14, V14, V15
-	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
+	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
 	VPERMLE(V16,V16,LEMASK,V16)
-	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
-	LXVD2X (INP)(R0), VS50 // load v18 (=vs50) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
+	LXVD2X (INP)(R_x040), VS50 // load v18 (=vs50) in advance
 	VSLDOI $8, V16, V16, V17
-	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
+	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
 	VPERMLE(V18,V18,LEMASK,V18)
-	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
-	LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
+	LXVD2X (INP)(R_x050), VS52 // load v20 (=vs52) in advance
 	VSLDOI $8, V18, V18, V19
-	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
+	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
 	VPERMLE(V20,V20,LEMASK,V20)
-	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
-	LXVD2X (INP)(R0), VS54 // load v22 (=vs54) in advance
-	ADD $16, INP, INP
+	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
+	LXVD2X (INP)(R_x060), VS54 // load v22 (=vs54) in advance
 	VSLDOI $8, V20, V20, V21
-	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
+	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
 	VPERMLE(V22,V22,LEMASK,V22)
-	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
+	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
 	VSLDOI $8, V22, V22, V23
-	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
 
 	MOVWZ $4, TEMP
 	MOVWZ TEMP, CTR
+	ADD $0x120, TBL
+	ADD $0x70, INP
 
 L16_xx:
-	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
-	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
-	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
-	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
-	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
-	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
-	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
-	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
-	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
-	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
-	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
-	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
-	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
-	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
-	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
-	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
+	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
+	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
+	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
+	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
+	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
+	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
+	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
+	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
+	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
+	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
+	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
+	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
+	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
+	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
+	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
+	ADD $0x100, TBL
 
-	BC 0x10, 0, L16_xx // bdnz
+	BDNZ L16_xx
 
 	XXLOR VS24, VS24, V10
 	XXLOR VS25, VS25, V11
@@ -455,10 +477,10 @@ L16_xx:
 	VPERM V5, V4, KI, V4
 	VPERM V7, V6, KI, V6
 #endif
-	STXVD2X VS32, (CTX+HEX00) // v0 = vs32
-	STXVD2X VS34, (CTX+HEX10) // v2 = vs34
-	STXVD2X VS36, (CTX+HEX20) // v4 = vs36
-	STXVD2X VS38, (CTX+HEX30) // v6 = vs38
+	STXVD2X VS32, (CTX+R_x000) // v0 = vs32
+	STXVD2X VS34, (CTX+R_x010) // v2 = vs34
+	STXVD2X VS36, (CTX+R_x020) // v4 = vs36
+	STXVD2X VS38, (CTX+R_x030) // v6 = vs38
 
 end:
 	RET