mirror of
https://github.com/golang/go
synced 2024-11-17 16:54:44 -07:00
crypto/sha512: reduce add usage on PPC64
Similar to sha256, minimize add usage by preloading constants. This results in a small performance uplift. Likewise, cleanup some unused macros and registers to make room for constants. On ppc64le/power9: Hash8Bytes/New 22.7MB/s ± 0% 24.1MB/s ± 0% +6.49% Hash8Bytes/Sum384 23.4MB/s ± 0% 24.9MB/s ± 0% +6.32% Hash8Bytes/Sum512 23.5MB/s ± 0% 24.9MB/s ± 0% +6.18% Hash1K/New 422MB/s ± 0% 455MB/s ± 0% +7.92% Hash1K/Sum384 424MB/s ± 0% 457MB/s ± 0% +7.78% Hash1K/Sum512 424MB/s ± 0% 457MB/s ± 0% +7.77% Hash8K/New 488MB/s ± 0% 528MB/s ± 0% +8.18% Hash8K/Sum384 481MB/s ± 0% 528MB/s ± 0% +9.76% Hash8K/Sum512 488MB/s ± 0% 515MB/s ± 0% +5.60% Change-Id: Ic604b482e3f6a9680b89c71399f85442f06fef3f Reviewed-on: https://go-review.googlesource.com/c/go/+/460459 Reviewed-by: Archana Ravindar <aravind5@in.ibm.com> Run-TryBot: Carlos Eduardo Seo <carlos.seo@linaro.org> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <carlos.seo@linaro.org> Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Reviewed-by: Bryan Mills <bcmills@google.com>
This commit is contained in:
parent
932d0ae83e
commit
504f0d0419
@ -58,16 +58,31 @@
|
||||
#define INP R4
|
||||
#define END R5
|
||||
#define TBL R6
|
||||
#define IDX R7
|
||||
#define CNT R8
|
||||
#define LEN R9
|
||||
#define OFFLOAD R11
|
||||
#define TEMP R12
|
||||
|
||||
#define HEX00 R0
|
||||
#define HEX10 R10
|
||||
#define HEX20 R25
|
||||
#define HEX30 R26
|
||||
#define TBL_STRT R7 // Pointer to start of kcon table.
|
||||
|
||||
#define R_x000 R0
|
||||
#define R_x010 R10
|
||||
#define R_x020 R25
|
||||
#define R_x030 R26
|
||||
#define R_x040 R14
|
||||
#define R_x050 R15
|
||||
#define R_x060 R16
|
||||
#define R_x070 R17
|
||||
#define R_x080 R18
|
||||
#define R_x090 R19
|
||||
#define R_x0a0 R20
|
||||
#define R_x0b0 R21
|
||||
#define R_x0c0 R22
|
||||
#define R_x0d0 R23
|
||||
#define R_x0e0 R24
|
||||
#define R_x0f0 R28
|
||||
#define R_x100 R29
|
||||
#define R_x110 R27
|
||||
|
||||
|
||||
// V0-V7 are A-H
|
||||
// V8-V23 are used for the message schedule
|
||||
@ -254,7 +269,7 @@ DATA ·kcon+0x510(SB)/8, $0x1011121314151617
|
||||
DATA ·kcon+0x518(SB)/8, $0x0001020304050607
|
||||
GLOBL ·kcon(SB), RODATA, $1312
|
||||
|
||||
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
|
||||
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
|
||||
VSEL g, f, e, FUNC; \
|
||||
VSHASIGMAD $15, e, $1, S1; \
|
||||
VADDUDM xi, h, h; \
|
||||
@ -266,11 +281,10 @@ GLOBL ·kcon(SB), RODATA, $1312
|
||||
VADDUDM KI, g, g; \
|
||||
VADDUDM h, d, d; \
|
||||
VADDUDM FUNC, S0, S0; \
|
||||
LVX (TBL)(IDX), KI; \
|
||||
ADD $16, IDX; \
|
||||
LVX (TBL)(idx), KI; \
|
||||
VADDUDM S0, h, h
|
||||
|
||||
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
|
||||
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
|
||||
VSHASIGMAD $0, xj_1, $0, s0; \
|
||||
VSEL g, f, e, FUNC; \
|
||||
VSHASIGMAD $15, e, $1, S1; \
|
||||
@ -286,8 +300,7 @@ GLOBL ·kcon(SB), RODATA, $1312
|
||||
VADDUDM h, d, d; \
|
||||
VADDUDM FUNC, S0, S0; \
|
||||
VADDUDM s0, xj, xj; \
|
||||
LVX (TBL)(IDX), KI; \
|
||||
ADD $16, IDX; \
|
||||
LVX (TBL)(idx), KI; \
|
||||
VADDUDM S0, h, h; \
|
||||
VADDUDM s1, xj, xj
|
||||
|
||||
@ -305,36 +318,50 @@ TEXT ·block(SB),0,$0-32
|
||||
CMP INP, END
|
||||
BEQ end
|
||||
|
||||
MOVD $·kcon(SB), TBL
|
||||
MOVD R1, OFFLOAD
|
||||
MOVD $·kcon(SB), TBL_STRT
|
||||
|
||||
MOVD R0, CNT
|
||||
MOVWZ $0x10, HEX10
|
||||
MOVWZ $0x20, HEX20
|
||||
MOVWZ $0x30, HEX30
|
||||
MOVWZ $0x010, R_x010
|
||||
MOVWZ $0x020, R_x020
|
||||
MOVWZ $0x030, R_x030
|
||||
MOVD $0x040, R_x040
|
||||
MOVD $0x050, R_x050
|
||||
MOVD $0x060, R_x060
|
||||
MOVD $0x070, R_x070
|
||||
MOVD $0x080, R_x080
|
||||
MOVD $0x090, R_x090
|
||||
MOVD $0x0a0, R_x0a0
|
||||
MOVD $0x0b0, R_x0b0
|
||||
MOVD $0x0c0, R_x0c0
|
||||
MOVD $0x0d0, R_x0d0
|
||||
MOVD $0x0e0, R_x0e0
|
||||
MOVD $0x0f0, R_x0f0
|
||||
MOVD $0x100, R_x100
|
||||
MOVD $0x110, R_x110
|
||||
|
||||
// Generate the mask used with VPERM for LE
|
||||
|
||||
#ifdef GOARCH_ppc64le
|
||||
MOVWZ $8, IDX
|
||||
LVSL (IDX)(R0), LEMASK
|
||||
// Generate the mask used with VPERM for LE
|
||||
MOVWZ $8, TEMP
|
||||
LVSL (TEMP)(R0), LEMASK
|
||||
VSPLTISB $0x0F, KI
|
||||
VXOR KI, LEMASK, LEMASK
|
||||
#endif
|
||||
|
||||
LXVD2X (CTX)(HEX00), VS32 // v0 = vs32
|
||||
LXVD2X (CTX)(HEX10), VS34 // v2 = vs34
|
||||
LXVD2X (CTX)(HEX20), VS36 // v4 = vs36
|
||||
LXVD2X (CTX)(R_x000), VS32 // v0 = vs32
|
||||
LXVD2X (CTX)(R_x010), VS34 // v2 = vs34
|
||||
LXVD2X (CTX)(R_x020), VS36 // v4 = vs36
|
||||
|
||||
// unpack the input values into vector registers
|
||||
VSLDOI $8, V0, V0, V1
|
||||
LXVD2X (CTX)(HEX30), VS38 // v6 = vs38
|
||||
LXVD2X (CTX)(R_x030), VS38 // v6 = vs38
|
||||
VSLDOI $8, V2, V2, V3
|
||||
VSLDOI $8, V4, V4, V5
|
||||
VSLDOI $8, V6, V6, V7
|
||||
|
||||
loop:
|
||||
LVX (TBL)(HEX00), KI
|
||||
MOVWZ $16, IDX
|
||||
MOVD TBL_STRT, TBL
|
||||
LVX (TBL)(R_x000), KI
|
||||
|
||||
LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
|
||||
ADD $16, INP
|
||||
@ -351,78 +378,73 @@ loop:
|
||||
XXLOR V7, V7, VS31
|
||||
|
||||
VADDUDM KI, V7, V7 // h+K[i]
|
||||
LVX (TBL)(IDX), KI
|
||||
ADD $16, IDX
|
||||
LVX (TBL)(R_x010), KI
|
||||
|
||||
VPERMLE(V8,V8,LEMASK,V8)
|
||||
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
|
||||
LXVD2X (INP)(R0), VS42 // load v10 (=vs42) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
|
||||
LXVD2X (INP)(R_x000), VS42 // load v10 (=vs42) in advance
|
||||
VSLDOI $8, V8, V8, V9
|
||||
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
|
||||
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
|
||||
VPERMLE(V10,V10,LEMASK,V10)
|
||||
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
|
||||
LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
|
||||
LXVD2X (INP)(R_x010), VS44 // load v12 (=vs44) in advance
|
||||
VSLDOI $8, V10, V10, V11
|
||||
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
|
||||
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
|
||||
VPERMLE(V12,V12,LEMASK,V12)
|
||||
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
|
||||
LXVD2X (INP)(R0), VS46 // load v14 (=vs46) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
|
||||
LXVD2X (INP)(R_x020), VS46 // load v14 (=vs46) in advance
|
||||
VSLDOI $8, V12, V12, V13
|
||||
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
|
||||
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
|
||||
VPERMLE(V14,V14,LEMASK,V14)
|
||||
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
|
||||
LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
|
||||
LXVD2X (INP)(R_x030), VS48 // load v16 (=vs48) in advance
|
||||
VSLDOI $8, V14, V14, V15
|
||||
SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
|
||||
SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
|
||||
VPERMLE(V16,V16,LEMASK,V16)
|
||||
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
|
||||
LXVD2X (INP)(R0), VS50 // load v18 (=vs50) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
|
||||
LXVD2X (INP)(R_x040), VS50 // load v18 (=vs50) in advance
|
||||
VSLDOI $8, V16, V16, V17
|
||||
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
|
||||
SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
|
||||
VPERMLE(V18,V18,LEMASK,V18)
|
||||
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
|
||||
LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
|
||||
LXVD2X (INP)(R_x050), VS52 // load v20 (=vs52) in advance
|
||||
VSLDOI $8, V18, V18, V19
|
||||
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
|
||||
SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
|
||||
VPERMLE(V20,V20,LEMASK,V20)
|
||||
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
|
||||
LXVD2X (INP)(R0), VS54 // load v22 (=vs54) in advance
|
||||
ADD $16, INP, INP
|
||||
SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
|
||||
LXVD2X (INP)(R_x060), VS54 // load v22 (=vs54) in advance
|
||||
VSLDOI $8, V20, V20, V21
|
||||
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
|
||||
SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
|
||||
VPERMLE(V22,V22,LEMASK,V22)
|
||||
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
|
||||
SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
|
||||
VSLDOI $8, V22, V22, V23
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
|
||||
|
||||
MOVWZ $4, TEMP
|
||||
MOVWZ TEMP, CTR
|
||||
ADD $0x120, TBL
|
||||
ADD $0x70, INP
|
||||
|
||||
L16_xx:
|
||||
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
|
||||
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
|
||||
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
|
||||
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
|
||||
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
|
||||
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
|
||||
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
|
||||
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
|
||||
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
|
||||
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
|
||||
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
|
||||
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
|
||||
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
|
||||
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
|
||||
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
|
||||
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
|
||||
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
|
||||
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
|
||||
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
|
||||
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
|
||||
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
|
||||
SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
|
||||
SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
|
||||
SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
|
||||
SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
|
||||
SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
|
||||
SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
|
||||
SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
|
||||
SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
|
||||
ADD $0x100, TBL
|
||||
|
||||
BC 0x10, 0, L16_xx // bdnz
|
||||
BDNZ L16_xx
|
||||
|
||||
XXLOR VS24, VS24, V10
|
||||
XXLOR VS25, VS25, V11
|
||||
@ -455,10 +477,10 @@ L16_xx:
|
||||
VPERM V5, V4, KI, V4
|
||||
VPERM V7, V6, KI, V6
|
||||
#endif
|
||||
STXVD2X VS32, (CTX+HEX00) // v0 = vs32
|
||||
STXVD2X VS34, (CTX+HEX10) // v2 = vs34
|
||||
STXVD2X VS36, (CTX+HEX20) // v4 = vs36
|
||||
STXVD2X VS38, (CTX+HEX30) // v6 = vs38
|
||||
STXVD2X VS32, (CTX+R_x000) // v0 = vs32
|
||||
STXVD2X VS34, (CTX+R_x010) // v2 = vs34
|
||||
STXVD2X VS36, (CTX+R_x020) // v4 = vs36
|
||||
STXVD2X VS38, (CTX+R_x030) // v6 = vs38
|
||||
|
||||
end:
|
||||
RET
|
||||
|
Loading…
Reference in New Issue
Block a user