crypto/sha256: add sha-ni implementation

goos: linux goarch: amd64 pkg: crypto/sha256 cpu: 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Hash8Bytes/New-4 169.20n ± 7% 65.40n ± 5% -61.35% (p=0.000 n=10) Hash8Bytes/Sum224-4 166.10n ± 3% 65.20n ± 8% -60.74% (p=0.000 n=10) Hash8Bytes/Sum256-4 168.50n ± 6% 63.58n ± 7% -62.27% (p=0.000 n=10) Hash1K/New-4 2275.5n ± 5% 618.5n ± 2% -72.82% (p=0.000 n=10) Hash1K/Sum224-4 2364.5n ± 1% 618.1n ± 1% -73.86% (p=0.000 n=10) Hash1K/Sum256-4 2338.5n ± 2% 613.0n ± 2% -73.79% (p=0.000 n=10) Hash8K/New-4 17.530µ ± 2% 4.501µ ± 1% -74.33% (p=0.000 n=10) Hash8K/Sum224-4 17.456µ ± 2% 4.505µ ± 1% -74.19% (p=0.000 n=10) Hash8K/Sum256-4 17.417µ ± 2% 4.504µ ± 1% -74.14% (p=0.000 n=10) geomean 1.897µ 564.3n -70.25% │ bench.old │ bench.new │ │ B/s │ B/s vs base │ Hash8Bytes/New-4 45.11Mi ± 6% 116.66Mi ± 5% +158.62% (p=0.000 n=10) Hash8Bytes/Sum224-4 45.92Mi ± 3% 117.04Mi ± 8% +154.89% (p=0.000 n=10) Hash8Bytes/Sum256-4 45.29Mi ± 6% 120.00Mi ± 7% +164.99% (p=0.000 n=10) Hash1K/New-4 429.2Mi ± 5% 1578.9Mi ± 2% +267.92% (p=0.000 n=10) Hash1K/Sum224-4 413.0Mi ± 1% 1579.8Mi ± 1% +282.49% (p=0.000 n=10) Hash1K/Sum256-4 417.6Mi ± 1% 1593.1Mi ± 2% +281.53% (p=0.000 n=10) Hash8K/New-4 445.7Mi ± 1% 1735.9Mi ± 1% +289.50% (p=0.000 n=10) Hash8K/Sum224-4 447.6Mi ± 2% 1734.5Mi ± 1% +287.54% (p=0.000 n=10) Hash8K/Sum256-4 448.6Mi ± 2% 1734.8Mi ± 1% +286.75% (p=0.000 n=10) geomean 204.3Mi 686.8Mi +236.11% │ bench.old │ bench.new │ │ B/op │ B/op vs base │ Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8Bytes/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8Bytes/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ geomean ² +0.00% ² ¹ all samples are equal ² summaries must be >0 to compute geomean │ bench.old │ bench.new │ │ allocs/op │ allocs/op vs base │ Hash8Bytes/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8Bytes/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8Bytes/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash1K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/New-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/Sum224-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ Hash8K/Sum256-4 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ geomean ² +0.00% ² ¹ all samples are equal ² summaries must be >0 to compute geomean Fixes #50543. Change-Id: Ie9783647fe82f40fcbd91989a96a24f2d3d5b9a0 Reviewed-on: https://go-review.googlesource.com/c/go/+/408795 Reviewed-by: Paulo Gomes <paulo.gomes.uk@gmail.com> TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com> Auto-Submit: Russ Cox <rsc@golang.org> Reviewed-by: Russ Cox <rsc@golang.org>
2024-11-16 21:14:44 -07:00 · 2022-05-25 20:43:31 -04:00 · 2022-05-25 20:43:31 -04:00 · 1a64574f42
commit 1a64574f42
parent e29dd78ffe
2 changed files with 152 additions and 9 deletions
--- a/src/crypto/sha256/sha256block_amd64.go
+++ b/src/crypto/sha256/sha256block_amd64.go
@ -7,3 +7,4 @@ package sha256
 import "internal/cpu"

 var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
+var useSHA = useAVX2 && cpu.X86.HasSHA
--- a/src/crypto/sha256/sha256block_amd64.s
+++ b/src/crypto/sha256/sha256block_amd64.s
@ -179,7 +179,7 @@

 #define XFER  Y9

-#define BYTE_FLIP_MASK 	Y13 // mask to convert LE -> BE
+#define BYTE_FLIP_MASK	Y13 // mask to convert LE -> BE
 #define X_BYTE_FLIP_MASK X13

 #define NUM_BYTES DX
@ -232,14 +232,14 @@
 	RORXL    $13, a, T1;                  \ // T1 = a >> 13			// S0B
 	;                                     \
 	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11)					// S1
-	XORL     g, y2;                       \ // y2 = f^g                              	// CH
+	XORL     g, y2;                       \ // y2 = f^g	// CH
 	VPADDD   XDWORD0, XTMP0, XTMP0;       \ // XTMP0 = W[-7] + W[-16]	// y1 = (e >> 6)	// S1
 	RORXL    $6, e, y1;                   \ // y1 = (e >> 6)						// S1
 	;                                     \
 	ANDL     e, y2;                       \ // y2 = (f^g)&e                         // CH
 	XORL     y1, y0;                      \ // y0 = (e>>25) ^ (e>>11) ^ (e>>6)		// S1
 	RORXL    $22, a, y1;                  \ // y1 = a >> 22							// S0A
-	ADDL     h, d;                        \ // d = k + w + h + d                     	// --
+	ADDL     h, d;                        \ // d = k + w + h + d	// --
 	;                                     \
 	ANDL     b, y3;                       \ // y3 = (a|c)&b							// MAJA
 	VPALIGNR $4, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-15]
@ -270,7 +270,7 @@
 	MOVL    a, y3;                       \ // y3 = a                       // MAJA
 	RORXL   $25, e, y0;                  \ // y0 = e >> 25					// S1A
 	RORXL   $11, e, y1;                  \ // y1 = e >> 11					// S1B
-	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h         		// --
+	ADDL    (disp + 1*4)(SP)(SRND*1), h; \ // h = k + w + h		// --
 	ORL     c, y3;                       \ // y3 = a|c						// MAJA
 	;                                    \
 	VPSRLD  $3, XTMP1, XTMP4;            \ // XTMP4 = W[-15] >> 3
@ -316,7 +316,7 @@
 	;                                    \
 	MOVL    a, y3;                       \ // y3 = a							// MAJA
 	RORXL   $25, e, y0;                  \ // y0 = e >> 25						// S1A
-	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h        			// --
+	ADDL    (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h			// --
 	;                                    \
 	VPSRLQ  $19, XTMP2, XTMP3;           \ // XTMP3 = W[-2] ror 19 {xBxA}
 	RORXL   $11, e, y1;                  \ // y1 = e >> 11						// S1B
@ -495,7 +495,7 @@
 	;                                  \
 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
-	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
+	ADDL  (disp + 2*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
 	ORL   c, y3;                       \ // y3 = a|c								// MAJA
 	;                                  \
 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
@ -531,7 +531,7 @@
 	;                                  \
 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13)				// S0
 	RORXL $2, a, T1;                   \ // T1 = (a >> 2)						// S0
-	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h 	// --
+	ADDL  (disp + 3*4)(SP)(SRND*1), h; \ // h = k + w + h	// --
 	ORL   c, y3;                       \ // y3 = a|c								// MAJA
 	;                                  \
 	XORL  T1, y1;                      \ // y1 = (a>>22) ^ (a>>13) ^ (a>>2)		// S0
@ -550,9 +550,80 @@
 	;                                  \
 	ADDL  y3, h                        // h = t1 + S0 + MAJ					// --

+// Definitions for sha-ni version
+//
+// The sha-ni implementation uses Intel(R) SHA extensions SHA256RNDS2, SHA256MSG1, SHA256MSG2
+// It also reuses portions of the flip_mask (half) and K256 table (stride 32) from the avx2 version
+//
+// Reference
+// S. Gulley, et al, "New Instructions Supporting the Secure Hash
+// Algorithm on Intel® Architecture Processors", July 2013
+// https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+//
+
+#define digestPtr	DI	// input/output, base pointer to digest hash vector H0, H1, ..., H7
+#define dataPtr		SI	// input, base pointer to first input data block
+#define numBytes	DX	// input, number of input bytes to be processed
+#define sha256Constants	AX	// round contants from K256 table, indexed by round number x 32
+#define msg		X0	// input data
+#define state0		X1	// round intermediates and outputs
+#define state1		X2
+#define m0		X3	// m0, m1,... m4 -- round message temps
+#define m1		X4
+#define m2		X5
+#define m3		X6
+#define m4		X7
+#define shufMask	X8	// input data endian conversion control mask
+#define abefSave	X9	// digest hash vector inter-block buffer abef
+#define cdghSave	X10	// digest hash vector inter-block buffer cdgh
+
+#define nop(m,a)		// nop instead of final SHA256MSG1 for first and last few rounds
+
+#define sha256msg1(m,a) \	// final SHA256MSG1 for middle rounds that require it
+	SHA256MSG1		m, a
+
+#define vmov(a,b) \		// msg copy for all but rounds 12-15
+	VMOVDQA		a, b
+
+#define vmovrev(a,b) \		// reverse copy for rounds 12-15
+	VMOVDQA		b, a
+
+// sha rounds 0 to 11
+// identical with the exception of the final msg op
+// which is replaced with a nop for rounds where it is not needed
+// refer to Gulley, et al for more information
+#define rounds0to11(m,a,c,sha256Msg1)				\
+	VMOVDQU			c*16(dataPtr), msg		\
+	PSHUFB			shufMask, msg			\
+	VMOVDQA			msg, m				\
+	PADDD			(c*32)(sha256Constants), msg	\
+	SHA256RNDS2		msg, state0, state1		\
+	PSHUFD			$0x0e, msg, msg			\
+	SHA256RNDS2		msg, state1, state0		\
+	sha256Msg1		(m,a)
+
+// sha rounds 12 to 59
+// identical with the exception of the final msg op
+// and the reverse copy(m,msg) in round 12 which is required
+// after the last data load
+// refer to Gulley, et al for more information
+#define rounds12to59(m,c,a,t,sha256Msg1,movop)			\
+	movop			(m,msg)				\
+	PADDD			(c*32)(sha256Constants), msg	\
+	SHA256RNDS2		msg, state0, state1		\
+	VMOVDQA			m, m4				\
+	PALIGNR			$4, a, m4			\
+	PADDD			m4, t				\
+	SHA256MSG2		m, t				\
+	PSHUFD			$0x0e, msg, msg			\
+	SHA256RNDS2		msg, state1, state0		\
+	sha256Msg1		(m,a)
+
 TEXT ·block(SB), 0, $536-32
-	CMPB ·useAVX2(SB), $1
-	JE   avx2
+	CMPB	·useSHA(SB), $1
+	JE	sha_ni
+	CMPB	·useAVX2(SB), $1
+	JE	avx2

 	MOVQ p_base+8(FP), SI
 	MOVQ p_len+16(FP), DX
@ -862,6 +933,77 @@ done_hash:
 	VZEROUPPER
 	RET

+sha_ni:
+	MOVQ		dig+0(FP), digestPtr		// init digest hash vector H0, H1,..., H7 pointer
+	MOVQ		p_base+8(FP), dataPtr		// init input data base pointer
+	MOVQ		p_len+16(FP), numBytes		// get number of input bytes to hash
+	SHRQ		$6, numBytes			// force modulo 64 input buffer length
+	SHLQ		$6, numBytes
+	CMPQ		numBytes, $0			// exit early for zero-length input buffer
+	JEQ		done
+	ADDQ		dataPtr, numBytes		// point numBytes to end of input buffer
+	VMOVDQU		(0*16)(digestPtr), state0	// load initial hash values and reorder
+	VMOVDQU		(1*16)(digestPtr), state1	// DCBA, HGFE -> ABEF, CDGH
+	PSHUFD		$0xb1, state0, state0		// CDAB
+	PSHUFD		$0x1b, state1, state1		// EFGH
+	VMOVDQA		state0, m4
+	PALIGNR		$8, state1, state0		// ABEF
+	PBLENDW		$0xf0, m4, state1		// CDGH
+	VMOVDQA		flip_mask<>(SB), shufMask
+	LEAQ		K256<>(SB), sha256Constants
+
+roundLoop:
+	// save hash values for addition after rounds
+	VMOVDQA		state0, abefSave
+	VMOVDQA		state1, cdghSave
+
+	// do rounds 0-59
+	rounds0to11	(m0,-,0,nop)			// 0-3
+	rounds0to11	(m1,m0,1,sha256msg1)		// 4-7
+	rounds0to11	(m2,m1,2,sha256msg1)		// 8-11
+	VMOVDQU		(3*16)(dataPtr), msg
+	PSHUFB		shufMask, msg
+	rounds12to59	(m3,3,m2,m0,sha256msg1,vmovrev)	// 12-15
+	rounds12to59	(m0,4,m3,m1,sha256msg1,vmov)    // 16-19
+	rounds12to59	(m1,5,m0,m2,sha256msg1,vmov)    // 20-23
+	rounds12to59	(m2,6,m1,m3,sha256msg1,vmov)    // 24-27
+	rounds12to59	(m3,7,m2,m0,sha256msg1,vmov)    // 28-31
+	rounds12to59	(m0,8,m3,m1,sha256msg1,vmov)    // 32-35
+	rounds12to59	(m1,9,m0,m2,sha256msg1,vmov)    // 36-39
+	rounds12to59	(m2,10,m1,m3,sha256msg1,vmov)   // 40-43
+	rounds12to59	(m3,11,m2,m0,sha256msg1,vmov)   // 44-47
+	rounds12to59	(m0,12,m3,m1,sha256msg1,vmov)   // 48-51
+	rounds12to59	(m1,13,m0,m2,nop,vmov)          // 52-55
+	rounds12to59	(m2,14,m1,m3,nop,vmov)		// 56-59
+
+	// do rounds 60-63
+	VMOVDQA		m3, msg
+	PADDD		(15*32)(sha256Constants), msg
+	SHA256RNDS2	msg, state0, state1
+	PSHUFD		$0x0e, msg, msg
+	SHA256RNDS2	msg, state1, state0
+
+	// add current hash values with previously saved
+	PADDD		abefSave, state0
+	PADDD		cdghSave, state1
+
+	// advance data pointer; loop until buffer empty
+	ADDQ		$64, dataPtr
+	CMPQ		numBytes, dataPtr
+	JNE		roundLoop
+
+	// write hash values back in the correct order
+	PSHUFD		$0x1b, state0, state0		// FEBA
+	PSHUFD		$0xb1, state1, state1		// DCHG
+	VMOVDQA		state0, m4
+	PBLENDW		$0xf0, state1, state0		// DCBA
+	PALIGNR		$8, m4, state1			// HGFE
+	VMOVDQU		state0, (0*16)(digestPtr)
+	VMOVDQU		state1, (1*16)(digestPtr)
+
+done:
+	RET
+
 // shuffle byte order from LE to BE
 DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
 DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b