1
0
mirror of https://github.com/golang/go synced 2024-11-17 21:54:49 -07:00

crypto/sha256: improve performance for sha256.block on ppc64le

Adds an assembly implementation of sha256.block for ppc64le to improve its
performance.  This implementation is largely based on the original amd64
implementation, which unrolls the 64 iterations of the inner loop.

Fixes #17652

benchmark               old ns/op     new ns/op     delta
BenchmarkHash8Bytes     1263          767           -39.27%
BenchmarkHash1K         14048         7766          -44.72%
BenchmarkHash8K         102245        55626         -45.60%

benchmark               old MB/s     new MB/s     speedup
BenchmarkHash8Bytes     6.33         10.43        1.65x
BenchmarkHash1K         72.89        131.85       1.81x
BenchmarkHash8K         80.12        147.27       1.84x

Change-Id: Ib4adf429423b20495580400be10bd7e171bcc70b
Reviewed-on: https://go-review.googlesource.com/32318
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: David Chase <drchase@google.com>
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
This commit is contained in:
Mike Strosaker 2016-10-28 19:50:16 -04:00 committed by David Chase
parent 854ae03db4
commit 7b50bd8abf
3 changed files with 271 additions and 2 deletions

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 s390x
// +build 386 amd64 s390x ppc64le
package sha256

View File

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!386,!s390x
// +build !amd64,!386,!s390x,!ppc64le
package sha256

View File

@ -0,0 +1,269 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// SHA256 block routine. See sha256block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 63 {
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
// h = g
// g = f
// f = e
// e = d + T1
// d = c
// c = b
// b = a
// a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
// Wt = Mt; for 0 <= t <= 15
//
// MSGSCHEDULE0(index) loads the 32-bit word at offset index*4 from the
// input pointer in R26, reverses its byte order, and stores the result at
// offset index*4 in the message schedule addressed by R27.
//
// The byte reversal is built in R11 from three rotate-and-mask steps:
// a rotate left by 24 supplies the top byte and the 0x0000FF00 byte, then two
// rotate-left-by-8 inserts fill in the 0x00FF0000 and 0x000000FF bytes.
//
// On exit R7 holds Wt (SHA256T1 consumes it from there). Clobbers R7, R11.
#define MSGSCHEDULE0(index) \
MOVWZ (index*4)(R26), R7; \
RLWNM $24, R7, $-1, R11; \
RLWMI $8, R7, $0x00FF0000, R11; \
RLWMI $8, R7, $0x000000FF, R11; \
MOVWZ R11, R7; \
MOVWZ R7, (index*4)(R27)
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63
// SIGMA0(x) = ROTR(7,x) XOR ROTR(18,x) XOR SHR(3,x)
// SIGMA1(x) = ROTR(17,x) XOR ROTR(19,x) XOR SHR(10,x)
//
// MSGSCHEDULE1(index) reads the four earlier schedule words (index-2,
// index-7, index-15, index-16) from the schedule addressed by R27, combines
// them as above, and stores the new word at offset index*4.
//
// Right rotations by n are written as left rotations by 32-n (RLWNM with a
// full mask). SIGMA1(Wt-2) is accumulated in R7 and SIGMA0(Wt-15) in R8; the
// two computations are interleaved, with R9 and R10 as shared temporaries
// and R11 used for the Wt-7 and Wt-16 loads.
//
// On exit R7 holds Wt (SHA256T1 consumes it from there).
// Clobbers R7, R8, R9, R10, R11.
#define MSGSCHEDULE1(index) \
MOVWZ ((index-2)*4)(R27), R7; \
MOVWZ R7, R9; \
RLWNM $32-17, R7, $-1, R7; \
MOVWZ R9, R10; \
RLWNM $32-19, R9, $-1, R9; \
SRW $10, R10; \
MOVWZ ((index-15)*4)(R27), R8; \
XOR R9, R7; \
MOVWZ R8, R9; \
XOR R10, R7; \
RLWNM $32-7, R8, $-1, R8; \
MOVWZ R9, R10; \
SRW $3, R10; \
RLWNM $32-18, R9, $-1, R9; \
MOVWZ ((index-7)*4)(R27), R11; \
ADD R11, R7; \
XOR R9, R8; \
XOR R10, R8; \
MOVWZ ((index-16)*4)(R27), R11; \
ADD R11, R8; \
ADD R8, R7; \
MOVWZ R7, ((index)*4)(R27)
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(6,x) XOR ROTR(11,x) XOR ROTR(25,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
//
// SHA256T1(const, e, f, g, h) computes T1 for one round.
//
// Inputs:  R7 must hold Wt on entry (left there by MSGSCHEDULE0/1);
//          const is the round constant Kt; e, f, g, h are registers.
// Output:  R7 = T1.
//
// h is used as the accumulator and is modified in place: Wt, Kt and
// BIGSIGMA1(e) are added to it before Ch(e, f, g) is added into R7.
// BIGSIGMA1(e) is formed in R10, (e AND f) in R9, and (NOT e AND g) in R7,
// where NOT is implemented as NOR of the register with itself.
//
// Clobbers R7, R9, R10 and h; e, f and g are only read.
#define SHA256T1(const, e, f, g, h) \
ADD R7, h; \
MOVWZ e, R7; \
ADD $const, h; \
MOVWZ e, R9; \
RLWNM $32-6, R7, $-1, R7; \
MOVWZ e, R10; \
RLWNM $32-11, R9, $-1, R9; \
XOR R9, R7; \
MOVWZ e, R9; \
RLWNM $32-25, R10, $-1, R10; \
AND f, R9; \
XOR R7, R10; \
MOVWZ e, R7; \
NOR R7, R7, R7; \
ADD R10, h; \
AND g, R7; \
XOR R9, R7; \
ADD h, R7
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(2,x) XOR ROTR(13,x) XOR ROTR(22,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
//
// SHA256T2(a, b, c) computes T2 for one round.
//
// Output: R8 = T2.
//
// BIGSIGMA0(a) is accumulated in R28 and Maj(a, b, c) in R8, with R9 and
// R10 as temporaries; the two computations are interleaved. R7 is not
// touched, so the T1 value left there by SHA256T1 survives this macro.
//
// Clobbers R8, R9, R10, R28; a, b and c are only read.
#define SHA256T2(a, b, c) \
MOVWZ a, R28; \
MOVWZ c, R8; \
RLWNM $32-2, R28, $-1, R28; \
MOVWZ a, R10; \
AND b, R8; \
RLWNM $32-13, R10, $-1, R10; \
MOVWZ a, R9; \
AND c, R9; \
XOR R10, R28; \
XOR R9, R8; \
MOVWZ a, R10; \
MOVWZ b, R9; \
RLWNM $32-22, R10, $-1, R10; \
AND a, R9; \
XOR R9, R8; \
XOR R10, R28; \
ADD R28, R8
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
//
// SHA256ROUND expects Wt in R7 on entry (see SHA256T1). After SHA256T1 and
// SHA256T2 have run, R7 holds T1 and R8 holds T2, so:
//   h = R8 + R7  (the next round's a)
//   d = d + R7   (the next round's e)
// The remaining working variables are not moved; callers rotate the
// register arguments from one round to the next instead.
// The index parameter is not used by this macro itself.
#define SHA256ROUND(index, const, a, b, c, d, e, f, g, h) \
SHA256T1(const, e, f, g, h); \
SHA256T2(a, b, c); \
MOVWZ R8, h; \
ADD R7, d; \
ADD R7, h
// SHA256ROUND0 is one round for 0 <= index <= 15: it takes Wt directly from
// the input block via MSGSCHEDULE0 (which leaves Wt in R7) and then runs
// the common round body.
#define SHA256ROUND0(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
// SHA256ROUND1 is one round for 16 <= index <= 63: it derives Wt from
// earlier schedule words via MSGSCHEDULE1 (which leaves Wt in R7) and then
// runs the common round body.
#define SHA256ROUND1(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE1(index); \
SHA256ROUND(index, const, a, b, c, d, e, f, g, h)
// func block(dig *digest, p []byte)
//
// block folds every complete 64-byte chunk of p into the eight 32-bit hash
// words at the start of *dig. Any trailing bytes beyond the last multiple
// of 64 are ignored; if p holds no complete chunk the function returns
// without touching dig.
//
// Register use:
//   R26       current position in p
//   R27       dig while loading/storing H0..H7, message schedule otherwise
//   R14..R21  working variables a..h
//   R7..R11, R28, R29  scratch
//
// Stack use (raw offsets from R1):
//   0(R1)..255(R1)  message schedule W[0..63], 4 bytes each
//   256(R1)         end-of-input pointer (spilled because SHA256T2 uses R28)
//
// NOTE(review): the schedule starts at 0(R1). Confirm against the ppc64
// frame layout that nothing the runtime keeps at the base of the frame
// (e.g. a saved link register used for tracebacks) lives in that range.
TEXT ·block(SB),0,$296-32
MOVD p_base+8(FP), R26
MOVD p_len+16(FP), R29
// Round the length down to a multiple of 64 bytes.
SRD $6, R29
SLD $6, R29
// R28 = address just past the last complete chunk; keep a copy on the stack.
ADD R26, R29, R28
MOVD R28, 256(R1)
// Nothing to do when there is no complete chunk.
CMP R26, R28
BEQ end
MOVD dig+0(FP), R27
MOVWZ (0*4)(R27), R14 // a = H0
MOVWZ (1*4)(R27), R15 // b = H1
MOVWZ (2*4)(R27), R16 // c = H2
MOVWZ (3*4)(R27), R17 // d = H3
MOVWZ (4*4)(R27), R18 // e = H4
MOVWZ (5*4)(R27), R19 // f = H5
MOVWZ (6*4)(R27), R20 // g = H6
MOVWZ (7*4)(R27), R21 // h = H7
loop:
MOVD R1, R27 // R27: message schedule
// 64 unrolled rounds. Instead of moving a..h after each round, the register
// arguments are rotated by one position per round; after every 8 rounds
// (and so after all 64) R14..R21 are back to holding a..h in order.
// Rounds 0-15 take Wt from the input block.
SHA256ROUND0(0, 0x428a2f98, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND0(1, 0x71374491, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND0(2, 0xb5c0fbcf, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND0(3, 0xe9b5dba5, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND0(4, 0x3956c25b, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND0(5, 0x59f111f1, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND0(6, 0x923f82a4, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND0(7, 0xab1c5ed5, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND0(8, 0xd807aa98, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND0(9, 0x12835b01, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND0(10, 0x243185be, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND0(11, 0x550c7dc3, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND0(12, 0x72be5d74, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND0(13, 0x80deb1fe, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND0(14, 0x9bdc06a7, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND0(15, 0xc19bf174, R15, R16, R17, R18, R19, R20, R21, R14)
// Rounds 16-63 derive Wt from earlier schedule words.
SHA256ROUND1(16, 0xe49b69c1, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(17, 0xefbe4786, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(18, 0x0fc19dc6, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(19, 0x240ca1cc, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(20, 0x2de92c6f, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(21, 0x4a7484aa, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(22, 0x5cb0a9dc, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(23, 0x76f988da, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND1(24, 0x983e5152, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(25, 0xa831c66d, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(26, 0xb00327c8, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(27, 0xbf597fc7, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(28, 0xc6e00bf3, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(29, 0xd5a79147, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(30, 0x06ca6351, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(31, 0x14292967, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND1(32, 0x27b70a85, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(33, 0x2e1b2138, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(34, 0x4d2c6dfc, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(35, 0x53380d13, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(36, 0x650a7354, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(37, 0x766a0abb, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(38, 0x81c2c92e, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(39, 0x92722c85, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND1(40, 0xa2bfe8a1, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(41, 0xa81a664b, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(42, 0xc24b8b70, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(43, 0xc76c51a3, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(44, 0xd192e819, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(45, 0xd6990624, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(46, 0xf40e3585, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(47, 0x106aa070, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND1(48, 0x19a4c116, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(49, 0x1e376c08, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(50, 0x2748774c, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(51, 0x34b0bcb5, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(52, 0x391c0cb3, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(53, 0x4ed8aa4a, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(54, 0x5b9cca4f, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(55, 0x682e6ff3, R15, R16, R17, R18, R19, R20, R21, R14)
SHA256ROUND1(56, 0x748f82ee, R14, R15, R16, R17, R18, R19, R20, R21)
SHA256ROUND1(57, 0x78a5636f, R21, R14, R15, R16, R17, R18, R19, R20)
SHA256ROUND1(58, 0x84c87814, R20, R21, R14, R15, R16, R17, R18, R19)
SHA256ROUND1(59, 0x8cc70208, R19, R20, R21, R14, R15, R16, R17, R18)
SHA256ROUND1(60, 0x90befffa, R18, R19, R20, R21, R14, R15, R16, R17)
SHA256ROUND1(61, 0xa4506ceb, R17, R18, R19, R20, R21, R14, R15, R16)
SHA256ROUND1(62, 0xbef9a3f7, R16, R17, R18, R19, R20, R21, R14, R15)
SHA256ROUND1(63, 0xc67178f2, R15, R16, R17, R18, R19, R20, R21, R14)
// Add the previous hash words into the working variables and write the sums
// back to dig. The sums stay in R14..R21 as a..h for the next chunk.
MOVD dig+0(FP), R27
MOVWZ (0*4)(R27), R11
ADD R11, R14 // H0 = a + H0
MOVWZ R14, (0*4)(R27)
MOVWZ (1*4)(R27), R11
ADD R11, R15 // H1 = b + H1
MOVWZ R15, (1*4)(R27)
MOVWZ (2*4)(R27), R11
ADD R11, R16 // H2 = c + H2
MOVWZ R16, (2*4)(R27)
MOVWZ (3*4)(R27), R11
ADD R11, R17 // H3 = d + H3
MOVWZ R17, (3*4)(R27)
MOVWZ (4*4)(R27), R11
ADD R11, R18 // H4 = e + H4
MOVWZ R18, (4*4)(R27)
MOVWZ (5*4)(R27), R11
ADD R11, R19 // H5 = f + H5
MOVWZ R19, (5*4)(R27)
MOVWZ (6*4)(R27), R11
ADD R11, R20 // H6 = g + H6
MOVWZ R20, (6*4)(R27)
MOVWZ (7*4)(R27), R11
ADD R11, R21 // H7 = h + H7
MOVWZ R21, (7*4)(R27)
// Advance to the next 64-byte chunk and loop while below the saved end pointer.
ADD $64, R26
MOVD 256(R1), R11
CMPU R26, R11
BLT loop
end:
RET