mirror of
https://github.com/golang/go
synced 2024-11-19 14:54:43 -07:00
An ARM version of sha1block.go with a big improvement in throughput
(up to 2.8x). This is a partially unrolled version which performs better for small hashes and only sacrifices a small amount of ultimate speed to a fully unrolled version which uses 5k of code. Code size Before 1636 bytes After 1880 bytes 15% larger Benchmarks on Samsung Exynos 5 ARMv7 Chromebook benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1907 1136 -40.43% BenchmarkHash1K 20280 7547 -62.79% BenchmarkHash8K 148469 52576 -64.59% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.19 7.04 1.68x BenchmarkHash1K 50.49 135.68 2.69x BenchmarkHash8K 55.18 155.81 2.82x LGTM=dave, agl R=dave, bradfitz, agl, adg, nick CC=golang-codereviews https://golang.org/cl/56990044
This commit is contained in:
parent
2ea859a779
commit
44c252bda2
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!386
|
||||
// +build !amd64,!386,!arm
|
||||
|
||||
// SHA1 block step.
|
||||
// In its own file so that a faster assembly or C version
|
||||
|
217
src/pkg/crypto/sha1/sha1block_arm.s
Normal file
217
src/pkg/crypto/sha1/sha1block_arm.s
Normal file
@ -0,0 +1,217 @@
|
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
//
|
||||
// ARM version of md5block.go
|
||||
|
||||
#include "../../../cmd/ld/textflag.h"
|
||||
|
||||
// SHA1 block routine. See sha1block.go for Go equivalent.
|
||||
//
|
||||
// There are 80 rounds of 4 types:
|
||||
// - rounds 0-15 are type 1 and load data (ROUND1 macro).
|
||||
// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
|
||||
// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
|
||||
// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
|
||||
// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
|
||||
//
|
||||
// Each round loads or shuffles the data, then computes a per-round
|
||||
// function of b, c, d, and then mixes the result into and rotates the
|
||||
// five registers a, b, c, d, e holding the intermediate results.
|
||||
//
|
||||
// The register rotation is implemented by rotating the arguments to
|
||||
// the round macros instead of by explicit move instructions.
|
||||
|
||||
// Register definitions
|
||||
data = 0 // Pointer to incoming data
|
||||
const = 1 // Current constant for SHA round
|
||||
a = 2 // SHA1 accumulator
|
||||
b = 3 // SHA1 accumulator
|
||||
c = 4 // SHA1 accumulator
|
||||
d = 5 // SHA1 accumulator
|
||||
e = 6 // SHA1 accumulator
|
||||
t0 = 7 // Temporary
|
||||
t1 = 8 // Temporary
|
||||
// r9, r10 are forbidden
|
||||
// r11 is OK provided you check the assembler that no synthetic instructions use it
|
||||
t2 = 11 // Temporary
|
||||
ctr = 12 // loop counter
|
||||
w = 14 // point to w buffer
|
||||
|
||||
// func block(dig *digest, p []byte)
|
||||
// 0(FP) is *digest
|
||||
// 4(FP) is p.array (struct Slice)
|
||||
// 8(FP) is p.len
|
||||
//12(FP) is p.cap
|
||||
//
|
||||
// Stack frame
|
||||
p_end = -4 // -4(SP) pointer to the end of data
|
||||
p_data = p_end - 4 // -8(SP) current data pointer
|
||||
w_buf = p_data - 4*80 // -328(SP) 80 words temporary buffer w uint32[80]
|
||||
saved = w_buf - 4*5 // -348(SP) saved sha1 registers a,b,c,d,e - these must be last
|
||||
// Total size +4 for saved LR is 352
|
||||
|
||||
// w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
|
||||
// e += w[i]
|
||||
#define LOAD(e) \
|
||||
MOVBU 2(R(data)), R(t0) ; \
|
||||
MOVBU 3(R(data)), R(t1) ; \
|
||||
MOVBU 1(R(data)), R(t2) ; \
|
||||
ORR R(t0)<<8, R(t1), R(t0) ; \
|
||||
MOVBU.P 4(R(data)), R(t1) ; \
|
||||
ORR R(t2)<<16, R(t0), R(t0) ; \
|
||||
ORR R(t1)<<24, R(t0), R(t0) ; \
|
||||
MOVW.P R(t0), 4(R(w)) ; \
|
||||
ADD R(t0), R(e), R(e)
|
||||
|
||||
// tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
|
||||
// w[i&0xf] = tmp<<1 | tmp>>(32-1)
|
||||
// e += w[i&0xf]
|
||||
#define SHUFFLE(e) \
|
||||
MOVW (-16*4)(R(w)), R(t0) ; \
|
||||
MOVW (-14*4)(R(w)), R(t1) ; \
|
||||
MOVW (-8*4)(R(w)), R(t2) ; \
|
||||
EOR R(t0), R(t1), R(t0) ; \
|
||||
MOVW (-3*4)(R(w)), R(t1) ; \
|
||||
EOR R(t2), R(t0), R(t0) ; \
|
||||
EOR R(t0), R(t1), R(t0) ; \
|
||||
MOVW R(t0)@>(32-1), R(t0) ; \
|
||||
MOVW.P R(t0), 4(R(w)) ; \
|
||||
ADD R(t0), R(e), R(e)
|
||||
|
||||
// t1 = (b & c) | ((~b) & d)
|
||||
#define FUNC1(a, b, c, d, e) \
|
||||
MVN R(b), R(t1) ; \
|
||||
AND R(b), R(c), R(t0) ; \
|
||||
AND R(d), R(t1), R(t1) ; \
|
||||
ORR R(t0), R(t1), R(t1)
|
||||
|
||||
// t1 = b ^ c ^ d
|
||||
#define FUNC2(a, b, c, d, e) \
|
||||
EOR R(b), R(c), R(t1) ; \
|
||||
EOR R(d), R(t1), R(t1)
|
||||
|
||||
// t1 = (b & c) | (b & d) | (c & d) =
|
||||
// t1 = (b & c) | ((b | c) & d)
|
||||
#define FUNC3(a, b, c, d, e) \
|
||||
ORR R(b), R(c), R(t0) ; \
|
||||
AND R(b), R(c), R(t1) ; \
|
||||
AND R(d), R(t0), R(t0) ; \
|
||||
ORR R(t0), R(t1), R(t1)
|
||||
|
||||
#define FUNC4 FUNC2
|
||||
|
||||
// a5 := a<<5 | a>>(32-5)
|
||||
// b = b<<30 | b>>(32-30)
|
||||
// e = a5 + t1 + e + const
|
||||
#define MIX(a, b, c, d, e) \
|
||||
ADD R(t1), R(e), R(e) ; \
|
||||
MOVW R(b)@>(32-30), R(b) ; \
|
||||
ADD R(a)@>(32-5), R(e), R(e) ; \
|
||||
ADD R(const), R(e), R(e)
|
||||
|
||||
#define ROUND1(a, b, c, d, e) \
|
||||
LOAD(e) ; \
|
||||
FUNC1(a, b, c, d, e) ; \
|
||||
MIX(a, b, c, d, e)
|
||||
|
||||
#define ROUND1x(a, b, c, d, e) \
|
||||
SHUFFLE(e) ; \
|
||||
FUNC1(a, b, c, d, e) ; \
|
||||
MIX(a, b, c, d, e)
|
||||
|
||||
#define ROUND2(a, b, c, d, e) \
|
||||
SHUFFLE(e) ; \
|
||||
FUNC2(a, b, c, d, e) ; \
|
||||
MIX(a, b, c, d, e)
|
||||
|
||||
#define ROUND3(a, b, c, d, e) \
|
||||
SHUFFLE(e) ; \
|
||||
FUNC3(a, b, c, d, e) ; \
|
||||
MIX(a, b, c, d, e)
|
||||
|
||||
#define ROUND4(a, b, c, d, e) \
|
||||
SHUFFLE(e) ; \
|
||||
FUNC4(a, b, c, d, e) ; \
|
||||
MIX(a, b, c, d, e)
|
||||
|
||||
|
||||
// func block(dig *digest, p []byte)
|
||||
TEXT ·block(SB), 0, $352-16
|
||||
MOVW p+4(FP), R(data) // pointer to the data
|
||||
MOVW p_len+8(FP), R(t0) // number of bytes
|
||||
ADD R(data), R(t0)
|
||||
MOVW R(t0), p_end(SP) // pointer to end of data
|
||||
|
||||
// Load up initial SHA1 accumulator
|
||||
MOVW dig+0(FP), R(t0)
|
||||
MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)]
|
||||
|
||||
loop:
|
||||
// Save registers at SP+4 onwards
|
||||
MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13)
|
||||
|
||||
MOVW $w_buf(SP), R(w)
|
||||
MOVW $0x5A827999, R(const)
|
||||
MOVW $3, R(ctr)
|
||||
loop1: ROUND1(a, b, c, d, e)
|
||||
ROUND1(e, a, b, c, d)
|
||||
ROUND1(d, e, a, b, c)
|
||||
ROUND1(c, d, e, a, b)
|
||||
ROUND1(b, c, d, e, a)
|
||||
SUB.S $1, R(ctr)
|
||||
BNE loop1
|
||||
|
||||
ROUND1(a, b, c, d, e)
|
||||
ROUND1x(e, a, b, c, d)
|
||||
ROUND1x(d, e, a, b, c)
|
||||
ROUND1x(c, d, e, a, b)
|
||||
ROUND1x(b, c, d, e, a)
|
||||
|
||||
MOVW $0x6ED9EBA1, R(const)
|
||||
MOVW $4, R(ctr)
|
||||
loop2: ROUND2(a, b, c, d, e)
|
||||
ROUND2(e, a, b, c, d)
|
||||
ROUND2(d, e, a, b, c)
|
||||
ROUND2(c, d, e, a, b)
|
||||
ROUND2(b, c, d, e, a)
|
||||
SUB.S $1, R(ctr)
|
||||
BNE loop2
|
||||
|
||||
MOVW $0x8F1BBCDC, R(const)
|
||||
MOVW $4, R(ctr)
|
||||
loop3: ROUND3(a, b, c, d, e)
|
||||
ROUND3(e, a, b, c, d)
|
||||
ROUND3(d, e, a, b, c)
|
||||
ROUND3(c, d, e, a, b)
|
||||
ROUND3(b, c, d, e, a)
|
||||
SUB.S $1, R(ctr)
|
||||
BNE loop3
|
||||
|
||||
MOVW $0xCA62C1D6, R(const)
|
||||
MOVW $4, R(ctr)
|
||||
loop4: ROUND4(a, b, c, d, e)
|
||||
ROUND4(e, a, b, c, d)
|
||||
ROUND4(d, e, a, b, c)
|
||||
ROUND4(c, d, e, a, b)
|
||||
ROUND4(b, c, d, e, a)
|
||||
SUB.S $1, R(ctr)
|
||||
BNE loop4
|
||||
|
||||
// Accumulate - restoring registers from SP+4
|
||||
MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)]
|
||||
ADD R(t0), R(a)
|
||||
ADD R(t1), R(b)
|
||||
ADD R(t2), R(c)
|
||||
ADD R(ctr), R(d)
|
||||
ADD R(w), R(e)
|
||||
|
||||
MOVW p_end(SP), R(t0)
|
||||
CMP R(t0), R(data)
|
||||
BLO loop
|
||||
|
||||
// Save final SHA1 accumulator
|
||||
MOVW dig+0(FP), R(t0)
|
||||
MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0))
|
||||
|
||||
RET
|
@ -2,7 +2,7 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build amd64 386
|
||||
// +build amd64 386 arm
|
||||
|
||||
package sha1
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user