go/src/crypto/sha1/sha1block_arm.s

// Copyright 2014 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// ARM version of sha1block.go

#include "textflag.h"

// SHA1 block routine. See sha1block.go for Go equivalent.
//
// There are 80 rounds of 4 types:
//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
//
// Each round loads or shuffles the data, then computes a per-round
// function of b, c, d, and then mixes the result into and rotates the
// five registers a, b, c, d, e holding the intermediate results.
//
// The register rotation is implemented by rotating the arguments to
// the round macros instead of by explicit move instructions.

// Register definitions
//
// Each name below is bound to a register number; the code always refers
// to the register itself as R(name), for example R(data) or R(t0).
// a..e occupy consecutive ascending numbers, and so do the scratch
// registers t0, t1, t2, ctr, w used to reload the saved a..e at the end
// of each block, so both MOVM register lists are in ascending order.
data = 0	// Pointer to incoming data
const = 1	// Current constant for SHA round
a = 2		// SHA1 accumulator
b = 3		// SHA1 accumulator
c = 4		// SHA1 accumulator
d = 5		// SHA1 accumulator
e = 6		// SHA1 accumulator
t0 = 7		// Temporary
t1 = 8		// Temporary
// r9, r10 are forbidden
// r11 is OK provided you check the assembler that no synthetic instructions use it
t2 = 11		// Temporary
ctr = 12	// loop counter
w = 14		// point to w buffer

// func block(dig *digest, p []byte)
// 0(FP) is *digest
// 4(FP) is p.array (struct Slice)
// 8(FP) is p.len
//12(FP) is p.cap
//
// Stack frame
//
// The names below are byte offsets, used as name(SP). Each one is derived
// from the previous one, so the layout is p_end, p_data, the 80-word w
// buffer, and finally the five saved accumulator words.
//
// In the code visible in this file only p_end and w_buf are referenced by
// name. p_data is not used, and the a..e save area is not addressed as
// saved(SP): block reaches it with MOVM.IB on R13 instead, which is why
// that area has to stay last in the frame.
p_end = -4		// -4(SP) pointer to the end of data
p_data = p_end - 4	// -8(SP) current data pointer
w_buf = p_data - 4*80	// -328(SP) 80 words temporary buffer w uint32[80]
saved = w_buf - 4*5	// -348(SP) saved sha1 registers a,b,c,d,e - these must be last
// Total size +4 for saved LR is 352

	// LOAD(e): fetch the next message word and add it into e.
	//
	// w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
	// e += w[i]
	//
	// The word is assembled from four single-byte loads, most significant
	// byte first, so no word-aligned access to the input is performed.
	// The load of byte 0 is post-indexed and advances R(data) by 4; the
	// store is post-indexed and advances R(w) by 4.
	// Clobbers t0, t1, t2.
	// No comments are placed inside the macro body: every line of it is
	// joined by a trailing backslash.
#define LOAD(e) \
	MOVBU	2(R(data)), R(t0) ; \
	MOVBU	3(R(data)), R(t1) ; \
	MOVBU	1(R(data)), R(t2) ; \
	ORR	R(t0)<<8, R(t1), R(t0)	    ; \
	MOVBU.P	4(R(data)), R(t1) ; \
	ORR	R(t2)<<16, R(t0), R(t0)	    ; \
	ORR	R(t1)<<24, R(t0), R(t0)	    ; \
	MOVW.P	R(t0), 4(R(w))		    ; \
	ADD	R(t0), R(e), R(e)
	
	// SHUFFLE(e): compute the next message-schedule word and add it into e.
	//
	// tmp := w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]
	// w[i] = tmp<<1 | tmp>>(32-1)
	// e += w[i]
	//
	// R(w) points at slot i of the w buffer, so the earlier words are read
	// at fixed negative offsets from it. The buffer is indexed linearly
	// here (no &0xf wrap-around): each word is stored at R(w) with a
	// post-indexed store that advances R(w) by 4.
	// The rotate right by 32-1 is the rotate left by 1 written above.
	// Clobbers t0, t1, t2.
#define SHUFFLE(e) \
	MOVW	(-16*4)(R(w)), R(t0) ; \
	MOVW	(-14*4)(R(w)), R(t1) ; \
	MOVW	(-8*4)(R(w)), R(t2)  ; \
	EOR	R(t0), R(t1), R(t0)  ; \
	MOVW	(-3*4)(R(w)), R(t1)  ; \
	EOR	R(t2), R(t0), R(t0)  ; \
	EOR	R(t0), R(t1), R(t0)  ; \
	MOVW	R(t0)@>(32-1), R(t0)  ; \
	MOVW.P	R(t0), 4(R(w))	  ; \
	ADD	R(t0), R(e), R(e)

	// FUNC1: round function for the type 1 rounds (ROUND1 and ROUND1x).
	//
	// t1 = (b & c) | ((~b) & d)
	//
	// The result is left in t1, which MIX consumes; t0 is clobbered.
	// b, c and d are only read. The a and e arguments are not used by
	// this macro.
#define FUNC1(a, b, c, d, e) \
	MVN	R(b), R(t1)	   ; \
	AND	R(b), R(c), R(t0)  ; \
	AND	R(d), R(t1), R(t1) ; \
	ORR	R(t0), R(t1), R(t1)

	// FUNC2: round function for the type 2 rounds; FUNC4 is defined as an
	// alias of it, so the type 4 rounds use it as well.
	//
	// t1 = b ^ c ^ d
	//
	// The result is left in t1, which MIX consumes. No other register is
	// written. The a and e arguments are not used by this macro.
#define FUNC2(a, b, c, d, e) \
	EOR	R(b), R(c), R(t1) ; \
	EOR	R(d), R(t1), R(t1)

	// FUNC3: round function for the type 3 rounds (bitwise majority of
	// b, c and d).
	//
	// t1 = (b & c) | (b & d) | (c & d) =
	// t1 = (b & c) | ((b | c) & d)
	//
	// The second form is the one implemented; it needs one operation
	// fewer than the first. The result is left in t1, which MIX consumes;
	// t0 is clobbered. The a and e arguments are not used by this macro.
#define FUNC3(a, b, c, d, e) \
	ORR	R(b), R(c), R(t0)  ; \
	AND	R(b), R(c), R(t1)  ; \
	AND	R(d), R(t0), R(t0) ; \
	ORR	R(t0), R(t1), R(t1)

// FUNC4: the type 4 rounds use the same function as the type 2 rounds
// (t1 = b ^ c ^ d), so it is simply an alias of FUNC2.
#define FUNC4 FUNC2

	// MIX: fold the round function result, the rotated a and the round
	// constant into e, and rotate b in place.
	//
	// a5 := a<<5 | a>>(32-5)
	// b = b<<30 | b>>(32-30)
	// e = a5 + t1 + e + const
	//
	// On entry t1 holds the FUNCn result and e already includes the
	// message word added by LOAD or SHUFFLE. The left rotations are
	// written as right rotations by 32-n. a is only read (its rotation is
	// applied as a shifted operand of the ADD); b and e are updated.
	// The c and d arguments are not used by this macro.
#define MIX(a, b, c, d, e) \
	ADD	R(t1), R(e), R(e)	 ; \
	MOVW	R(b)@>(32-30), R(b)	 ; \
	ADD	R(a)@>(32-5), R(e), R(e) ; \
	ADD	R(const), R(e), R(e)

// ROUND1: one type 1 round that loads its message word from the input
// (rounds 0-15). Reads 4 bytes at R(data), appends the word to the w
// buffer, then applies FUNC1 and MIX. Advances R(data) and R(w) by 4.
#define ROUND1(a, b, c, d, e) \
	LOAD(e)		; \
	FUNC1(a, b, c, d, e)	; \
	MIX(a, b, c, d, e)

// ROUND1x: one type 1 round that derives its message word from earlier
// w entries instead of loading input (rounds 16-19). Same as ROUND1
// with SHUFFLE in place of LOAD; R(data) is not touched.
#define ROUND1x(a, b, c, d, e) \
	SHUFFLE(e)	; \
	FUNC1(a, b, c, d, e)	; \
	MIX(a, b, c, d, e)

// ROUND2: one type 2 round (rounds 20-39): SHUFFLE, then FUNC2
// (b ^ c ^ d), then MIX.
#define ROUND2(a, b, c, d, e) \
	SHUFFLE(e)	; \
	FUNC2(a, b, c, d, e)	; \
	MIX(a, b, c, d, e)

// ROUND3: one type 3 round (rounds 40-59): SHUFFLE, then FUNC3
// (majority of b, c, d), then MIX.
#define ROUND3(a, b, c, d, e) \
	SHUFFLE(e)	; \
	FUNC3(a, b, c, d, e)	; \
	MIX(a, b, c, d, e)

// ROUND4: one type 4 round (rounds 60-79): SHUFFLE, then FUNC4
// (an alias of FUNC2, b ^ c ^ d), then MIX.
#define ROUND4(a, b, c, d, e) \
	SHUFFLE(e)	; \
	FUNC4(a, b, c, d, e)	; \
	MIX(a, b, c, d, e)


// func block(dig *digest, p []byte)
//
// Hashes every complete 64-byte chunk of p into the five 32-bit state
// words that dig points to. The first five words at dig are read once on
// entry and written back once on exit.
//
// Only len(p) rounded down to a multiple of 64 bytes is consumed. If that
// leaves nothing to hash (len(p) < 64, including an empty slice) the
// routine returns without reading p or touching dig. Without this check
// the bottom-tested main loop would always hash one chunk, reading past
// the end of a short slice.
//
// Registers: uses every register named in the register definitions
// (R0-R8, R11, R12, R14). R14 is overwritten by the w pointer, so the
// routine relies on its stack frame for the return address.
TEXT	·block(SB), 0, $352-16
	MOVW	p+4(FP), R(data)	// pointer to the data
	MOVW	p_len+8(FP), R(t0)	// number of bytes
	BIC	$63, R(t0)		// keep whole 64-byte chunks only
	CMP	$0, R(t0)
	BEQ	done			// nothing to hash: leave dig untouched
	ADD	R(data), R(t0)
	MOVW	R(t0), p_end(SP)	// pointer to end of data

	// Load up initial SHA1 accumulator
	MOVW	dig+0(FP), R(t0)
	MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)]

loop:
	// One iteration per 64-byte chunk.
	// Save registers at SP+4 onwards
	MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13)

	// Rounds 0-14: three passes of five rounds. Each later ROUND1..ROUND4
	// call rotates the argument list by one position instead of moving
	// registers, so after five rounds the names line up again.
	MOVW	$w_buf(SP), R(w)
	MOVW	$0x5A827999, R(const)
	MOVW	$3, R(ctr)
loop1:	ROUND1(a, b, c, d, e)
	ROUND1(e, a, b, c, d)
	ROUND1(d, e, a, b, c)
	ROUND1(c, d, e, a, b)
	ROUND1(b, c, d, e, a)
	SUB.S	$1, R(ctr)
	BNE	loop1

	// Round 15 is the last one that loads input; rounds 16-19 are still
	// type 1 but take their word from the w buffer.
	ROUND1(a, b, c, d, e)
	ROUND1x(e, a, b, c, d)
	ROUND1x(d, e, a, b, c)
	ROUND1x(c, d, e, a, b)
	ROUND1x(b, c, d, e, a)

	// Rounds 20-39: four passes of five rounds.
	MOVW	$0x6ED9EBA1, R(const)
	MOVW	$4, R(ctr)
loop2:	ROUND2(a, b, c, d, e)
	ROUND2(e, a, b, c, d)
	ROUND2(d, e, a, b, c)
	ROUND2(c, d, e, a, b)
	ROUND2(b, c, d, e, a)
	SUB.S	$1, R(ctr)
	BNE	loop2

	// Rounds 40-59: four passes of five rounds.
	MOVW	$0x8F1BBCDC, R(const)
	MOVW	$4, R(ctr)
loop3:	ROUND3(a, b, c, d, e)
	ROUND3(e, a, b, c, d)
	ROUND3(d, e, a, b, c)
	ROUND3(c, d, e, a, b)
	ROUND3(b, c, d, e, a)
	SUB.S	$1, R(ctr)
	BNE	loop3

	// Rounds 60-79: four passes of five rounds.
	MOVW	$0xCA62C1D6, R(const)
	MOVW	$4, R(ctr)
loop4:	ROUND4(a, b, c, d, e)
	ROUND4(e, a, b, c, d)
	ROUND4(d, e, a, b, c)
	ROUND4(c, d, e, a, b)
	ROUND4(b, c, d, e, a)
	SUB.S	$1, R(ctr)
	BNE	loop4

	// Accumulate - restoring registers from SP+4
	// The values saved at the top of the loop are reloaded into scratch
	// registers (all free at this point) and added to the new a..e.
	MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)]
	ADD	R(t0), R(a)
	ADD	R(t1), R(b)
	ADD	R(t2), R(c)
	ADD	R(ctr), R(d)
	ADD	R(w), R(e)

	// R(data) was advanced by 64 during rounds 0-15; continue while it is
	// still below the end pointer (unsigned compare).
	MOVW	p_end(SP), R(t0)
	CMP	R(t0), R(data)
	BLO	loop

	// Save final SHA1 accumulator
	MOVW	dig+0(FP), R(t0)
	MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0))

done:
	RET
An ARM version of sha1block.go with a big improvement in throughput (up to 2.8x). This is a partially unrolled version which performs better for small hashes and only sacrifices a small amount of ultimate speed to a fully unrolled version which uses 5k of code. Code size Before 1636 bytes After 1880 bytes 15% larger Benchmarks on Samsung Exynos 5 ARMv7 Chromebook benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1907 1136 -40.43% BenchmarkHash1K 20280 7547 -62.79% BenchmarkHash8K 148469 52576 -64.59% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.19 7.04 1.68x BenchmarkHash1K 50.49 135.68 2.69x BenchmarkHash8K 55.18 155.81 2.82x LGTM=dave, agl R=dave, bradfitz, agl, adg, nick CC=golang-codereviews https://golang.org/cl/56990044 2014-02-12 11:24:52 -07:00			`// Copyright 2014 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`
			`//`
			`// ARM version of md5block.go`

all: copy cmd/ld/textflag.h into pkg/GOOS_GOARCH The file is used by assembly code to define symbols like NOSPLIT. Having it hidden inside the cmd directory makes it hard to access outside the standard repository. Solution: As with a couple of other files used by cgo, copy the file into the pkg directory and add a -I argument to the assembler to access it. Thus one can write just #include "textflag.h" in .s files. The names in runtime are not updated because in the boot sequence the file has not been copied yet when runtime is built. All other .s files in the repository are updated. Changes to doc/asm.html, src/cmd/dist/build.c, and src/cmd/go/build.go are hand-made. The rest are just the renaming done by a global substitution. (Yay sam). LGTM=rsc R=rsc CC=golang-codereviews https://golang.org/cl/128050043 2014-08-12 18:04:45 -06:00			`#include "textflag.h"`
An ARM version of sha1block.go with a big improvement in throughput (up to 2.8x). This is a partially unrolled version which performs better for small hashes and only sacrifices a small amount of ultimate speed to a fully unrolled version which uses 5k of code. Code size Before 1636 bytes After 1880 bytes 15% larger Benchmarks on Samsung Exynos 5 ARMv7 Chromebook benchmark old ns/op new ns/op delta BenchmarkHash8Bytes 1907 1136 -40.43% BenchmarkHash1K 20280 7547 -62.79% BenchmarkHash8K 148469 52576 -64.59% benchmark old MB/s new MB/s speedup BenchmarkHash8Bytes 4.19 7.04 1.68x BenchmarkHash1K 50.49 135.68 2.69x BenchmarkHash8K 55.18 155.81 2.82x LGTM=dave, agl R=dave, bradfitz, agl, adg, nick CC=golang-codereviews https://golang.org/cl/56990044 2014-02-12 11:24:52 -07:00
			`// SHA1 block routine. See sha1block.go for Go equivalent.`
			`//`
			`// There are 80 rounds of 4 types:`
			`// - rounds 0-15 are type 1 and load data (ROUND1 macro).`
			`// - rounds 16-19 are type 1 and do not load data (ROUND1x macro).`
			`// - rounds 20-39 are type 2 and do not load data (ROUND2 macro).`
			`// - rounds 40-59 are type 3 and do not load data (ROUND3 macro).`
			`// - rounds 60-79 are type 4 and do not load data (ROUND4 macro).`
			`//`
			`// Each round loads or shuffles the data, then computes a per-round`
			`// function of b, c, d, and then mixes the result into and rotates the`
			`// five registers a, b, c, d, e holding the intermediate results.`
			`//`
			`// The register rotation is implemented by rotating the arguments to`
			`// the round macros instead of by explicit move instructions.`

			`// Register definitions`
			`data = 0 // Pointer to incoming data`
			`const = 1 // Current constant for SHA round`
			`a = 2 // SHA1 accumulator`
			`b = 3 // SHA1 accumulator`
			`c = 4 // SHA1 accumulator`
			`d = 5 // SHA1 accumulator`
			`e = 6 // SHA1 accumulator`
			`t0 = 7 // Temporary`
			`t1 = 8 // Temporary`
			`// r9, r10 are forbidden`
			`// r11 is OK provided you check the assembler that no synthetic instructions use it`
			`t2 = 11 // Temporary`
			`ctr = 12 // loop counter`
			`w = 14 // point to w buffer`

			`// func block(dig *digest, p []byte)`
			`// 0(FP) is *digest`
			`// 4(FP) is p.array (struct Slice)`
			`// 8(FP) is p.len`
			`//12(FP) is p.cap`
			`//`
			`// Stack frame`
			`p_end = -4 // -4(SP) pointer to the end of data`
			`p_data = p_end - 4 // -8(SP) current data pointer`
			`w_buf = p_data - 4*80 // -328(SP) 80 words temporary buffer w uint32[80]`
			`saved = w_buf - 4*5 // -348(SP) saved sha1 registers a,b,c,d,e - these must be last`
			`// Total size +4 for saved LR is 352`

			`// w[i] = p[j]<<24 \| p[j+1]<<16 \| p[j+2]<<8 \| p[j+3]`
			`// e += w[i]`
			`#define LOAD(e) \`
			`MOVBU 2(R(data)), R(t0) ; \`
			`MOVBU 3(R(data)), R(t1) ; \`
			`MOVBU 1(R(data)), R(t2) ; \`
			`ORR R(t0)<<8, R(t1), R(t0) ; \`
			`MOVBU.P 4(R(data)), R(t1) ; \`
			`ORR R(t2)<<16, R(t0), R(t0) ; \`
			`ORR R(t1)<<24, R(t0), R(t0) ; \`
			`MOVW.P R(t0), 4(R(w)) ; \`
			`ADD R(t0), R(e), R(e)`

			`// tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]`
			`// w[i&0xf] = tmp<<1 \| tmp>>(32-1)`
			`// e += w[i&0xf]`
			`#define SHUFFLE(e) \`
			`MOVW (-16*4)(R(w)), R(t0) ; \`
			`MOVW (-14*4)(R(w)), R(t1) ; \`
			`MOVW (-8*4)(R(w)), R(t2) ; \`
			`EOR R(t0), R(t1), R(t0) ; \`
			`MOVW (-3*4)(R(w)), R(t1) ; \`
			`EOR R(t2), R(t0), R(t0) ; \`
			`EOR R(t0), R(t1), R(t0) ; \`
			`MOVW R(t0)@>(32-1), R(t0) ; \`
			`MOVW.P R(t0), 4(R(w)) ; \`
			`ADD R(t0), R(e), R(e)`

			`// t1 = (b & c) \| ((~b) & d)`
			`#define FUNC1(a, b, c, d, e) \`
			`MVN R(b), R(t1) ; \`
			`AND R(b), R(c), R(t0) ; \`
			`AND R(d), R(t1), R(t1) ; \`
			`ORR R(t0), R(t1), R(t1)`

			`// t1 = b ^ c ^ d`
			`#define FUNC2(a, b, c, d, e) \`
			`EOR R(b), R(c), R(t1) ; \`
			`EOR R(d), R(t1), R(t1)`

			`// t1 = (b & c) \| (b & d) \| (c & d) =`
			`// t1 = (b & c) \| ((b \| c) & d)`
			`#define FUNC3(a, b, c, d, e) \`
			`ORR R(b), R(c), R(t0) ; \`
			`AND R(b), R(c), R(t1) ; \`
			`AND R(d), R(t0), R(t0) ; \`
			`ORR R(t0), R(t1), R(t1)`

			`#define FUNC4 FUNC2`

			`// a5 := a<<5 \| a>>(32-5)`
			`// b = b<<30 \| b>>(32-30)`
			`// e = a5 + t1 + e + const`
			`#define MIX(a, b, c, d, e) \`
			`ADD R(t1), R(e), R(e) ; \`
			`MOVW R(b)@>(32-30), R(b) ; \`
			`ADD R(a)@>(32-5), R(e), R(e) ; \`
			`ADD R(const), R(e), R(e)`

			`#define ROUND1(a, b, c, d, e) \`
			`LOAD(e) ; \`
			`FUNC1(a, b, c, d, e) ; \`
			`MIX(a, b, c, d, e)`

			`#define ROUND1x(a, b, c, d, e) \`
			`SHUFFLE(e) ; \`
			`FUNC1(a, b, c, d, e) ; \`
			`MIX(a, b, c, d, e)`

			`#define ROUND2(a, b, c, d, e) \`
			`SHUFFLE(e) ; \`
			`FUNC2(a, b, c, d, e) ; \`
			`MIX(a, b, c, d, e)`

			`#define ROUND3(a, b, c, d, e) \`
			`SHUFFLE(e) ; \`
			`FUNC3(a, b, c, d, e) ; \`
			`MIX(a, b, c, d, e)`

			`#define ROUND4(a, b, c, d, e) \`
			`SHUFFLE(e) ; \`
			`FUNC4(a, b, c, d, e) ; \`
			`MIX(a, b, c, d, e)`


			`// func block(dig *digest, p []byte)`
			`TEXT ·block(SB), 0, $352-16`
			`MOVW p+4(FP), R(data) // pointer to the data`
			`MOVW p_len+8(FP), R(t0) // number of bytes`
			`ADD R(data), R(t0)`
			`MOVW R(t0), p_end(SP) // pointer to end of data`

			`// Load up initial SHA1 accumulator`
			`MOVW dig+0(FP), R(t0)`
			`MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)]`

			`loop:`
			`// Save registers at SP+4 onwards`
			`MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13)`

			`MOVW $w_buf(SP), R(w)`
			`MOVW $0x5A827999, R(const)`
			`MOVW $3, R(ctr)`
			`loop1: ROUND1(a, b, c, d, e)`
			`ROUND1(e, a, b, c, d)`
			`ROUND1(d, e, a, b, c)`
			`ROUND1(c, d, e, a, b)`
			`ROUND1(b, c, d, e, a)`
			`SUB.S $1, R(ctr)`
			`BNE loop1`

			`ROUND1(a, b, c, d, e)`
			`ROUND1x(e, a, b, c, d)`
			`ROUND1x(d, e, a, b, c)`
			`ROUND1x(c, d, e, a, b)`
			`ROUND1x(b, c, d, e, a)`

			`MOVW $0x6ED9EBA1, R(const)`
			`MOVW $4, R(ctr)`
			`loop2: ROUND2(a, b, c, d, e)`
			`ROUND2(e, a, b, c, d)`
			`ROUND2(d, e, a, b, c)`
			`ROUND2(c, d, e, a, b)`
			`ROUND2(b, c, d, e, a)`
			`SUB.S $1, R(ctr)`
			`BNE loop2`

			`MOVW $0x8F1BBCDC, R(const)`
			`MOVW $4, R(ctr)`
			`loop3: ROUND3(a, b, c, d, e)`
			`ROUND3(e, a, b, c, d)`
			`ROUND3(d, e, a, b, c)`
			`ROUND3(c, d, e, a, b)`
			`ROUND3(b, c, d, e, a)`
			`SUB.S $1, R(ctr)`
			`BNE loop3`

			`MOVW $0xCA62C1D6, R(const)`
			`MOVW $4, R(ctr)`
			`loop4: ROUND4(a, b, c, d, e)`
			`ROUND4(e, a, b, c, d)`
			`ROUND4(d, e, a, b, c)`
			`ROUND4(c, d, e, a, b)`
			`ROUND4(b, c, d, e, a)`
			`SUB.S $1, R(ctr)`
			`BNE loop4`

			`// Accumulate - restoring registers from SP+4`
			`MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)]`
			`ADD R(t0), R(a)`
			`ADD R(t1), R(b)`
			`ADD R(t2), R(c)`
			`ADD R(ctr), R(d)`
			`ADD R(w), R(e)`

			`MOVW p_end(SP), R(t0)`
			`CMP R(t0), R(data)`
			`BLO loop`

			`// Save final SHA1 accumulator`
			`MOVW dig+0(FP), R(t0)`
			`MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0))`

			`RET`