// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// ARM version of sha1block.go

#include "textflag.h"

// SHA1 block routine. See sha1block.go for Go equivalent.
//
// There are 80 rounds of 4 types:
//   - rounds 0-15 are type 1 and load data (ROUND1 macro).
//   - rounds 16-19 are type 1 and do not load data (ROUND1x macro).
//   - rounds 20-39 are type 2 and do not load data (ROUND2 macro).
//   - rounds 40-59 are type 3 and do not load data (ROUND3 macro).
//   - rounds 60-79 are type 4 and do not load data (ROUND4 macro).
//
// Each round loads or shuffles the data, then computes a per-round
// function of b, c, d, and then mixes the result into and rotates the
// five registers a, b, c, d, e holding the intermediate results.
//
// The register rotation is implemented by rotating the arguments to
// the round macros instead of by explicit move instructions.

// Register definitions
#define Rdata	R0	// Pointer to incoming data
#define Rconst	R1	// Current constant for SHA round
#define Ra	R2	// SHA1 accumulator
#define Rb	R3	// SHA1 accumulator
#define Rc	R4	// SHA1 accumulator
#define Rd	R5	// SHA1 accumulator
#define Re	R6	// SHA1 accumulator
#define Rt0	R7	// Temporary
#define Rt1	R8	// Temporary
// r9, r10 are forbidden
// r11 is OK provided you check the assembler that no synthetic instructions use it
#define Rt2	R11	// Temporary
#define Rctr	R12	// loop counter
#define Rw	R14	// point to w buffer

// func block(dig *digest, p []byte)
// 0(FP) is *digest
// 4(FP) is p.array (struct Slice)
// 8(FP) is p.len
//12(FP) is p.cap
//
// Stack frame
//
// Each macro below is a complete operand of the form name-offset(SP).
// SP here is the assembler's virtual stack pointer (the hardware register
// is written R13); references through it must carry a symbol name, so the
// slots are named rather than addressed as bare offsets.
// p_data and saved are not referenced by name in the code below; they are
// kept to document the frame layout.
#define p_end	end-4(SP)	// pointer to the end of data
#define p_data	data-8(SP)	// current data pointer
#define w_buf	buf-(8+4*80)(SP)	// 80 words temporary buffer w uint32[80]
#define saved	abcde-(8+4*80+4*5)(SP)	// saved sha1 registers a,b,c,d,e - these must be last
// Total size +4 for saved LR is 352

// LOAD assembles one big-endian 32-bit word from four byte loads, appends
// it to the w buffer and adds it into the register passed as Re.
// The final byte load (MOVBU.P) also advances Rdata by 4 and the store
// (MOVW.P) advances Rw by 4, so successive LOADs walk both buffers.
// Clobbers Rt0, Rt1, Rt2.
//
// w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3]
// e += w[i]
#define LOAD(Re) \
	MOVBU	2(Rdata), Rt0 ; \
	MOVBU	3(Rdata), Rt1 ; \
	MOVBU	1(Rdata), Rt2 ; \
	ORR	Rt0<<8, Rt1, Rt0 ; \
	MOVBU.P	4(Rdata), Rt1 ; \
	ORR	Rt2<<16, Rt0, Rt0 ; \
	ORR	Rt1<<24, Rt0, Rt0 ; \
	MOVW.P	Rt0, 4(Rw) ; \
	ADD	Rt0, Re, Re

// SHUFFLE computes the next message-schedule word from four earlier words,
// appends it to the w buffer (advancing Rw by 4) and adds it into the
// register passed as Re. Clobbers Rt0, Rt1, Rt2.
//
// The formula below is written in terms of a 16-word ring, as in the Go
// version. This implementation keeps all 80 words instead and reads the
// inputs at fixed offsets -16, -14, -8 and -3 words from Rw.
// The rotate right by 32-1 is a rotate left by 1.
//
// tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf]
// w[i&0xf] = tmp<<1 | tmp>>(32-1)
// e += w[i&0xf]
#define SHUFFLE(Re) \
	MOVW	(-16*4)(Rw), Rt0 ; \
	MOVW	(-14*4)(Rw), Rt1 ; \
	MOVW	(-8*4)(Rw), Rt2 ; \
	EOR	Rt0, Rt1, Rt0 ; \
	MOVW	(-3*4)(Rw), Rt1 ; \
	EOR	Rt2, Rt0, Rt0 ; \
	EOR	Rt0, Rt1, Rt0 ; \
	MOVW	Rt0@>(32-1), Rt0 ; \
	MOVW.P	Rt0, 4(Rw) ; \
	ADD	Rt0, Re, Re

// The FUNCn macros leave the per-round function of b, c, d in Rt1.
// FUNC1 and FUNC3 also clobber Rt0.

// t1 = (b & c) | ((~b) & d)
#define FUNC1(Ra, Rb, Rc, Rd, Re) \
	MVN	Rb, Rt1 ; \
	AND	Rb, Rc, Rt0 ; \
	AND	Rd, Rt1, Rt1 ; \
	ORR	Rt0, Rt1, Rt1

// t1 = b ^ c ^ d
#define FUNC2(Ra, Rb, Rc, Rd, Re) \
	EOR	Rb, Rc, Rt1 ; \
	EOR	Rd, Rt1, Rt1

// t1 = (b & c) | (b & d) | (c & d) =
// t1 = (b & c) | ((b | c) & d)
#define FUNC3(Ra, Rb, Rc, Rd, Re) \
	ORR	Rb, Rc, Rt0 ; \
	AND	Rb, Rc, Rt1 ; \
	AND	Rd, Rt0, Rt0 ; \
	ORR	Rt0, Rt1, Rt1

// Rounds of type 4 use the same function as rounds of type 2.
#define FUNC4 FUNC2

// MIX folds t1 (in Rt1), a rotated left by 5 and the round constant into e,
// and rotates b left by 30 in place. The rotations are written as rotate
// right by 32-n.
//
// a5 := a<<5 | a>>(32-5)
// b = b<<30 | b>>(32-30)
// e = a5 + t1 + e + const
#define MIX(Ra, Rb, Rc, Rd, Re) \
	ADD	Rt1, Re, Re ; \
	MOVW	Rb@>(32-30), Rb ; \
	ADD	Ra@>(32-5), Re, Re ; \
	ADD	Rconst, Re, Re

// Each ROUND macro is one full round: obtain the message word (LOAD from
// the input, or SHUFFLE from earlier words), compute the round function,
// then MIX the result into the accumulator passed as Re.

#define ROUND1(Ra, Rb, Rc, Rd, Re) \
	LOAD(Re) ; \
	FUNC1(Ra, Rb, Rc, Rd, Re) ; \
	MIX(Ra, Rb, Rc, Rd, Re)

#define ROUND1x(Ra, Rb, Rc, Rd, Re) \
	SHUFFLE(Re) ; \
	FUNC1(Ra, Rb, Rc, Rd, Re) ; \
	MIX(Ra, Rb, Rc, Rd, Re)

#define ROUND2(Ra, Rb, Rc, Rd, Re) \
	SHUFFLE(Re) ; \
	FUNC2(Ra, Rb, Rc, Rd, Re) ; \
	MIX(Ra, Rb, Rc, Rd, Re)

#define ROUND3(Ra, Rb, Rc, Rd, Re) \
	SHUFFLE(Re) ; \
	FUNC3(Ra, Rb, Rc, Rd, Re) ; \
	MIX(Ra, Rb, Rc, Rd, Re)

#define ROUND4(Ra, Rb, Rc, Rd, Re) \
	SHUFFLE(Re) ; \
	FUNC4(Ra, Rb, Rc, Rd, Re) ; \
	MIX(Ra, Rb, Rc, Rd, Re)

// func block(dig *digest, p []byte)
//
// block reads five 32-bit accumulator words from the start of *dig, runs
// the 80 SHA1 rounds over each 64 bytes of p, and writes the five words
// back to *dig.
//
// NOTE(review): the end-of-data check sits at the bottom of the loop, so
// one block is always consumed before len(p) is examined. Callers are
// presumably expected to pass a non-empty multiple of 64 bytes - confirm.
TEXT	·block(SB), 0, $352-16
	MOVW	p_base+4(FP), Rdata	// pointer to the data
	MOVW	p_len+8(FP), Rt0	// number of bytes
	ADD	Rdata, Rt0
	MOVW	Rt0, p_end	// pointer to end of data

	// Load up initial SHA1 accumulator
	MOVW	dig+0(FP), Rt0
	MOVM.IA	(Rt0), [Ra,Rb,Rc,Rd,Re]

loop:
	// Save registers at R13+4 onwards (R13 is the hardware stack pointer)
	MOVM.IB	[Ra,Rb,Rc,Rd,Re], (R13)

	// Rounds 0-15: three passes of five loading rounds, plus one more.
	MOVW	$w_buf, Rw
	MOVW	$0x5A827999, Rconst
	MOVW	$3, Rctr
loop1:
	ROUND1(Ra, Rb, Rc, Rd, Re)
	ROUND1(Re, Ra, Rb, Rc, Rd)
	ROUND1(Rd, Re, Ra, Rb, Rc)
	ROUND1(Rc, Rd, Re, Ra, Rb)
	ROUND1(Rb, Rc, Rd, Re, Ra)
	SUB.S	$1, Rctr
	BNE	loop1

	ROUND1(Ra, Rb, Rc, Rd, Re)
	// Rounds 16-19: same function and constant, but shuffle instead of load.
	ROUND1x(Re, Ra, Rb, Rc, Rd)
	ROUND1x(Rd, Re, Ra, Rb, Rc)
	ROUND1x(Rc, Rd, Re, Ra, Rb)
	ROUND1x(Rb, Rc, Rd, Re, Ra)

	// Rounds 20-39: four passes of five rounds.
	MOVW	$0x6ED9EBA1, Rconst
	MOVW	$4, Rctr
loop2:
	ROUND2(Ra, Rb, Rc, Rd, Re)
	ROUND2(Re, Ra, Rb, Rc, Rd)
	ROUND2(Rd, Re, Ra, Rb, Rc)
	ROUND2(Rc, Rd, Re, Ra, Rb)
	ROUND2(Rb, Rc, Rd, Re, Ra)
	SUB.S	$1, Rctr
	BNE	loop2

	// Rounds 40-59: four passes of five rounds.
	MOVW	$0x8F1BBCDC, Rconst
	MOVW	$4, Rctr
loop3:
	ROUND3(Ra, Rb, Rc, Rd, Re)
	ROUND3(Re, Ra, Rb, Rc, Rd)
	ROUND3(Rd, Re, Ra, Rb, Rc)
	ROUND3(Rc, Rd, Re, Ra, Rb)
	ROUND3(Rb, Rc, Rd, Re, Ra)
	SUB.S	$1, Rctr
	BNE	loop3

	// Rounds 60-79: four passes of five rounds.
	MOVW	$0xCA62C1D6, Rconst
	MOVW	$4, Rctr
loop4:
	ROUND4(Ra, Rb, Rc, Rd, Re)
	ROUND4(Re, Ra, Rb, Rc, Rd)
	ROUND4(Rd, Re, Ra, Rb, Rc)
	ROUND4(Rc, Rd, Re, Ra, Rb)
	ROUND4(Rb, Rc, Rd, Re, Ra)
	SUB.S	$1, Rctr
	BNE	loop4

	// Accumulate - restoring registers from R13+4.
	// The saved values are reloaded into the temporaries, Rctr and Rw,
	// all of which are free at this point, and added to the new state.
	MOVM.IB	(R13), [Rt0,Rt1,Rt2,Rctr,Rw]
	ADD	Rt0, Ra
	ADD	Rt1, Rb
	ADD	Rt2, Rc
	ADD	Rctr, Rd
	ADD	Rw, Re

	// Continue while the data pointer is below the end of the input.
	MOVW	p_end, Rt0
	CMP	Rt0, Rdata
	BLO	loop

	// Save final SHA1 accumulator
	MOVW	dig+0(FP), Rt0
	MOVM.IA	[Ra,Rb,Rc,Rd,Re], (Rt0)

	RET