diff --git a/src/crypto/md5/md5block_arm.s b/src/crypto/md5/md5block_arm.s index 3b26e549b9..82f2198193 100644 --- a/src/crypto/md5/md5block_arm.s +++ b/src/crypto/md5/md5block_arm.s @@ -7,20 +7,20 @@ #include "textflag.h" // Register definitions -table = 0 // Pointer to MD5 constants table -data = 1 // Pointer to data to hash -a = 2 // MD5 accumulator -b = 3 // MD5 accumulator -c = 4 // MD5 accumulator -d = 5 // MD5 accumulator -c0 = 6 // MD5 constant -c1 = 7 // MD5 constant -c2 = 8 // MD5 constant +#define Rtable R0 // Pointer to MD5 constants table +#define Rdata R1 // Pointer to data to hash +#define Ra R2 // MD5 accumulator +#define Rb R3 // MD5 accumulator +#define Rc R4 // MD5 accumulator +#define Rd R5 // MD5 accumulator +#define Rc0 R6 // MD5 constant +#define Rc1 R7 // MD5 constant +#define Rc2 R8 // MD5 constant // r9, r10 are forbidden // r11 is OK provided you check the assembler that no synthetic instructions use it -c3 = 11 // MD5 constant -t0 = 12 // temporary -t1 = 14 // temporary +#define Rc3 R11 // MD5 constant +#define Rt0 R12 // temporary +#define Rt1 R14 // temporary // func block(dig *digest, p []byte) // 0(FP) is *digest @@ -29,198 +29,198 @@ t1 = 14 // temporary //12(FP) is p.cap // // Stack frame -p_end = -4 // -4(SP) pointer to the end of data -p_data = -8 // -8(SP) current data pointer -buf = -8-4*16 //-72(SP) 16 words temporary buffer +#define p_end -4 // -4(SP) pointer to the end of data +#define p_data -8 // -8(SP) current data pointer +#define buf (-8-4*16) //-72(SP) 16 words temporary buffer // 3 words at 4..12(R13) for called routine parameters TEXT ·block(SB), NOSPLIT, $84-16 - MOVW p+4(FP), R(data) // pointer to the data - MOVW p_len+8(FP), R(t0) // number of bytes - ADD R(data), R(t0) - MOVW R(t0), p_end(SP) // pointer to end of data + MOVW p+4(FP), Rdata // pointer to the data + MOVW p_len+8(FP), Rt0 // number of bytes + ADD Rdata, Rt0 + MOVW Rt0, p_end(SP) // pointer to end of data loop: - MOVW R(data), p_data(SP) // Save R(data) - AND.S $3, R(data), R(t0) // TST $3, R(data) not working see issue 5921 + MOVW Rdata, p_data(SP) // Save Rdata + AND.S $3, Rdata, Rt0 // TST $3, Rdata not working see issue 5921 BEQ aligned // aligned detected - skip copy // Copy the unaligned source data into the aligned temporary buffer // memove(to=4(R13), from=8(R13), n=12(R13)) - Corrupts all registers - MOVW $buf(SP), R(table) // to - MOVW $64, R(c0) // n - MOVM.IB [R(table),R(data),R(c0)], (R13) + MOVW $buf(SP), Rtable // to + MOVW $64, Rc0 // n + MOVM.IB [Rtable,Rdata,Rc0], (R13) BL runtime·memmove(SB) // Point to the local aligned copy of the data - MOVW $buf(SP), R(data) + MOVW $buf(SP), Rdata aligned: // Point to the table of constants // A PC relative add would be cheaper than this - MOVW $·table(SB), R(table) + MOVW $·table(SB), Rtable // Load up initial MD5 accumulator - MOVW dig+0(FP), R(c0) - MOVM.IA (R(c0)), [R(a),R(b),R(c),R(d)] + MOVW dig+0(FP), Rc0 + MOVM.IA (Rc0), [Ra,Rb,Rc,Rd] // a += (((c^d)&b)^d) + X[index] + const // a = a<>(32-shift) + b -#define ROUND1(a, b, c, d, index, shift, const) \ - EOR R(c), R(d), R(t0) ; \ - AND R(b), R(t0) ; \ - EOR R(d), R(t0) ; \ - MOVW (index<<2)(R(data)), R(t1) ; \ - ADD R(t1), R(t0) ; \ - ADD R(const), R(t0) ; \ - ADD R(t0), R(a) ; \ - ADD R(a)@>(32-shift), R(b), R(a) ; +#define ROUND1(a, b, c, d, index, shift, Rconst) \ + EOR Rc, Rd, Rt0 ; \ + AND Rb, Rt0 ; \ + EOR Rd, Rt0 ; \ + MOVW (index<<2)(Rdata), Rt1 ; \ + ADD Rt1, Rt0 ; \ + ADD Rconst, Rt0 ; \ + ADD Rt0, Ra ; \ + ADD Ra@>(32-shift), Rb, Ra ; - MOVM.IA.W 
(R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND1(a, b, c, d, 0, 7, c0) - ROUND1(d, a, b, c, 1, 12, c1) - ROUND1(c, d, a, b, 2, 17, c2) - ROUND1(b, c, d, a, 3, 22, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND1(a, b, c, d, 0, 7, Rc0) + ROUND1(d, a, b, c, 1, 12, Rc1) + ROUND1(c, d, a, b, 2, 17, Rc2) + ROUND1(b, c, d, a, 3, 22, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND1(a, b, c, d, 4, 7, c0) - ROUND1(d, a, b, c, 5, 12, c1) - ROUND1(c, d, a, b, 6, 17, c2) - ROUND1(b, c, d, a, 7, 22, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND1(a, b, c, d, 4, 7, Rc0) + ROUND1(d, a, b, c, 5, 12, Rc1) + ROUND1(c, d, a, b, 6, 17, Rc2) + ROUND1(b, c, d, a, 7, 22, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND1(a, b, c, d, 8, 7, c0) - ROUND1(d, a, b, c, 9, 12, c1) - ROUND1(c, d, a, b, 10, 17, c2) - ROUND1(b, c, d, a, 11, 22, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND1(a, b, c, d, 8, 7, Rc0) + ROUND1(d, a, b, c, 9, 12, Rc1) + ROUND1(c, d, a, b, 10, 17, Rc2) + ROUND1(b, c, d, a, 11, 22, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND1(a, b, c, d, 12, 7, c0) - ROUND1(d, a, b, c, 13, 12, c1) - ROUND1(c, d, a, b, 14, 17, c2) - ROUND1(b, c, d, a, 15, 22, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND1(a, b, c, d, 12, 7, Rc0) + ROUND1(d, a, b, c, 13, 12, Rc1) + ROUND1(c, d, a, b, 14, 17, Rc2) + ROUND1(b, c, d, a, 15, 22, Rc3) // a += (((b^c)&d)^c) + X[index] + const // a = a<>(32-shift) + b -#define ROUND2(a, b, c, d, index, shift, const) \ - EOR R(b), R(c), R(t0) ; \ - AND R(d), R(t0) ; \ - EOR R(c), R(t0) ; \ - MOVW (index<<2)(R(data)), R(t1) ; \ - ADD R(t1), R(t0) ; \ - ADD R(const), R(t0) ; \ - ADD R(t0), R(a) ; \ - ADD R(a)@>(32-shift), R(b), R(a) ; +#define ROUND2(Ra, Rb, Rc, Rd, index, shift, Rconst) \ + EOR Rb, Rc, Rt0 ; \ + AND Rd, Rt0 ; \ + EOR Rc, Rt0 ; \ + MOVW (index<<2)(Rdata), Rt1 ; \ + ADD Rt1, Rt0 ; \ + ADD Rconst, Rt0 ; \ + ADD Rt0, Ra ; \ + ADD Ra@>(32-shift), Rb, Ra ; - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND2(a, b, c, d, 1, 5, c0) - ROUND2(d, a, b, c, 6, 9, c1) - ROUND2(c, d, a, b, 11, 14, c2) - ROUND2(b, c, d, a, 0, 20, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND2(Ra, Rb, Rc, Rd, 1, 5, Rc0) + ROUND2(Rd, Ra, Rb, Rc, 6, 9, Rc1) + ROUND2(Rc, Rd, Ra, Rb, 11, 14, Rc2) + ROUND2(Rb, Rc, Rd, Ra, 0, 20, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND2(a, b, c, d, 5, 5, c0) - ROUND2(d, a, b, c, 10, 9, c1) - ROUND2(c, d, a, b, 15, 14, c2) - ROUND2(b, c, d, a, 4, 20, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND2(Ra, Rb, Rc, Rd, 5, 5, Rc0) + ROUND2(Rd, Ra, Rb, Rc, 10, 9, Rc1) + ROUND2(Rc, Rd, Ra, Rb, 15, 14, Rc2) + ROUND2(Rb, Rc, Rd, Ra, 4, 20, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND2(a, b, c, d, 9, 5, c0) - ROUND2(d, a, b, c, 14, 9, c1) - ROUND2(c, d, a, b, 3, 14, c2) - ROUND2(b, c, d, a, 8, 20, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND2(Ra, Rb, Rc, Rd, 9, 5, Rc0) + ROUND2(Rd, Ra, Rb, Rc, 14, 9, Rc1) + ROUND2(Rc, Rd, Ra, Rb, 3, 14, Rc2) + ROUND2(Rb, Rc, Rd, Ra, 8, 20, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND2(a, b, c, d, 13, 5, c0) - ROUND2(d, a, b, c, 2, 9, c1) - ROUND2(c, d, a, b, 7, 14, c2) - ROUND2(b, c, d, a, 12, 20, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND2(Ra, Rb, Rc, Rd, 13, 5, Rc0) + ROUND2(Rd, Ra, Rb, Rc, 2, 9, Rc1) + ROUND2(Rc, Rd, Ra, Rb, 7, 14, Rc2) + ROUND2(Rb, Rc, Rd, Ra, 12, 20, Rc3) // a += (b^c^d) + X[index] + const // a = a<>(32-shift) + b -#define ROUND3(a, b, c, d, index, shift, const) \ - EOR R(b), R(c), 
R(t0) ; \ - EOR R(d), R(t0) ; \ - MOVW (index<<2)(R(data)), R(t1) ; \ - ADD R(t1), R(t0) ; \ - ADD R(const), R(t0) ; \ - ADD R(t0), R(a) ; \ - ADD R(a)@>(32-shift), R(b), R(a) ; +#define ROUND3(Ra, Rb, Rc, Rd, index, shift, Rconst) \ + EOR Rb, Rc, Rt0 ; \ + EOR Rd, Rt0 ; \ + MOVW (index<<2)(Rdata), Rt1 ; \ + ADD Rt1, Rt0 ; \ + ADD Rconst, Rt0 ; \ + ADD Rt0, Ra ; \ + ADD Ra@>(32-shift), Rb, Ra ; - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND3(a, b, c, d, 5, 4, c0) - ROUND3(d, a, b, c, 8, 11, c1) - ROUND3(c, d, a, b, 11, 16, c2) - ROUND3(b, c, d, a, 14, 23, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND3(Ra, Rb, Rc, Rd, 5, 4, Rc0) + ROUND3(Rd, Ra, Rb, Rc, 8, 11, Rc1) + ROUND3(Rc, Rd, Ra, Rb, 11, 16, Rc2) + ROUND3(Rb, Rc, Rd, Ra, 14, 23, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND3(a, b, c, d, 1, 4, c0) - ROUND3(d, a, b, c, 4, 11, c1) - ROUND3(c, d, a, b, 7, 16, c2) - ROUND3(b, c, d, a, 10, 23, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND3(Ra, Rb, Rc, Rd, 1, 4, Rc0) + ROUND3(Rd, Ra, Rb, Rc, 4, 11, Rc1) + ROUND3(Rc, Rd, Ra, Rb, 7, 16, Rc2) + ROUND3(Rb, Rc, Rd, Ra, 10, 23, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND3(a, b, c, d, 13, 4, c0) - ROUND3(d, a, b, c, 0, 11, c1) - ROUND3(c, d, a, b, 3, 16, c2) - ROUND3(b, c, d, a, 6, 23, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND3(Ra, Rb, Rc, Rd, 13, 4, Rc0) + ROUND3(Rd, Ra, Rb, Rc, 0, 11, Rc1) + ROUND3(Rc, Rd, Ra, Rb, 3, 16, Rc2) + ROUND3(Rb, Rc, Rd, Ra, 6, 23, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND3(a, b, c, d, 9, 4, c0) - ROUND3(d, a, b, c, 12, 11, c1) - ROUND3(c, d, a, b, 15, 16, c2) - ROUND3(b, c, d, a, 2, 23, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND3(Ra, Rb, Rc, Rd, 9, 4, Rc0) + ROUND3(Rd, Ra, Rb, Rc, 12, 11, Rc1) + ROUND3(Rc, Rd, Ra, Rb, 15, 16, Rc2) + ROUND3(Rb, Rc, Rd, Ra, 2, 23, Rc3) // a += (c^(b|^d)) + X[index] + const // a = a<>(32-shift) + b -#define ROUND4(a, b, c, d, index, shift, const) \ - MVN R(d), R(t0) ; \ - ORR R(b), R(t0) ; \ - EOR R(c), R(t0) ; \ - MOVW (index<<2)(R(data)), R(t1) ; \ - ADD R(t1), R(t0) ; \ - ADD R(const), R(t0) ; \ - ADD R(t0), R(a) ; \ - ADD R(a)@>(32-shift), R(b), R(a) ; +#define ROUND4(Ra, Rb, Rc, d, index, shift, Rconst) \ + MVN Rd, Rt0 ; \ + ORR Rb, Rt0 ; \ + EOR Rc, Rt0 ; \ + MOVW (index<<2)(Rdata), Rt1 ; \ + ADD Rt1, Rt0 ; \ + ADD Rconst, Rt0 ; \ + ADD Rt0, Ra ; \ + ADD Ra@>(32-shift), Rb, Ra ; - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND4(a, b, c, d, 0, 6, c0) - ROUND4(d, a, b, c, 7, 10, c1) - ROUND4(c, d, a, b, 14, 15, c2) - ROUND4(b, c, d, a, 5, 21, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND4(Ra, Rb, Rc, Rd, 0, 6, Rc0) + ROUND4(Rd, Ra, Rb, Rc, 7, 10, Rc1) + ROUND4(Rc, Rd, Ra, Rb, 14, 15, Rc2) + ROUND4(Rb, Rc, Rd, Ra, 5, 21, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND4(a, b, c, d, 12, 6, c0) - ROUND4(d, a, b, c, 3, 10, c1) - ROUND4(c, d, a, b, 10, 15, c2) - ROUND4(b, c, d, a, 1, 21, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND4(Ra, Rb, Rc, Rd, 12, 6, Rc0) + ROUND4(Rd, Ra, Rb, Rc, 3, 10, Rc1) + ROUND4(Rc, Rd, Ra, Rb, 10, 15, Rc2) + ROUND4(Rb, Rc, Rd, Ra, 1, 21, Rc3) - MOVM.IA.W (R(table)), [R(c0),R(c1),R(c2),R(c3)] - ROUND4(a, b, c, d, 8, 6, c0) - ROUND4(d, a, b, c, 15, 10, c1) - ROUND4(c, d, a, b, 6, 15, c2) - ROUND4(b, c, d, a, 13, 21, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND4(Ra, Rb, Rc, Rd, 8, 6, Rc0) + ROUND4(Rd, Ra, Rb, Rc, 15, 10, Rc1) + ROUND4(Rc, Rd, Ra, Rb, 6, 15, Rc2) + ROUND4(Rb, Rc, Rd, Ra, 13, 21, Rc3) - MOVM.IA.W (R(table)), 
[R(c0),R(c1),R(c2),R(c3)] - ROUND4(a, b, c, d, 4, 6, c0) - ROUND4(d, a, b, c, 11, 10, c1) - ROUND4(c, d, a, b, 2, 15, c2) - ROUND4(b, c, d, a, 9, 21, c3) + MOVM.IA.W (Rtable), [Rc0,Rc1,Rc2,Rc3] + ROUND4(Ra, Rb, Rc, Rd, 4, 6, Rc0) + ROUND4(Rd, Ra, Rb, Rc, 11, 10, Rc1) + ROUND4(Rc, Rd, Ra, Rb, 2, 15, Rc2) + ROUND4(Rb, Rc, Rd, Ra, 9, 21, Rc3) - MOVW dig+0(FP), R(t0) - MOVM.IA (R(t0)), [R(c0),R(c1),R(c2),R(c3)] + MOVW dig+0(FP), Rt0 + MOVM.IA (Rt0), [Rc0,Rc1,Rc2,Rc3] - ADD R(c0), R(a) - ADD R(c1), R(b) - ADD R(c2), R(c) - ADD R(c3), R(d) + ADD Rc0, Ra + ADD Rc1, Rb + ADD Rc2, Rc + ADD Rc3, Rd - MOVM.IA [R(a),R(b),R(c),R(d)], (R(t0)) + MOVM.IA [Ra,Rb,Rc,Rd], (Rt0) - MOVW p_data(SP), R(data) - MOVW p_end(SP), R(t0) - ADD $64, R(data) - CMP R(t0), R(data) + MOVW p_data(SP), Rdata + MOVW p_end(SP), Rt0 + ADD $64, Rdata + CMP Rt0, Rdata BLO loop RET diff --git a/src/crypto/rc4/rc4_arm.s b/src/crypto/rc4/rc4_arm.s index 51be3bf95b..b4b807ad80 100644 --- a/src/crypto/rc4/rc4_arm.s +++ b/src/crypto/rc4/rc4_arm.s @@ -7,56 +7,56 @@ #include "textflag.h" // Registers -dst = 0 -src = 1 -n = 2 -state = 3 -pi = 4 -pj = 5 -i = 6 -j = 7 -k = 8 -t = 11 -t2 = 12 +#define Rdst R0 +#define Rsrc R1 +#define Rn R2 +#define Rstate R3 +#define Rpi R4 +#define Rpj R5 +#define Ri R6 +#define Rj R7 +#define Rk R8 +#define Rt R11 +#define Rt2 R12 // func xorKeyStream(dst, src *byte, n int, state *[256]byte, i, j *uint8) TEXT ·xorKeyStream(SB),NOSPLIT,$0 - MOVW 0(FP), R(dst) - MOVW 4(FP), R(src) - MOVW 8(FP), R(n) - MOVW 12(FP), R(state) - MOVW 16(FP), R(pi) - MOVW 20(FP), R(pj) - MOVBU (R(pi)), R(i) - MOVBU (R(pj)), R(j) - MOVW $0, R(k) + MOVW 0(FP), Rdst + MOVW 4(FP), Rsrc + MOVW 8(FP), Rn + MOVW 12(FP), Rstate + MOVW 16(FP), Rpi + MOVW 20(FP), Rpj + MOVBU (Rpi), Ri + MOVBU (Rpj), Rj + MOVW $0, Rk loop: // i += 1; j += state[i] - ADD $1, R(i) - AND $0xff, R(i) - MOVBU R(i)<<2(R(state)), R(t) - ADD R(t), R(j) - AND $0xff, R(j) + ADD $1, Ri + AND $0xff, Ri + MOVBU Ri<<2(Rstate), Rt + ADD Rt, Rj + AND $0xff, Rj // swap state[i] <-> state[j] - MOVBU R(j)<<2(R(state)), R(t2) - MOVB R(t2), R(i)<<2(R(state)) - MOVB R(t), R(j)<<2(R(state)) + MOVBU Rj<<2(Rstate), Rt2 + MOVB Rt2, Ri<<2(Rstate) + MOVB Rt, Rj<<2(Rstate) // dst[k] = src[k] ^ state[state[i] + state[j]] - ADD R(t2), R(t) - AND $0xff, R(t) - MOVBU R(t)<<2(R(state)), R(t) - MOVBU R(k)<<0(R(src)), R(t2) - EOR R(t), R(t2) - MOVB R(t2), R(k)<<0(R(dst)) + ADD Rt2, Rt + AND $0xff, Rt + MOVBU Rt<<2(Rstate), Rt + MOVBU Rk<<0(Rsrc), Rt2 + EOR Rt, Rt2 + MOVB Rt2, Rk<<0(Rdst) - ADD $1, R(k) - CMP R(k), R(n) + ADD $1, Rk + CMP Rk, Rn BNE loop done: - MOVB R(i), (R(pi)) - MOVB R(j), (R(pj)) + MOVB Ri, (Rpi) + MOVB Rj, (Rpj) RET diff --git a/src/crypto/sha1/sha1block_arm.s b/src/crypto/sha1/sha1block_arm.s index f11f33dc33..2cc0e09914 100644 --- a/src/crypto/sha1/sha1block_arm.s +++ b/src/crypto/sha1/sha1block_arm.s @@ -23,20 +23,20 @@ // the round macros instead of by explicit move instructions. 
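The xorKeyStream loop in rc4_arm.s above is the textbook RC4 PRGA step: advance i, fold state[i] into j, swap state[i] and state[j], and emit src[k] XOR state[state[i]+state[j]]. A minimal Go sketch of that step (rc4PRGA is an illustrative name, and it uses a byte-wide state where the assembly keeps word-wide entries):

	// rc4PRGA is a plain-Go model of the keystream loop in rc4_arm.s.
	// s is the RC4 state and i, j are the cipher indices; the keystream is
	// XORed into dst and the updated indices are returned.
	func rc4PRGA(dst, src []byte, s *[256]byte, i, j uint8) (uint8, uint8) {
		for k := range src {
			i++
			j += s[i]
			s[i], s[j] = s[j], s[i]        // swap state[i] <-> state[j]
			dst[k] = src[k] ^ s[s[i]+s[j]] // dst[k] = src[k] ^ state[state[i]+state[j]]
		}
		return i, j
	}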
// Register definitions -data = 0 // Pointer to incoming data -const = 1 // Current constant for SHA round -a = 2 // SHA1 accumulator -b = 3 // SHA1 accumulator -c = 4 // SHA1 accumulator -d = 5 // SHA1 accumulator -e = 6 // SHA1 accumulator -t0 = 7 // Temporary -t1 = 8 // Temporary +#define Rdata R0 // Pointer to incoming data +#define Rconst R1 // Current constant for SHA round +#define Ra R2 // SHA1 accumulator +#define Rb R3 // SHA1 accumulator +#define Rc R4 // SHA1 accumulator +#define Rd R5 // SHA1 accumulator +#define Re R6 // SHA1 accumulator +#define Rt0 R7 // Temporary +#define Rt1 R8 // Temporary // r9, r10 are forbidden // r11 is OK provided you check the assembler that no synthetic instructions use it -t2 = 11 // Temporary -ctr = 12 // loop counter -w = 14 // point to w buffer +#define Rt2 R11 // Temporary +#define Rctr R12 // loop counter +#define Rw R14 // point to w buffer // func block(dig *digest, p []byte) // 0(FP) is *digest @@ -45,173 +45,173 @@ w = 14 // point to w buffer //12(FP) is p.cap // // Stack frame -p_end = -4 // -4(SP) pointer to the end of data -p_data = p_end - 4 // -8(SP) current data pointer -w_buf = p_data - 4*80 // -328(SP) 80 words temporary buffer w uint32[80] -saved = w_buf - 4*5 // -348(SP) saved sha1 registers a,b,c,d,e - these must be last +#define p_end -4 // -4(SP) pointer to the end of data +#define p_data (p_end - 4) // -8(SP) current data pointer +#define w_buf (p_data - 4*80) // -328(SP) 80 words temporary buffer w uint32[80] +#define saved (w_buf - 4*5) // -348(SP) saved sha1 registers a,b,c,d,e - these must be last // Total size +4 for saved LR is 352 // w[i] = p[j]<<24 | p[j+1]<<16 | p[j+2]<<8 | p[j+3] // e += w[i] -#define LOAD(e) \ - MOVBU 2(R(data)), R(t0) ; \ - MOVBU 3(R(data)), R(t1) ; \ - MOVBU 1(R(data)), R(t2) ; \ - ORR R(t0)<<8, R(t1), R(t0) ; \ - MOVBU.P 4(R(data)), R(t1) ; \ - ORR R(t2)<<16, R(t0), R(t0) ; \ - ORR R(t1)<<24, R(t0), R(t0) ; \ - MOVW.P R(t0), 4(R(w)) ; \ - ADD R(t0), R(e), R(e) +#define LOAD(Re) \ + MOVBU 2(Rdata), Rt0 ; \ + MOVBU 3(Rdata), Rt1 ; \ + MOVBU 1(Rdata), Rt2 ; \ + ORR Rt0<<8, Rt1, Rt0 ; \ + MOVBU.P 4(Rdata), Rt1 ; \ + ORR Rt2<<16, Rt0, Rt0 ; \ + ORR Rt1<<24, Rt0, Rt0 ; \ + MOVW.P Rt0, 4(Rw) ; \ + ADD Rt0, Re, Re // tmp := w[(i-3)&0xf] ^ w[(i-8)&0xf] ^ w[(i-14)&0xf] ^ w[(i)&0xf] // w[i&0xf] = tmp<<1 | tmp>>(32-1) // e += w[i&0xf] -#define SHUFFLE(e) \ - MOVW (-16*4)(R(w)), R(t0) ; \ - MOVW (-14*4)(R(w)), R(t1) ; \ - MOVW (-8*4)(R(w)), R(t2) ; \ - EOR R(t0), R(t1), R(t0) ; \ - MOVW (-3*4)(R(w)), R(t1) ; \ - EOR R(t2), R(t0), R(t0) ; \ - EOR R(t0), R(t1), R(t0) ; \ - MOVW R(t0)@>(32-1), R(t0) ; \ - MOVW.P R(t0), 4(R(w)) ; \ - ADD R(t0), R(e), R(e) +#define SHUFFLE(Re) \ + MOVW (-16*4)(Rw), Rt0 ; \ + MOVW (-14*4)(Rw), Rt1 ; \ + MOVW (-8*4)(Rw), Rt2 ; \ + EOR Rt0, Rt1, Rt0 ; \ + MOVW (-3*4)(Rw), Rt1 ; \ + EOR Rt2, Rt0, Rt0 ; \ + EOR Rt0, Rt1, Rt0 ; \ + MOVW Rt0@>(32-1), Rt0 ; \ + MOVW.P Rt0, 4(Rw) ; \ + ADD Rt0, Re, Re // t1 = (b & c) | ((~b) & d) -#define FUNC1(a, b, c, d, e) \ - MVN R(b), R(t1) ; \ - AND R(b), R(c), R(t0) ; \ - AND R(d), R(t1), R(t1) ; \ - ORR R(t0), R(t1), R(t1) +#define FUNC1(Ra, Rb, Rc, Rd, Re) \ + MVN Rb, Rt1 ; \ + AND Rb, Rc, Rt0 ; \ + AND Rd, Rt1, Rt1 ; \ + ORR Rt0, Rt1, Rt1 // t1 = b ^ c ^ d -#define FUNC2(a, b, c, d, e) \ - EOR R(b), R(c), R(t1) ; \ - EOR R(d), R(t1), R(t1) +#define FUNC2(Ra, Rb, Rc, Rd, Re) \ + EOR Rb, Rc, Rt1 ; \ + EOR Rd, Rt1, Rt1 // t1 = (b & c) | (b & d) | (c & d) = // t1 = (b & c) | ((b | c) & d) -#define FUNC3(a, b, c, d, e) \ - ORR R(b), R(c), R(t0) ; \ 
- AND R(b), R(c), R(t1) ; \ - AND R(d), R(t0), R(t0) ; \ - ORR R(t0), R(t1), R(t1) +#define FUNC3(Ra, Rb, Rc, Rd, Re) \ + ORR Rb, Rc, Rt0 ; \ + AND Rb, Rc, Rt1 ; \ + AND Rd, Rt0, Rt0 ; \ + ORR Rt0, Rt1, Rt1 #define FUNC4 FUNC2 // a5 := a<<5 | a>>(32-5) // b = b<<30 | b>>(32-30) // e = a5 + t1 + e + const -#define MIX(a, b, c, d, e) \ - ADD R(t1), R(e), R(e) ; \ - MOVW R(b)@>(32-30), R(b) ; \ - ADD R(a)@>(32-5), R(e), R(e) ; \ - ADD R(const), R(e), R(e) +#define MIX(Ra, Rb, Rc, Rd, Re) \ + ADD Rt1, Re, Re ; \ + MOVW Rb@>(32-30), Rb ; \ + ADD Ra@>(32-5), Re, Re ; \ + ADD Rconst, Re, Re -#define ROUND1(a, b, c, d, e) \ - LOAD(e) ; \ - FUNC1(a, b, c, d, e) ; \ - MIX(a, b, c, d, e) +#define ROUND1(Ra, Rb, Rc, Rd, Re) \ + LOAD(Re) ; \ + FUNC1(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) -#define ROUND1x(a, b, c, d, e) \ - SHUFFLE(e) ; \ - FUNC1(a, b, c, d, e) ; \ - MIX(a, b, c, d, e) +#define ROUND1x(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC1(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) -#define ROUND2(a, b, c, d, e) \ - SHUFFLE(e) ; \ - FUNC2(a, b, c, d, e) ; \ - MIX(a, b, c, d, e) +#define ROUND2(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC2(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) -#define ROUND3(a, b, c, d, e) \ - SHUFFLE(e) ; \ - FUNC3(a, b, c, d, e) ; \ - MIX(a, b, c, d, e) +#define ROUND3(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC3(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) -#define ROUND4(a, b, c, d, e) \ - SHUFFLE(e) ; \ - FUNC4(a, b, c, d, e) ; \ - MIX(a, b, c, d, e) +#define ROUND4(Ra, Rb, Rc, Rd, Re) \ + SHUFFLE(Re) ; \ + FUNC4(Ra, Rb, Rc, Rd, Re) ; \ + MIX(Ra, Rb, Rc, Rd, Re) // func block(dig *digest, p []byte) TEXT ·block(SB), 0, $352-16 - MOVW p+4(FP), R(data) // pointer to the data - MOVW p_len+8(FP), R(t0) // number of bytes - ADD R(data), R(t0) - MOVW R(t0), p_end(SP) // pointer to end of data + MOVW p+4(FP), Rdata // pointer to the data + MOVW p_len+8(FP), Rt0 // number of bytes + ADD Rdata, Rt0 + MOVW Rt0, p_end(SP) // pointer to end of data // Load up initial SHA1 accumulator - MOVW dig+0(FP), R(t0) - MOVM.IA (R(t0)), [R(a),R(b),R(c),R(d),R(e)] + MOVW dig+0(FP), Rt0 + MOVM.IA (Rt0), [Ra,Rb,Rc,Rd,Re] loop: // Save registers at SP+4 onwards - MOVM.IB [R(a),R(b),R(c),R(d),R(e)], (R13) + MOVM.IB [Ra,Rb,Rc,Rd,Re], (R13) - MOVW $w_buf(SP), R(w) - MOVW $0x5A827999, R(const) - MOVW $3, R(ctr) -loop1: ROUND1(a, b, c, d, e) - ROUND1(e, a, b, c, d) - ROUND1(d, e, a, b, c) - ROUND1(c, d, e, a, b) - ROUND1(b, c, d, e, a) - SUB.S $1, R(ctr) + MOVW $w_buf(SP), Rw + MOVW $0x5A827999, Rconst + MOVW $3, Rctr +loop1: ROUND1(Ra, Rb, Rc, Rd, Re) + ROUND1(Re, Ra, Rb, Rc, Rd) + ROUND1(Rd, Re, Ra, Rb, Rc) + ROUND1(Rc, Rd, Re, Ra, Rb) + ROUND1(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr BNE loop1 - ROUND1(a, b, c, d, e) - ROUND1x(e, a, b, c, d) - ROUND1x(d, e, a, b, c) - ROUND1x(c, d, e, a, b) - ROUND1x(b, c, d, e, a) + ROUND1(Ra, Rb, Rc, Rd, Re) + ROUND1x(Re, Ra, Rb, Rc, Rd) + ROUND1x(Rd, Re, Ra, Rb, Rc) + ROUND1x(Rc, Rd, Re, Ra, Rb) + ROUND1x(Rb, Rc, Rd, Re, Ra) - MOVW $0x6ED9EBA1, R(const) - MOVW $4, R(ctr) -loop2: ROUND2(a, b, c, d, e) - ROUND2(e, a, b, c, d) - ROUND2(d, e, a, b, c) - ROUND2(c, d, e, a, b) - ROUND2(b, c, d, e, a) - SUB.S $1, R(ctr) + MOVW $0x6ED9EBA1, Rconst + MOVW $4, Rctr +loop2: ROUND2(Ra, Rb, Rc, Rd, Re) + ROUND2(Re, Ra, Rb, Rc, Rd) + ROUND2(Rd, Re, Ra, Rb, Rc) + ROUND2(Rc, Rd, Re, Ra, Rb) + ROUND2(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr BNE loop2 - MOVW $0x8F1BBCDC, R(const) - MOVW $4, R(ctr) -loop3: ROUND3(a, b, c, d, e) - ROUND3(e, a, b, c, d) 
- ROUND3(d, e, a, b, c) - ROUND3(c, d, e, a, b) - ROUND3(b, c, d, e, a) - SUB.S $1, R(ctr) + MOVW $0x8F1BBCDC, Rconst + MOVW $4, Rctr +loop3: ROUND3(Ra, Rb, Rc, Rd, Re) + ROUND3(Re, Ra, Rb, Rc, Rd) + ROUND3(Rd, Re, Ra, Rb, Rc) + ROUND3(Rc, Rd, Re, Ra, Rb) + ROUND3(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr BNE loop3 - MOVW $0xCA62C1D6, R(const) - MOVW $4, R(ctr) -loop4: ROUND4(a, b, c, d, e) - ROUND4(e, a, b, c, d) - ROUND4(d, e, a, b, c) - ROUND4(c, d, e, a, b) - ROUND4(b, c, d, e, a) - SUB.S $1, R(ctr) + MOVW $0xCA62C1D6, Rconst + MOVW $4, Rctr +loop4: ROUND4(Ra, Rb, Rc, Rd, Re) + ROUND4(Re, Ra, Rb, Rc, Rd) + ROUND4(Rd, Re, Ra, Rb, Rc) + ROUND4(Rc, Rd, Re, Ra, Rb) + ROUND4(Rb, Rc, Rd, Re, Ra) + SUB.S $1, Rctr BNE loop4 // Accumulate - restoring registers from SP+4 - MOVM.IB (R13), [R(t0),R(t1),R(t2),R(ctr),R(w)] - ADD R(t0), R(a) - ADD R(t1), R(b) - ADD R(t2), R(c) - ADD R(ctr), R(d) - ADD R(w), R(e) + MOVM.IB (R13), [Rt0,Rt1,Rt2,Rctr,Rw] + ADD Rt0, Ra + ADD Rt1, Rb + ADD Rt2, Rc + ADD Rctr, Rd + ADD Rw, Re - MOVW p_end(SP), R(t0) - CMP R(t0), R(data) + MOVW p_end(SP), Rt0 + CMP Rt0, Rdata BLO loop // Save final SHA1 accumulator - MOVW dig+0(FP), R(t0) - MOVM.IA [R(a),R(b),R(c),R(d),R(e)], (R(t0)) + MOVW dig+0(FP), Rt0 + MOVM.IA [Ra,Rb,Rc,Rd,Re], (Rt0) RET diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s index 2efeaaa531..cd81c25d6a 100644 --- a/src/runtime/asm_arm.s +++ b/src/runtime/asm_arm.s @@ -107,7 +107,7 @@ TEXT runtime·asminit(SB),NOSPLIT,$0-0 // save state in Gobuf; setjmp TEXT runtime·gosave(SB),NOSPLIT,$-4-4 MOVW 0(FP), R0 // gobuf - MOVW SP, gobuf_sp(R0) + MOVW R13, gobuf_sp(R0) MOVW LR, gobuf_pc(R0) MOVW g, gobuf_g(R0) MOVW $0, R11 @@ -133,7 +133,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4 // after this point: it must be straight-line code until the // final B instruction. // See large comment in sigprof for more details. - MOVW gobuf_sp(R1), SP // restore SP + MOVW gobuf_sp(R1), R13 // restore SP==R13 MOVW gobuf_lr(R1), LR MOVW gobuf_ret(R1), R0 MOVW gobuf_ctxt(R1), R7 @@ -152,7 +152,7 @@ TEXT runtime·gogo(SB),NOSPLIT,$-4-4 // to keep running g. TEXT runtime·mcall(SB),NOSPLIT,$-4-4 // Save caller state in g->sched. - MOVW SP, (g_sched+gobuf_sp)(g) + MOVW R13, (g_sched+gobuf_sp)(g) MOVW LR, (g_sched+gobuf_pc)(g) MOVW $0, R11 MOVW R11, (g_sched+gobuf_lr)(g) @@ -170,8 +170,8 @@ TEXT runtime·mcall(SB),NOSPLIT,$-4-4 CMP $0, R11 BL.NE runtime·save_g(SB) MOVW fn+0(FP), R0 - MOVW (g_sched+gobuf_sp)(g), SP - SUB $8, SP + MOVW (g_sched+gobuf_sp)(g), R13 + SUB $8, R13 MOVW R1, 4(SP) MOVW R0, R7 MOVW 0(R0), R0 @@ -217,7 +217,7 @@ switch: MOVW $runtime·systemstack_switch(SB), R3 ADD $4, R3, R3 // get past push {lr} MOVW R3, (g_sched+gobuf_pc)(g) - MOVW SP, (g_sched+gobuf_sp)(g) + MOVW R13, (g_sched+gobuf_sp)(g) MOVW LR, (g_sched+gobuf_lr)(g) MOVW g, (g_sched+gobuf_g)(g) @@ -231,7 +231,7 @@ switch: SUB $4, R3, R3 MOVW $runtime·mstart(SB), R4 MOVW R4, 0(R3) - MOVW R3, SP + MOVW R3, R13 // call target function MOVW R0, R7 @@ -242,7 +242,7 @@ switch: MOVW g_m(g), R1 MOVW m_curg(R1), R0 BL setg<>(SB) - MOVW (g_sched+gobuf_sp)(g), SP + MOVW (g_sched+gobuf_sp)(g), R13 MOVW $0, R3 MOVW R3, (g_sched+gobuf_sp)(g) RET @@ -284,21 +284,21 @@ TEXT runtime·morestack(SB),NOSPLIT,$-4-0 // Called from f. // Set g->sched to context in f. MOVW R7, (g_sched+gobuf_ctxt)(g) - MOVW SP, (g_sched+gobuf_sp)(g) + MOVW R13, (g_sched+gobuf_sp)(g) MOVW LR, (g_sched+gobuf_pc)(g) MOVW R3, (g_sched+gobuf_lr)(g) // Called from f. // Set m->morebuf to f's caller. 
MOVW R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC - MOVW SP, (m_morebuf+gobuf_sp)(R8) // f's caller's SP + MOVW R13, (m_morebuf+gobuf_sp)(R8) // f's caller's SP MOVW $4(SP), R3 // f's argument pointer MOVW g, (m_morebuf+gobuf_g)(R8) // Call newstack on m->g0's stack. MOVW m_g0(R8), R0 BL setg<>(SB) - MOVW (g_sched+gobuf_sp)(g), SP + MOVW (g_sched+gobuf_sp)(g), R13 BL runtime·newstack(SB) // Not reached, but make sure the return PC from the call to newstack @@ -362,7 +362,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \ /* copy arguments to stack */ \ MOVW argptr+8(FP), R0; \ MOVW argsize+12(FP), R2; \ - ADD $4, SP, R1; \ + ADD $4, R13, R1; \ CMP $0, R2; \ B.EQ 5(PC); \ MOVBU.P 1(R0), R5; \ @@ -378,7 +378,7 @@ TEXT NAME(SB), WRAPPER, $MAXSIZE-20; \ MOVW argptr+8(FP), R0; \ MOVW argsize+12(FP), R2; \ MOVW retoffset+16(FP), R3; \ - ADD $4, SP, R1; \ + ADD $4, R13, R1; \ ADD R3, R1; \ ADD R3, R0; \ SUB R3, R2; \ @@ -443,8 +443,8 @@ TEXT runtime·jmpdefer(SB),NOSPLIT,$0-8 MOVW 0(SP), LR MOVW $-4(LR), LR // BL deferreturn MOVW fv+0(FP), R7 - MOVW argp+4(FP), SP - MOVW $-4(SP), SP // SP is 4 below argp, due to saved LR + MOVW argp+4(FP), R13 + MOVW $-4(SP), R13 // SP is 4 below argp, due to saved LR MOVW 0(R7), R1 B (R1) diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s index 1824d33b14..8b5fe31c51 100644 --- a/src/runtime/memclr_arm.s +++ b/src/runtime/memclr_arm.s @@ -25,31 +25,31 @@ #include "textflag.h" -TO = 8 -TOE = 11 -N = 12 -TMP = 12 /* N and TMP don't overlap */ +#define TO R8 +#define TOE R11 +#define N R12 +#define TMP R12 /* N and TMP don't overlap */ TEXT runtime·memclr(SB),NOSPLIT,$0-8 - MOVW ptr+0(FP), R(TO) - MOVW n+4(FP), R(N) - MOVW $0, R(0) + MOVW ptr+0(FP), TO + MOVW n+4(FP), N + MOVW $0, R0 - ADD R(N), R(TO), R(TOE) /* to end pointer */ + ADD N, TO, TOE /* to end pointer */ - CMP $4, R(N) /* need at least 4 bytes to copy */ + CMP $4, N /* need at least 4 bytes to copy */ BLT _1tail _4align: /* align on 4 */ - AND.S $3, R(TO), R(TMP) + AND.S $3, TO, TMP BEQ _4aligned - MOVBU.P R(0), 1(R(TO)) /* implicit write back */ + MOVBU.P R0, 1(TO) /* implicit write back */ B _4align _4aligned: - SUB $31, R(TOE), R(TMP) /* do 32-byte chunks if possible */ - CMP R(TMP), R(TO) + SUB $31, TOE, TMP /* do 32-byte chunks if possible */ + CMP TMP, TO BHS _4tail MOVW R0, R1 /* replicate */ @@ -61,26 +61,26 @@ _4aligned: MOVW R0, R7 _f32loop: - CMP R(TMP), R(TO) + CMP TMP, TO BHS _4tail - MOVM.IA.W [R0-R7], (R(TO)) + MOVM.IA.W [R0-R7], (TO) B _f32loop _4tail: - SUB $3, R(TOE), R(TMP) /* do remaining words if possible */ + SUB $3, TOE, TMP /* do remaining words if possible */ _4loop: - CMP R(TMP), R(TO) + CMP TMP, TO BHS _1tail - MOVW.P R(0), 4(R(TO)) /* implicit write back */ + MOVW.P R0, 4(TO) /* implicit write back */ B _4loop _1tail: - CMP R(TO), R(TOE) + CMP TO, TOE BEQ _return - MOVBU.P R(0), 1(R(TO)) /* implicit write back */ + MOVBU.P R0, 1(TO) /* implicit write back */ B _1tail _return: diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s index f187d42678..35f04a84bc 100644 --- a/src/runtime/memmove_arm.s +++ b/src/runtime/memmove_arm.s @@ -26,138 +26,138 @@ #include "textflag.h" // TE or TS are spilled to the stack during bulk register moves. -TS = 0 -TE = 8 +#define TS R0 +#define TE R8 // Warning: the linker will use R11 to synthesize certain instructions. Please // take care and double check with objdump. 
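memclr above and memmove below share the same staging: clear or copy single bytes until the destination is 4-byte aligned, stream 32-byte blocks with MOVM while at least 32 bytes remain, then finish with word and byte tails. A rough Go model of that staging for the clear case (memclr here is an illustrative helper; the actual speedup comes from the MOVM block stores):

	import "unsafe"

	// memclr zeroes b in the same stages as runtime·memclr: bytes to
	// alignment, 32-byte blocks, then word and byte tails.
	func memclr(b []byte) {
		i := 0
		for i < len(b) && uintptr(unsafe.Pointer(&b[i]))&3 != 0 {
			b[i] = 0 // byte at a time until 4-byte aligned
			i++
		}
		for ; i+32 <= len(b); i += 32 { // MOVM.IA.W [R0-R7] in the assembly
			for k := 0; k < 32; k++ {
				b[i+k] = 0
			}
		}
		for ; i+4 <= len(b); i += 4 { // remaining words
			b[i], b[i+1], b[i+2], b[i+3] = 0, 0, 0, 0
		}
		for ; i < len(b); i++ { // remaining bytes
			b[i] = 0
		}
	}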
-FROM = 11 -N = 12 -TMP = 12 /* N and TMP don't overlap */ -TMP1 = 5 +#define FROM R11 +#define N R12 +#define TMP R12 /* N and TMP don't overlap */ +#define TMP1 R5 -RSHIFT = 5 -LSHIFT = 6 -OFFSET = 7 +#define RSHIFT R5 +#define LSHIFT R6 +#define OFFSET R7 -BR0 = 0 /* shared with TS */ -BW0 = 1 -BR1 = 1 -BW1 = 2 -BR2 = 2 -BW2 = 3 -BR3 = 3 -BW3 = 4 +#define BR0 R0 /* shared with TS */ +#define BW0 R1 +#define BR1 R1 +#define BW1 R2 +#define BR2 R2 +#define BW2 R3 +#define BR3 R3 +#define BW3 R4 -FW0 = 1 -FR0 = 2 -FW1 = 2 -FR1 = 3 -FW2 = 3 -FR2 = 4 -FW3 = 4 -FR3 = 8 /* shared with TE */ +#define FW0 R1 +#define FR0 R2 +#define FW1 R2 +#define FR1 R3 +#define FW2 R3 +#define FR2 R4 +#define FW3 R4 +#define FR3 R8 /* shared with TE */ TEXT runtime·memmove(SB), NOSPLIT, $4-12 _memmove: - MOVW to+0(FP), R(TS) - MOVW from+4(FP), R(FROM) - MOVW n+8(FP), R(N) + MOVW to+0(FP), TS + MOVW from+4(FP), FROM + MOVW n+8(FP), N - ADD R(N), R(TS), R(TE) /* to end pointer */ + ADD N, TS, TE /* to end pointer */ - CMP R(FROM), R(TS) + CMP FROM, TS BLS _forward _back: - ADD R(N), R(FROM) /* from end pointer */ - CMP $4, R(N) /* need at least 4 bytes to copy */ + ADD N, FROM /* from end pointer */ + CMP $4, N /* need at least 4 bytes to copy */ BLT _b1tail _b4align: /* align destination on 4 */ - AND.S $3, R(TE), R(TMP) + AND.S $3, TE, TMP BEQ _b4aligned - MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */ - MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */ + MOVBU.W -1(FROM), TMP /* pre-indexed */ + MOVBU.W TMP, -1(TE) /* pre-indexed */ B _b4align _b4aligned: /* is source now aligned? */ - AND.S $3, R(FROM), R(TMP) + AND.S $3, FROM, TMP BNE _bunaligned - ADD $31, R(TS), R(TMP) /* do 32-byte chunks if possible */ - MOVW R(TS), savedts-4(SP) + ADD $31, TS, TMP /* do 32-byte chunks if possible */ + MOVW TS, savedts-4(SP) _b32loop: - CMP R(TMP), R(TE) + CMP TMP, TE BLS _b4tail - MOVM.DB.W (R(FROM)), [R0-R7] - MOVM.DB.W [R0-R7], (R(TE)) + MOVM.DB.W (FROM), [R0-R7] + MOVM.DB.W [R0-R7], (TE) B _b32loop _b4tail: /* do remaining words if possible */ - MOVW savedts-4(SP), R(TS) - ADD $3, R(TS), R(TMP) + MOVW savedts-4(SP), TS + ADD $3, TS, TMP _b4loop: - CMP R(TMP), R(TE) + CMP TMP, TE BLS _b1tail - MOVW.W -4(R(FROM)), R(TMP1) /* pre-indexed */ - MOVW.W R(TMP1), -4(R(TE)) /* pre-indexed */ + MOVW.W -4(FROM), TMP1 /* pre-indexed */ + MOVW.W TMP1, -4(TE) /* pre-indexed */ B _b4loop _b1tail: /* remaining bytes */ - CMP R(TE), R(TS) + CMP TE, TS BEQ _return - MOVBU.W -1(R(FROM)), R(TMP) /* pre-indexed */ - MOVBU.W R(TMP), -1(R(TE)) /* pre-indexed */ + MOVBU.W -1(FROM), TMP /* pre-indexed */ + MOVBU.W TMP, -1(TE) /* pre-indexed */ B _b1tail _forward: - CMP $4, R(N) /* need at least 4 bytes to copy */ + CMP $4, N /* need at least 4 bytes to copy */ BLT _f1tail _f4align: /* align destination on 4 */ - AND.S $3, R(TS), R(TMP) + AND.S $3, TS, TMP BEQ _f4aligned - MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */ - MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */ + MOVBU.P 1(FROM), TMP /* implicit write back */ + MOVBU.P TMP, 1(TS) /* implicit write back */ B _f4align _f4aligned: /* is source now aligned? 
*/ - AND.S $3, R(FROM), R(TMP) + AND.S $3, FROM, TMP BNE _funaligned - SUB $31, R(TE), R(TMP) /* do 32-byte chunks if possible */ - MOVW R(TE), savedte-4(SP) + SUB $31, TE, TMP /* do 32-byte chunks if possible */ + MOVW TE, savedte-4(SP) _f32loop: - CMP R(TMP), R(TS) + CMP TMP, TS BHS _f4tail - MOVM.IA.W (R(FROM)), [R1-R8] - MOVM.IA.W [R1-R8], (R(TS)) + MOVM.IA.W (FROM), [R1-R8] + MOVM.IA.W [R1-R8], (TS) B _f32loop _f4tail: - MOVW savedte-4(SP), R(TE) - SUB $3, R(TE), R(TMP) /* do remaining words if possible */ + MOVW savedte-4(SP), TE + SUB $3, TE, TMP /* do remaining words if possible */ _f4loop: - CMP R(TMP), R(TS) + CMP TMP, TS BHS _f1tail - MOVW.P 4(R(FROM)), R(TMP1) /* implicit write back */ - MOVW.P R(TMP1), 4(R(TS)) /* implicit write back */ + MOVW.P 4(FROM), TMP1 /* implicit write back */ + MOVW.P TMP1, 4(TS) /* implicit write back */ B _f4loop _f1tail: - CMP R(TS), R(TE) + CMP TS, TE BEQ _return - MOVBU.P 1(R(FROM)), R(TMP) /* implicit write back */ - MOVBU.P R(TMP), 1(R(TS)) /* implicit write back */ + MOVBU.P 1(FROM), TMP /* implicit write back */ + MOVBU.P TMP, 1(TS) /* implicit write back */ B _f1tail _return: @@ -165,97 +165,97 @@ _return: RET _bunaligned: - CMP $2, R(TMP) /* is R(TMP) < 2 ? */ + CMP $2, TMP /* is TMP < 2 ? */ - MOVW.LT $8, R(RSHIFT) /* (R(n)<<24)|(R(n-1)>>8) */ - MOVW.LT $24, R(LSHIFT) - MOVW.LT $1, R(OFFSET) + MOVW.LT $8, RSHIFT /* (R(n)<<24)|(R(n-1)>>8) */ + MOVW.LT $24, LSHIFT + MOVW.LT $1, OFFSET - MOVW.EQ $16, R(RSHIFT) /* (R(n)<<16)|(R(n-1)>>16) */ - MOVW.EQ $16, R(LSHIFT) - MOVW.EQ $2, R(OFFSET) + MOVW.EQ $16, RSHIFT /* (R(n)<<16)|(R(n-1)>>16) */ + MOVW.EQ $16, LSHIFT + MOVW.EQ $2, OFFSET - MOVW.GT $24, R(RSHIFT) /* (R(n)<<8)|(R(n-1)>>24) */ - MOVW.GT $8, R(LSHIFT) - MOVW.GT $3, R(OFFSET) + MOVW.GT $24, RSHIFT /* (R(n)<<8)|(R(n-1)>>24) */ + MOVW.GT $8, LSHIFT + MOVW.GT $3, OFFSET - ADD $16, R(TS), R(TMP) /* do 16-byte chunks if possible */ - CMP R(TMP), R(TE) + ADD $16, TS, TMP /* do 16-byte chunks if possible */ + CMP TMP, TE BLS _b1tail - BIC $3, R(FROM) /* align source */ - MOVW R(TS), savedts-4(SP) - MOVW (R(FROM)), R(BR0) /* prime first block register */ + BIC $3, FROM /* align source */ + MOVW TS, savedts-4(SP) + MOVW (FROM), BR0 /* prime first block register */ _bu16loop: - CMP R(TMP), R(TE) + CMP TMP, TE BLS _bu1tail - MOVW R(BR0)<>R(RSHIFT), R(BW3) + MOVW BR0<>RSHIFT, BW3 - MOVW R(BR3)<>R(RSHIFT), R(BW2) + MOVW BR3<>RSHIFT, BW2 - MOVW R(BR2)<>R(RSHIFT), R(BW1) + MOVW BR2<>RSHIFT, BW1 - MOVW R(BR1)<>R(RSHIFT), R(BW0) + MOVW BR1<>RSHIFT, BW0 - MOVM.DB.W [R(BW0)-R(BW3)], (R(TE)) + MOVM.DB.W [BW0-BW3], (TE) B _bu16loop _bu1tail: - MOVW savedts-4(SP), R(TS) - ADD R(OFFSET), R(FROM) + MOVW savedts-4(SP), TS + ADD OFFSET, FROM B _b1tail _funaligned: - CMP $2, R(TMP) + CMP $2, TMP - MOVW.LT $8, R(RSHIFT) /* (R(n+1)<<24)|(R(n)>>8) */ - MOVW.LT $24, R(LSHIFT) - MOVW.LT $3, R(OFFSET) + MOVW.LT $8, RSHIFT /* (R(n+1)<<24)|(R(n)>>8) */ + MOVW.LT $24, LSHIFT + MOVW.LT $3, OFFSET - MOVW.EQ $16, R(RSHIFT) /* (R(n+1)<<16)|(R(n)>>16) */ - MOVW.EQ $16, R(LSHIFT) - MOVW.EQ $2, R(OFFSET) + MOVW.EQ $16, RSHIFT /* (R(n+1)<<16)|(R(n)>>16) */ + MOVW.EQ $16, LSHIFT + MOVW.EQ $2, OFFSET - MOVW.GT $24, R(RSHIFT) /* (R(n+1)<<8)|(R(n)>>24) */ - MOVW.GT $8, R(LSHIFT) - MOVW.GT $1, R(OFFSET) + MOVW.GT $24, RSHIFT /* (R(n+1)<<8)|(R(n)>>24) */ + MOVW.GT $8, LSHIFT + MOVW.GT $1, OFFSET - SUB $16, R(TE), R(TMP) /* do 16-byte chunks if possible */ - CMP R(TMP), R(TS) + SUB $16, TE, TMP /* do 16-byte chunks if possible */ + CMP TMP, TS BHS _f1tail - BIC $3, R(FROM) /* align 
source */ - MOVW R(TE), savedte-4(SP) - MOVW.P 4(R(FROM)), R(FR3) /* prime last block register, implicit write back */ + BIC $3, FROM /* align source */ + MOVW TE, savedte-4(SP) + MOVW.P 4(FROM), FR3 /* prime last block register, implicit write back */ _fu16loop: - CMP R(TMP), R(TS) + CMP TMP, TS BHS _fu1tail - MOVW R(FR3)>>R(RSHIFT), R(FW0) - MOVM.IA.W (R(FROM)), [R(FR0),R(FR1),R(FR2),R(FR3)] - ORR R(FR0)<>RSHIFT, FW0 + MOVM.IA.W (FROM), [FR0,FR1,FR2,FR3] + ORR FR0<>R(RSHIFT), R(FW1) - ORR R(FR1)<>RSHIFT, FW1 + ORR FR1<>R(RSHIFT), R(FW2) - ORR R(FR2)<>RSHIFT, FW2 + ORR FR2<>R(RSHIFT), R(FW3) - ORR R(FR3)<>RSHIFT, FW3 + ORR FR3<(SB),NOSPLIT,$-4 - ADD $1, PC, R4 + ADD $1, R15, R4 // R15 is hardware PC WORD $0xe12fff14 //BX (R4) // enter thumb mode // TODO(minux): only supports little-endian CPUs WORD $0x4770df01 // swi $1; bx lr diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s index bf0c810ad1..b0a9b4fc7d 100644 --- a/src/runtime/sys_linux_arm.s +++ b/src/runtime/sys_linux_arm.s @@ -383,7 +383,7 @@ TEXT runtime·usleep(SB),NOSPLIT,$12 // Use kernel version instead of native armcas in asm_arm.s. // See ../sync/atomic/asm_linux_arm.s for details. TEXT cas<>(SB),NOSPLIT,$0 - MOVW $0xffff0fc0, PC + MOVW $0xffff0fc0, R15 // R15 is hardware PC. TEXT runtime·cas(SB),NOSPLIT,$0 MOVW ptr+0(FP), R2 diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s index 5354bf9115..28f75190ec 100644 --- a/src/runtime/vlop_arm.s +++ b/src/runtime/vlop_arm.s @@ -27,8 +27,6 @@ #include "go_tls.h" #include "textflag.h" -arg=0 - /* replaced use of R10 by R11 because the former can be the data segment base register */ TEXT _mulv(SB), NOSPLIT, $0 @@ -111,70 +109,71 @@ TEXT runtime·_sfloatpanic(SB),NOSPLIT,$-4 // Reference: // Sloss, Andrew et. al; ARM System Developer's Guide: Designing and Optimizing System Software // Morgan Kaufmann; 1 edition (April 8, 2004), ISBN 978-1558608740 -q = 0 // input d, output q -r = 1 // input n, output r -s = 2 // three temporary variables -M = 3 -a = 11 -// Be careful: R(a) == R11 will be used by the linker for synthesized instructions. 
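For context, udiv<> is the software unsigned 32-bit divide that everything in this file funnels through: on entry Rq holds the divisor and Rr the dividend, and on return Rq is the quotient and Rr the remainder; _divu, _modu, _div and _mod below are the wrappers the linker emits calls to. A plain Go statement of that contract (udiv here is only an illustrative model, not the table-plus-Newton-iteration algorithm from Sloss et al. that the assembly implements):

	// udiv models the register contract of udiv<>: for dividend n and
	// divisor d (d != 0) it returns q and r with n == q*d + r and r < d.
	func udiv(n, d uint32) (q, r uint32) {
		return n / d, n % d
	}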
-TEXT udiv<>(SB),NOSPLIT,$-4 - CLZ R(q), R(s) // find normalizing shift - MOVW.S R(q)<-64(SB), R(M) - ADD.NE R(a)>>25, R(M), R(a) // index by most significant 7 bits of divisor - MOVBU.NE (R(a)), R(a) +#define Rq R0 // input d, output q +#define Rr R1 // input n, output r +#define Rs R2 // three temporary variables +#define RM R3 +#define Ra R11 - SUB.S $7, R(s) - RSB $0, R(q), R(M) // M = -q - MOVW.PL R(a)<(SB),NOSPLIT,$-4 + CLZ Rq, Rs // find normalizing shift + MOVW.S Rq<-64(SB), RM + ADD.NE Ra>>25, RM, Ra // index by most significant 7 bits of divisor + MOVBU.NE (Ra), Ra + + SUB.S $7, Rs + RSB $0, Rq, RM // M = -q + MOVW.PL Ra<>32) - TEQ R(M)->1, R(M) // check for d=0 or d=1 + MULAWT Ra, Rq, Rq, Rq // q approx q-(q*q*d>>32) + TEQ RM->1, RM // check for d=0 or d=1 // 2nd Newton iteration - MUL.NE R(M), R(q), R(a) - MOVW.NE $0, R(s) - MULAL.NE R(q), R(a), (R(q),R(s)) + MUL.NE RM, Rq, Ra + MOVW.NE $0, Rs + MULAL.NE Rq, Ra, (Rq,Rs) BEQ udiv_by_0_or_1 // q now accurate enough for a remainder r, 0<=r<3*d - MULLU R(q), R(r), (R(q),R(s)) // q = (r * q) >> 32 - ADD R(M), R(r), R(r) // r = n - d - MULA R(M), R(q), R(r), R(r) // r = n - (q+1)*d + MULLU Rq, Rr, (Rq,Rs) // q = (r * q) >> 32 + ADD RM, Rr, Rr // r = n - d + MULA RM, Rq, Rr, Rr // r = n - (q+1)*d // since 0 <= n-q*d < 3*d; thus -d <= r < 2*d - CMN R(M), R(r) // t = r-d - SUB.CS R(M), R(r), R(r) // if (t<-d || t>=0) r=r+d - ADD.CC $1, R(q) - ADD.PL R(M)<<1, R(r) - ADD.PL $2, R(q) + CMN RM, Rr // t = r-d + SUB.CS RM, Rr, Rr // if (t<-d || t>=0) r=r+d + ADD.CC $1, Rq + ADD.PL RM<<1, Rr + ADD.PL $2, Rq RET udiv_by_large_d: // at this point we know d>=2^(31-6)=2^25 - SUB $4, R(a), R(a) - RSB $0, R(s), R(s) - MOVW R(a)>>R(s), R(q) - MULLU R(q), R(r), (R(q),R(s)) - MULA R(M), R(q), R(r), R(r) + SUB $4, Ra, Ra + RSB $0, Rs, Rs + MOVW Ra>>Rs, Rq + MULLU Rq, Rr, (Rq,Rs) + MULA RM, Rq, Rr, Rr // q now accurate enough for a remainder r, 0<=r<4*d - CMN R(r)>>1, R(M) // if(r/2 >= d) - ADD.CS R(M)<<1, R(r) - ADD.CS $2, R(q) - CMN R(r), R(M) - ADD.CS R(M), R(r) - ADD.CS $1, R(q) + CMN Rr>>1, RM // if(r/2 >= d) + ADD.CS RM<<1, Rr + ADD.CS $2, Rq + CMN Rr, RM + ADD.CS RM, Rr + ADD.CS $1, Rq RET udiv_by_0_or_1: // carry set if d==1, carry clear if d==0 BCC udiv_by_0 - MOVW R(r), R(q) - MOVW $0, R(r) + MOVW Rr, Rq + MOVW $0, Rr RET udiv_by_0: @@ -216,96 +215,96 @@ DATA fast_udiv_tab<>+0x38(SB)/4, $0x85868788 DATA fast_udiv_tab<>+0x3c(SB)/4, $0x81828384 GLOBL fast_udiv_tab<>(SB), RODATA, $64 -// The linker will pass numerator in R(TMP), and it also -// expects the result in R(TMP) -TMP = 11 +// The linker will pass numerator in RTMP, and it also +// expects the result in RTMP +#define RTMP R11 TEXT _divu(SB), NOSPLIT, $16 - MOVW R(q), 4(R13) - MOVW R(r), 8(R13) - MOVW R(s), 12(R13) - MOVW R(M), 16(R13) + MOVW Rq, 4(R13) + MOVW Rr, 8(R13) + MOVW Rs, 12(R13) + MOVW RM, 16(R13) - MOVW R(TMP), R(r) /* numerator */ - MOVW 0(FP), R(q) /* denominator */ + MOVW RTMP, Rr /* numerator */ + MOVW 0(FP), Rq /* denominator */ BL udiv<>(SB) - MOVW R(q), R(TMP) - MOVW 4(R13), R(q) - MOVW 8(R13), R(r) - MOVW 12(R13), R(s) - MOVW 16(R13), R(M) + MOVW Rq, RTMP + MOVW 4(R13), Rq + MOVW 8(R13), Rr + MOVW 12(R13), Rs + MOVW 16(R13), RM RET TEXT _modu(SB), NOSPLIT, $16 - MOVW R(q), 4(R13) - MOVW R(r), 8(R13) - MOVW R(s), 12(R13) - MOVW R(M), 16(R13) + MOVW Rq, 4(R13) + MOVW Rr, 8(R13) + MOVW Rs, 12(R13) + MOVW RM, 16(R13) - MOVW R(TMP), R(r) /* numerator */ - MOVW 0(FP), R(q) /* denominator */ + MOVW RTMP, Rr /* numerator */ + MOVW 0(FP), Rq /* denominator */ BL udiv<>(SB) - 
MOVW R(r), R(TMP) - MOVW 4(R13), R(q) - MOVW 8(R13), R(r) - MOVW 12(R13), R(s) - MOVW 16(R13), R(M) + MOVW Rr, RTMP + MOVW 4(R13), Rq + MOVW 8(R13), Rr + MOVW 12(R13), Rs + MOVW 16(R13), RM RET TEXT _div(SB),NOSPLIT,$16 - MOVW R(q), 4(R13) - MOVW R(r), 8(R13) - MOVW R(s), 12(R13) - MOVW R(M), 16(R13) - MOVW R(TMP), R(r) /* numerator */ - MOVW 0(FP), R(q) /* denominator */ - CMP $0, R(r) + MOVW Rq, 4(R13) + MOVW Rr, 8(R13) + MOVW Rs, 12(R13) + MOVW RM, 16(R13) + MOVW RTMP, Rr /* numerator */ + MOVW 0(FP), Rq /* denominator */ + CMP $0, Rr BGE d1 - RSB $0, R(r), R(r) - CMP $0, R(q) + RSB $0, Rr, Rr + CMP $0, Rq BGE d2 - RSB $0, R(q), R(q) + RSB $0, Rq, Rq d0: BL udiv<>(SB) /* none/both neg */ - MOVW R(q), R(TMP) + MOVW Rq, RTMP B out1 d1: - CMP $0, R(q) + CMP $0, Rq BGE d0 - RSB $0, R(q), R(q) + RSB $0, Rq, Rq d2: BL udiv<>(SB) /* one neg */ - RSB $0, R(q), R(TMP) + RSB $0, Rq, RTMP out1: - MOVW 4(R13), R(q) - MOVW 8(R13), R(r) - MOVW 12(R13), R(s) - MOVW 16(R13), R(M) + MOVW 4(R13), Rq + MOVW 8(R13), Rr + MOVW 12(R13), Rs + MOVW 16(R13), RM RET TEXT _mod(SB),NOSPLIT,$16 - MOVW R(q), 4(R13) - MOVW R(r), 8(R13) - MOVW R(s), 12(R13) - MOVW R(M), 16(R13) - MOVW R(TMP), R(r) /* numerator */ - MOVW 0(FP), R(q) /* denominator */ - CMP $0, R(q) - RSB.LT $0, R(q), R(q) - CMP $0, R(r) + MOVW Rq, 4(R13) + MOVW Rr, 8(R13) + MOVW Rs, 12(R13) + MOVW RM, 16(R13) + MOVW RTMP, Rr /* numerator */ + MOVW 0(FP), Rq /* denominator */ + CMP $0, Rq + RSB.LT $0, Rq, Rq + CMP $0, Rr BGE m1 - RSB $0, R(r), R(r) + RSB $0, Rr, Rr BL udiv<>(SB) /* neg numerator */ - RSB $0, R(r), R(TMP) + RSB $0, Rr, RTMP B out m1: BL udiv<>(SB) /* pos numerator */ - MOVW R(r), R(TMP) + MOVW Rr, RTMP out: - MOVW 4(R13), R(q) - MOVW 8(R13), R(r) - MOVW 12(R13), R(s) - MOVW 16(R13), R(M) + MOVW 4(R13), Rq + MOVW 8(R13), Rr + MOVW 12(R13), Rs + MOVW 16(R13), RM RET // _mul64by32 and _div64by32 not implemented on arm diff --git a/src/sync/atomic/asm_linux_arm.s b/src/sync/atomic/asm_linux_arm.s index b388e4c550..63562388a2 100644 --- a/src/sync/atomic/asm_linux_arm.s +++ b/src/sync/atomic/asm_linux_arm.s @@ -24,7 +24,7 @@ // http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b49c0f24cf6744a3f4fd09289fe7cade349dead5 // TEXT cas<>(SB),NOSPLIT,$0 - MOVW $0xffff0fc0, PC + MOVW $0xffff0fc0, R15 TEXT ·CompareAndSwapInt32(SB),NOSPLIT,$0 B ·CompareAndSwapUint32(SB) @@ -95,7 +95,7 @@ TEXT ·SwapUintptr(SB),NOSPLIT,$0 B ·SwapUint32(SB) TEXT cas64<>(SB),NOSPLIT,$0 - MOVW $0xffff0f60, PC // __kuser_cmpxchg64: Linux-3.1 and above + MOVW $0xffff0f60, R15 // R15 = hardware PC. __kuser_cmpxchg64: Linux-3.1 and above TEXT kernelCAS64<>(SB),NOSPLIT,$0-21 // int (*__kuser_cmpxchg64_t)(const int64_t *oldval, const int64_t *newval, volatile int64_t *ptr); @@ -127,17 +127,17 @@ TEXT setupAndCallCAS64<>(SB),NOSPLIT,$-4-21 CMP $5, R0 MOVW.CS $kernelCAS64<>(SB), R1 MOVW.CS R1, armCAS64(SB) - MOVW.CS R1, PC + MOVW.CS R1, R15 // R15 = hardware PC MOVB runtime·armArch(SB), R0 // LDREXD, STREXD only present on ARMv6K or higher CMP $6, R0 // TODO(minux): how to differentiate ARMv6 with ARMv6K? 
MOVW.CS $·armCompareAndSwapUint64(SB), R1 MOVW.CS R1, armCAS64(SB) - MOVW.CS R1, PC + MOVW.CS R1, R15 // we are out of luck, can only use runtime's emulated 64-bit cas MOVW $·generalCAS64(SB), R1 MOVW R1, armCAS64(SB) - MOVW R1, PC + MOVW R1, R15 TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0 B ·CompareAndSwapUint64(SB) @@ -145,7 +145,7 @@ TEXT ·CompareAndSwapInt64(SB),NOSPLIT,$0 TEXT ·CompareAndSwapUint64(SB),NOSPLIT,$-4-21 MOVW armCAS64(SB), R0 CMP $0, R0 - MOVW.NE R0, PC + MOVW.NE R0, R15 // R15 = hardware PC B setupAndCallCAS64<>(SB) TEXT ·AddInt64(SB),NOSPLIT,$0
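setupAndCallCAS64 above picks a 64-bit compare-and-swap at run time (the kernel __kuser_cmpxchg64 helper, LDREXD/STREXD, or the runtime's emulated CAS) and caches it in armCAS64; the 64-bit add and swap paths are then built as CAS retry loops on top of it. A Go sketch of that retry pattern using the package's exported API (addUint64 is an illustrative name):

	import "sync/atomic"

	// addUint64 shows the compare-and-swap retry loop behind a 64-bit
	// atomic add: reload, compute, and retry until the CAS succeeds.
	func addUint64(addr *uint64, delta uint64) uint64 {
		for {
			old := atomic.LoadUint64(addr)
			if atomic.CompareAndSwapUint64(addr, old, old+delta) {
				return old + delta
			}
		}
	}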