From 705c0382e8f8437558a099a92a1d48674a252ddb Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Tue, 26 Oct 2010 21:10:17 -0700 Subject: [PATCH] big: arm assembly, faster software mulWW, divWW Reduces time spent running crypto/rsa test by 65%. Fixes #1227. R=gri, PeterGo CC=golang-dev https://golang.org/cl/2743041 --- src/pkg/big/arith.go | 232 +++++++------------------------ src/pkg/big/arith_arm.s | 286 ++++++++++++++++++++++++++++++++++++-- src/pkg/big/arith_test.go | 1 + 3 files changed, 328 insertions(+), 191 deletions(-) diff --git a/src/pkg/big/arith.go b/src/pkg/big/arith.go index 29966c7bc57..df3808f5e4a 100644 --- a/src/pkg/big/arith.go +++ b/src/pkg/big/arith.go @@ -56,161 +56,29 @@ func subWW_g(x, y, c Word) (z1, z0 Word) { // z1<<_W + z0 = x*y +// Adapted from Warren, Hacker's Delight, p. 132. func mulWW_g(x, y Word) (z1, z0 Word) { - // Split x and y into 2 halfWords each, multiply - // the halfWords separately while avoiding overflow, - // and return the product as 2 Words. - - if x < y { - x, y = y, x - } - - if x < _B2 { - // y < _B2 because y <= x - // sub-digits of x and y are (0, x) and (0, y) - // z = z[0] = x*y - z0 = x * y - return - } - - if y < _B2 { - // sub-digits of x and y are (x1, x0) and (0, y) - // x = (x1*_B2 + x0) - // y = (y1*_B2 + y0) - x1, x0 := x>>_W2, x&_M2 - - // x*y = t2*_B2*_B2 + t1*_B2 + t0 - t0 := x0 * y - t1 := x1 * y - - // compute result digits but avoid overflow - // z = z[1]*_B + z[0] = x*y - z0 = t1<<_W2 + t0 - z1 = (t1 + t0>>_W2) >> _W2 - return - } - - // general case - // sub-digits of x and y are (x1, x0) and (y1, y0) - // x = (x1*_B2 + x0) - // y = (y1*_B2 + y0) - x1, x0 := x>>_W2, x&_M2 - y1, y0 := y>>_W2, y&_M2 - - // x*y = t2*_B2*_B2 + t1*_B2 + t0 - t0 := x0 * y0 - // t1 := x1*y0 + x0*y1; - var c Word - t1 := x1 * y0 - t1a := t1 - t1 += x0 * y1 - if t1 < t1a { - c++ - } - t2 := x1*y1 + c*_B2 - - // compute result digits but avoid overflow - // z = z[1]*_B + z[0] = x*y - // This may overflow, but that's ok because we also sum t1 and t0 above - // and we take care of the overflow there. - z0 = t1<<_W2 + t0 - - // z1 = t2 + (t1 + t0>>_W2)>>_W2; - var c3 Word - z1 = t1 + t0>>_W2 - if z1 < t1 { - c3++ - } - z1 >>= _W2 - z1 += c3 * _B2 - z1 += t2 + x0 := x & _M2 + x1 := x >> _W2 + y0 := y & _M2 + y1 := y >> _W2 + w0 := x0 * y0 + t := x1*y0 + w0>>_W2 + w1 := t & _M2 + w2 := t >> _W2 + w1 += x0 * y1 + z1 = x1*y1 + w2 + w1>>_W2 + z0 = x * y return } // z1<<_W + z0 = x*y + c func mulAddWWW_g(x, y, c Word) (z1, z0 Word) { - // Split x and y into 2 halfWords each, multiply - // the halfWords separately while avoiding overflow, - // and return the product as 2 Words. - - // TODO(gri) Should implement special cases for faster execution. - - // general case - // sub-digits of x, y, and c are (x1, x0), (y1, y0), (c1, c0) - // x = (x1*_B2 + x0) - // y = (y1*_B2 + y0) - x1, x0 := x>>_W2, x&_M2 - y1, y0 := y>>_W2, y&_M2 - c1, c0 := c>>_W2, c&_M2 - - // x*y + c = t2*_B2*_B2 + t1*_B2 + t0 - // (1<<32-1)^2 == 1<<64 - 1<<33 + 1, so there's space to add c0 in here. - t0 := x0*y0 + c0 - - // t1 := x1*y0 + x0*y1 + c1; - var c2 Word // extra carry - t1 := x1*y0 + c1 - t1a := t1 - t1 += x0 * y1 - if t1 < t1a { // If the number got smaller then we overflowed. - c2++ + z1, zz0 := mulWW(x, y) + if z0 = zz0 + c; z0 < zz0 { + z1++ } - - t2 := x1*y1 + c2*_B2 - - // compute result digits but avoid overflow - // z = z[1]*_B + z[0] = x*y - // z0 = t1<<_W2 + t0; - // This may overflow, but that's ok because we also sum t1 and t0 below - // and we take care of the overflow there. - z0 = t1<<_W2 + t0 - - var c3 Word - z1 = t1 + t0>>_W2 - if z1 < t1 { - c3++ - } - z1 >>= _W2 - z1 += t2 + c3*_B2 - - return -} - - -// q = (x1<<_W + x0 - r)/y -// The most significant bit of y must be 1. -func divStep(x1, x0, y Word) (q, r Word) { - d1, d0 := y>>_W2, y&_M2 - q1, r1 := x1/d1, x1%d1 - m := q1 * d0 - r1 = r1*_B2 | x0>>_W2 - if r1 < m { - q1-- - r1 += y - if r1 >= y && r1 < m { - q1-- - r1 += y - } - } - r1 -= m - - r0 := r1 % d1 - q0 := r1 / d1 - m = q0 * d0 - r0 = r0*_B2 | x0&_M2 - if r0 < m { - q0-- - r0 += y - if r0 >= y && r0 < m { - q0-- - r0 += y - } - } - r0 -= m - - q = q1*_B2 | q0 - r = r0 return } @@ -241,46 +109,48 @@ func leadingZeros(x Word) uint { } -// q = (x1<<_W + x0 - r)/y -func divWW_g(x1, x0, y Word) (q, r Word) { - if x1 == 0 { - q, r = x0/y, x0%y - return +// q = (u1<<_W + u0 - r)/y +// Adapted from Warren, Hacker's Delight, p. 152. +func divWW_g(u1, u0, v Word) (q, r Word) { + if u1 >= v { + return 1<<_W - 1, 1<<_W - 1 } - var q0, q1 Word - z := leadingZeros(y) - if y > x1 { - if z != 0 { - y <<= z - x1 = (x1 << z) | (x0 >> (_W - z)) - x0 <<= z + s := leadingZeros(v) + v <<= s + + vn1 := v >> _W2 + vn0 := v & _M2 + un32 := u1<>(_W-s) + un10 := u0 << s + un1 := un10 >> _W2 + un0 := un10 & _M2 + q1 := un32 / vn1 + rhat := un32 - q1*vn1 + +again1: + if q1 >= _B2 || q1*vn0 > _B2*rhat+un1 { + q1-- + rhat += vn1 + if rhat < _B2 { + goto again1 } - q0, x0 = divStep(x1, x0, y) - q1 = 0 - } else { - if z == 0 { - x1 -= y - q1 = 1 - } else { - z1 := _W - z - y <<= z - x2 := x1 >> z1 - x1 = (x1 << z) | (x0 >> z1) - x0 <<= z - q1, x1 = divStep(x2, x1, y) + } + + un21 := un32*_B2 + un1 - q1*v + q0 := un21 / vn1 + rhat = un21 - q0*vn1 + +again2: + if q0 >= _B2 || q0*vn0 > _B2*rhat+un0 { + q0-- + rhat += vn1 + if rhat < _B2 { + goto again2 } - - q0, x0 = divStep(x1, x0, y) } - r = x0 >> z - - if q1 != 0 { - panic("div out of range") - } - - return q0, r + return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s } diff --git a/src/pkg/big/arith_arm.s b/src/pkg/big/arith_arm.s index 6ab8e989eda..e4a9a962cff 100644 --- a/src/pkg/big/arith_arm.s +++ b/src/pkg/big/arith_arm.s @@ -5,36 +5,302 @@ // This file provides fast assembly versions for the elementary // arithmetic operations on vectors implemented in arith.go. -// TODO(gri) Implement these routines. +#define CFLAG 29 // bit position of carry flag + +// func addVV(z, x, y []Word) (c Word) TEXT ·addVV(SB),7,$0 - B ·addVV_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + B E1 +L1: + MOVW.P 4(R2), R5 + MOVW.P 4(R3), R6 + MOVW R0, CPSR + ADC.S R6, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E1: + CMP R1, R4 + BNE L1 + MOVW R0>>CFLAG, R0 + AND $1, R0 + MOVW R0, c+36(FP) + RET + + +// func subVV(z, x, y []Word) (c Word) +// (same as addVV except for SBC instead of ADC and label names) TEXT ·subVV(SB),7,$0 - B ·subVV_g(SB) + MOVW $(1<>CFLAG, R0 + AND $1, R0 + EOR $1, R0 + MOVW R0, c+36(FP) + RET + + +// func addVW(z, x []Word, y Word) (c Word) TEXT ·addVW(SB),7,$0 - B ·addVW_g(SB) + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + CMP R1, R4 + BNE L3a + MOVW R3, c+28(FP) + RET +L3a: + MOVW.P 4(R2), R5 + ADD.S R3, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 + B E3 +L3: + MOVW.P 4(R2), R5 + MOVW R0, CPSR + ADC.S $0, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E3: + CMP R1, R4 + BNE L3 + + MOVW R0>>CFLAG, R0 + AND $1, R0 + MOVW R0, c+28(FP) + RET + TEXT ·subVW(SB),7,$0 - B ·subVW_g(SB) + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R4 + MOVW R4<<2, R4 + ADD R1, R4 + CMP R1, R4 + BNE L4a + MOVW R3, c+28(FP) + RET +L4a: + MOVW.P 4(R2), R5 + SUB.S R3, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 + B E4 +L4: + MOVW.P 4(R2), R5 + MOVW R0, CPSR + SBC.S $0, R5 + MOVW.P R5, 4(R1) + MOVW CPSR, R0 +E4: + CMP R1, R4 + BNE L4 + MOVW R0>>CFLAG, R0 + AND $1, R0 + EOR $1, R0 + MOVW R0, c+28(FP) + RET + + +// func shlVW(z, x []Word, s Word) (c Word) TEXT ·shlVW(SB),7,$0 - B ·shlVW_g(SB) + MOVW n+4(FP), R5 + CMP $0, R5 + BEQ X7 + + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW R5<<2, R5 + ADD R5, R2 + ADD R1, R5 + MOVW s+24(FP), R3 + CMP $0, R3 // shift 0 is special + BEQ Y7 + ADD $4, R1 // stop one word early + MOVW $32, R4 + SUB R3, R4 + MOVW $0, R7 + + MOVW.W -4(R2), R6 + MOVW R6<>R4, R6 + MOVW R6, c+28(FP) + B E7 + +L7: + MOVW.W -4(R2), R6 + ORR R6>>R4, R7 + MOVW.W R7, -4(R5) + MOVW R6<>R3, R7 + MOVW R6<>R3, R7 +E6: + CMP R1, R5 + BNE L6 + + MOVW R7, 0(R1) + RET + +Y6: // copy loop, because shift 0 == shift 32 + MOVW.P 4(R2), R6 + MOVW.P R6, 4(R1) + CMP R1, R5 + BNE Y6 + +X6: + MOVW $0, R1 + MOVW R1, c+28(FP) + RET + TEXT ·mulAddVWW(SB),7,$0 - B ·mulAddVWW_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW r+28(FP), R4 + MOVW n+4(FP), R5 + MOVW R5<<2, R5 + ADD R1, R5 + B E8 + + // word loop +L8: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E8: + CMP R1, R5 + BNE L8 + + MOVW R4, c+32(FP) + RET + TEXT ·addMulVVW(SB),7,$0 - B ·addMulVVW_g(SB) + MOVW $0, R0 + MOVW z+0(FP), R1 + MOVW x+12(FP), R2 + MOVW y+24(FP), R3 + MOVW n+4(FP), R5 + MOVW R5<<2, R5 + ADD R1, R5 + MOVW $0, R4 + B E9 + + // word loop +L9: + MOVW.P 4(R2), R6 + MULLU R6, R3, (R7, R6) + ADD.S R4, R6 + ADC R0, R7 + MOVW 0(R1), R4 + ADD.S R4, R6 + ADC R0, R7 + MOVW.P R6, 4(R1) + MOVW R7, R4 +E9: + CMP R1, R5 + BNE L9 + + MOVW R4, c+28(FP) + RET + TEXT ·divWVW(SB),7,$0 + // ARM has no multiword division, so use portable code. B ·divWVW_g(SB) + TEXT ·divWW(SB),7,$0 + // ARM has no multiword division, so use portable code. B ·divWW_g(SB) + +// func mulWW(x, y Word) (z1, z0 Word) TEXT ·mulWW(SB),7,$0 - B ·mulWW_g(SB) + MOVW x+0(FP), R1 + MOVW y+4(FP), R2 + MULLU R1, R2, (R4, R3) + MOVW R4, z1+8(FP) + MOVW R3, z0+12(FP) + RET diff --git a/src/pkg/big/arith_test.go b/src/pkg/big/arith_test.go index 5765b89d17a..934b302df03 100644 --- a/src/pkg/big/arith_test.go +++ b/src/pkg/big/arith_test.go @@ -116,6 +116,7 @@ type argVW struct { var sumVW = []argVW{ {}, + {nil, nil, 2, 2}, {nat{0}, nat{0}, 0, 0}, {nat{1}, nat{0}, 1, 0}, {nat{1}, nat{1}, 0, 0},