big: arm assembly, faster software mulWW, divWW

Reduces time spent running crypto/rsa test by 65%. Fixes #1227. R=gri, PeterGo CC=golang-dev https://golang.org/cl/2743041
2024-11-25 05:38:01 -07:00 · 2010-10-26 21:10:17 -07:00 · 2010-10-26 21:10:17 -07:00 · 705c0382e8
commit 705c0382e8
parent a8abb64a71
3 changed files with 328 additions and 191 deletions
--- a/src/pkg/big/arith.go
+++ b/src/pkg/big/arith.go
@ -56,161 +56,29 @@ func subWW_g(x, y, c Word) (z1, z0 Word) {
 // z1<<_W + z0 = x*y
 // Adapted from Warren, Hacker's Delight, p. 132.
 func mulWW_g(x, y Word) (z1, z0 Word) {
-	// Split x and y into 2 halfWords each, multiply
+	x0 := x & _M2
-	// the halfWords separately while avoiding overflow,
+	x1 := x >> _W2
-	// and return the product as 2 Words.
+	y0 := y & _M2
-
+	y1 := y >> _W2
-	if x < y {
+	w0 := x0 * y0
-		x, y = y, x
+	t := x1*y0 + w0>>_W2
-	}
+	w1 := t & _M2
-
+	w2 := t >> _W2
-	if x < _B2 {
+	w1 += x0 * y1
-		// y < _B2 because y <= x
+	z1 = x1*y1 + w2 + w1>>_W2
 		// sub-digits of x and y are (0, x) and (0, y)
 		// z = z[0] = x*y
 	z0 = x * y
 	return
 	}
 	if y < _B2 {
 		// sub-digits of x and y are (x1, x0) and (0, y)
 		// x = (x1*_B2 + x0)
 		// y = (y1*_B2 + y0)
 		x1, x0 := x>>_W2, x&_M2
 		// x*y = t2*_B2*_B2 + t1*_B2 + t0
 		t0 := x0 * y
 		t1 := x1 * y
 		// compute result digits but avoid overflow
 		// z = z[1]*_B + z[0] = x*y
 		z0 = t1<<_W2 + t0
 		z1 = (t1 + t0>>_W2) >> _W2
 		return
 	}
 	// general case
 	// sub-digits of x and y are (x1, x0) and (y1, y0)
 	// x = (x1*_B2 + x0)
 	// y = (y1*_B2 + y0)
 	x1, x0 := x>>_W2, x&_M2
 	y1, y0 := y>>_W2, y&_M2
 	// x*y = t2*_B2*_B2 + t1*_B2 + t0
 	t0 := x0 * y0
 	// t1 := x1*y0 + x0*y1;
 	var c Word
 	t1 := x1 * y0
 	t1a := t1
 	t1 += x0 * y1
 	if t1 < t1a {
 		c++
 	}
 	t2 := x1*y1 + c*_B2
 	// compute result digits but avoid overflow
 	// z = z[1]*_B + z[0] = x*y
 	// This may overflow, but that's ok because we also sum t1 and t0 above
 	// and we take care of the overflow there.
 	z0 = t1<<_W2 + t0
 	// z1 = t2 + (t1 + t0>>_W2)>>_W2;
 	var c3 Word
 	z1 = t1 + t0>>_W2
 	if z1 < t1 {
 		c3++
 	}
 	z1 >>= _W2
 	z1 += c3 * _B2
 	z1 += t2
 	return
 }
 // z1<<_W + z0 = x*y + c
 func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
-	// Split x and y into 2 halfWords each, multiply
+	z1, zz0 := mulWW(x, y)
-	// the halfWords separately while avoiding overflow,
+	if z0 = zz0 + c; z0 < zz0 {
-	// and return the product as 2 Words.
+		z1++
 	// TODO(gri) Should implement special cases for faster execution.
 	// general case
 	// sub-digits of x, y, and c are (x1, x0), (y1, y0), (c1, c0)
 	// x = (x1*_B2 + x0)
 	// y = (y1*_B2 + y0)
 	x1, x0 := x>>_W2, x&_M2
 	y1, y0 := y>>_W2, y&_M2
 	c1, c0 := c>>_W2, c&_M2
 	// x*y + c = t2*_B2*_B2 + t1*_B2 + t0
 	// (1<<32-1)^2 == 1<<64 - 1<<33 + 1, so there's space to add c0 in here.
 	t0 := x0*y0 + c0
 	// t1 := x1*y0 + x0*y1 + c1;
 	var c2 Word // extra carry
 	t1 := x1*y0 + c1
 	t1a := t1
 	t1 += x0 * y1
 	if t1 < t1a { // If the number got smaller then we overflowed.
 		c2++
 	}
 	t2 := x1*y1 + c2*_B2
 	// compute result digits but avoid overflow
 	// z = z[1]*_B + z[0] = x*y
 	// z0 = t1<<_W2 + t0;
 	// This may overflow, but that's ok because we also sum t1 and t0 below
 	// and we take care of the overflow there.
 	z0 = t1<<_W2 + t0
 	var c3 Word
 	z1 = t1 + t0>>_W2
 	if z1 < t1 {
 		c3++
 	}
 	z1 >>= _W2
 	z1 += t2 + c3*_B2
 	return
 }
 // q = (x1<<_W + x0 - r)/y
 // The most significant bit of y must be 1.
 func divStep(x1, x0, y Word) (q, r Word) {
 	d1, d0 := y>>_W2, y&_M2
 	q1, r1 := x1/d1, x1%d1
 	m := q1 * d0
 	r1 = r1*_B2 | x0>>_W2
 	if r1 < m {
 		q1--
 		r1 += y
 		if r1 >= y && r1 < m {
 			q1--
 			r1 += y
 		}
 	}
 	r1 -= m
 	r0 := r1 % d1
 	q0 := r1 / d1
 	m = q0 * d0
 	r0 = r0*_B2 | x0&_M2
 	if r0 < m {
 		q0--
 		r0 += y
 		if r0 >= y && r0 < m {
 			q0--
 			r0 += y
 		}
 	}
 	r0 -= m
 	q = q1*_B2 | q0
 	r = r0
 	return
 }
@ -241,46 +109,48 @@ func leadingZeros(x Word) uint {
 }
-// q = (x1<<_W + x0 - r)/y
+// q = (u1<<_W + u0 - r)/y
-func divWW_g(x1, x0, y Word) (q, r Word) {
+// Adapted from Warren, Hacker's Delight, p. 152.
-	if x1 == 0 {
+func divWW_g(u1, u0, v Word) (q, r Word) {
-		q, r = x0/y, x0%y
+	if u1 >= v {
-		return
+		return 1<<_W - 1, 1<<_W - 1
 	}
-	var q0, q1 Word
+	s := leadingZeros(v)
-	z := leadingZeros(y)
+	v <<= s
-	if y > x1 {
+
-		if z != 0 {
+	vn1 := v >> _W2
-			y <<= z
+	vn0 := v & _M2
-			x1 = (x1 << z) | (x0 >> (_W - z))
+	un32 := u1<<s | u0>>(_W-s)
-			x0 <<= z
+	un10 := u0 << s
 	un1 := un10 >> _W2
 	un0 := un10 & _M2
 	q1 := un32 / vn1
 	rhat := un32 - q1*vn1
 again1:
 	if q1 >= _B2 || q1*vn0 > _B2*rhat+un1 {
 		q1--
 		rhat += vn1
 		if rhat < _B2 {
 			goto again1
 		}
 		q0, x0 = divStep(x1, x0, y)
 		q1 = 0
 	} else {
 		if z == 0 {
 			x1 -= y
 			q1 = 1
 		} else {
 			z1 := _W - z
 			y <<= z
 			x2 := x1 >> z1
 			x1 = (x1 << z) | (x0 >> z1)
 			x0 <<= z
 			q1, x1 = divStep(x2, x1, y)
 	}
-		q0, x0 = divStep(x1, x0, y)
+	un21 := un32*_B2 + un1 - q1*v
 	q0 := un21 / vn1
 	rhat = un21 - q0*vn1
 again2:
 	if q0 >= _B2 || q0*vn0 > _B2*rhat+un0 {
 		q0--
 		rhat += vn1
 		if rhat < _B2 {
 			goto again2
 		}
 	}
-	r = x0 >> z
+	return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
 	if q1 != 0 {
 		panic("div out of range")
 	}
 	return q0, r
 }
--- a/src/pkg/big/arith_arm.s
+++ b/src/pkg/big/arith_arm.s
@ -5,36 +5,302 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
-// TODO(gri) Implement these routines.
+#define CFLAG 29	// bit position of carry flag
 // func addVV(z, x, y []Word) (c Word)
 TEXT ·addVV(SB),7,$0
-	B ·addVV_g(SB)
+	MOVW	$0, R0
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	n+4(FP), R4
 	MOVW	R4<<2, R4
 	ADD	R1, R4
 	B E1
 L1:
 	MOVW.P	4(R2), R5
 	MOVW.P	4(R3), R6
 	MOVW	R0, CPSR
 	ADC.S	R6, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 E1:
 	CMP	R1, R4
 	BNE L1
 	MOVW	R0>>CFLAG, R0
 	AND	$1, R0
 	MOVW	R0, c+36(FP)
 	RET
 // func subVV(z, x, y []Word) (c Word)
 // (same as addVV except for SBC instead of ADC and label names)
 TEXT ·subVV(SB),7,$0
-	B ·subVV_g(SB)
+	MOVW	$(1<<CFLAG), R0
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	n+4(FP), R4
 	MOVW	R4<<2, R4
 	ADD	R1, R4
 	B E2
 L2:
 	MOVW.P	4(R2), R5
 	MOVW.P	4(R3), R6
 	MOVW	R0, CPSR
 	SBC.S	R6, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 E2:
 	CMP	R1, R4
 	BNE L2
 	MOVW	R0>>CFLAG, R0
 	AND	$1, R0
 	EOR	$1, R0
 	MOVW	R0, c+36(FP)
 	RET
 // func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB),7,$0
-	B ·addVW_g(SB)
+	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	n+4(FP), R4
 	MOVW	R4<<2, R4
 	ADD	R1, R4
 	CMP	R1, R4
 	BNE L3a
 	MOVW	R3, c+28(FP)
 	RET
 L3a:
 	MOVW.P	4(R2), R5
 	ADD.S	R3, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 	B	E3
 L3:
 	MOVW.P	4(R2), R5
 	MOVW	R0, CPSR
 	ADC.S	$0, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 E3:
 	CMP	R1, R4
 	BNE	L3
 	MOVW	R0>>CFLAG, R0
 	AND	$1, R0
 	MOVW	R0, c+28(FP)
 	RET
 TEXT ·subVW(SB),7,$0
-	B ·subVW_g(SB)
+	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	n+4(FP), R4
 	MOVW	R4<<2, R4
 	ADD	R1, R4
 	CMP	R1, R4
 	BNE L4a
 	MOVW	R3, c+28(FP)
 	RET
 L4a:
 	MOVW.P	4(R2), R5
 	SUB.S	R3, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 	B	E4
 L4:
 	MOVW.P	4(R2), R5
 	MOVW	R0, CPSR
 	SBC.S	$0, R5
 	MOVW.P	R5, 4(R1)
 	MOVW	CPSR, R0
 E4:
 	CMP	R1, R4
 	BNE	L4
 	MOVW	R0>>CFLAG, R0
 	AND	$1, R0
 	EOR	$1, R0
 	MOVW	R0, c+28(FP)
 	RET
 // func shlVW(z, x []Word, s Word) (c Word)
 TEXT ·shlVW(SB),7,$0
-	B ·shlVW_g(SB)
+	MOVW	n+4(FP), R5
 	CMP	$0, R5
 	BEQ	X7
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	R5<<2, R5
 	ADD	R5, R2
 	ADD	R1, R5
 	MOVW	s+24(FP), R3
 	CMP	$0, R3	// shift 0 is special
 	BEQ	Y7
 	ADD	$4, R1	// stop one word early
 	MOVW	$32, R4
 	SUB	R3, R4
 	MOVW	$0, R7
 	MOVW.W	-4(R2), R6
 	MOVW	R6<<R3, R7
 	MOVW	R6>>R4, R6
 	MOVW	R6, c+28(FP)
 	B E7
 L7:
 	MOVW.W	-4(R2), R6
 	ORR	R6>>R4, R7
 	MOVW.W	R7, -4(R5)
 	MOVW	R6<<R3, R7
 E7:
 	CMP	R1, R5
 	BNE	L7
 	MOVW	R7, -4(R5)
 	RET
 Y7:	// copy loop, because shift 0 == shift 32
 	MOVW.W	-4(R2), R6
 	MOVW.W	R6, -4(R5)
 	CMP	R1, R5
 	BNE Y7
 X7:
 	MOVW	$0, R1
 	MOVW	R1, c+28(FP)
 	RET
 TEXT ·shrVW(SB),7,$0
-	B ·shrVW_g(SB)
+	MOVW	n+4(FP), R5
 	CMP	$0, R5
 	BEQ	X6
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	R5<<2, R5
 	ADD	R1, R5
 	MOVW	s+24(FP), R3
 	CMP	$0, R3	// shift 0 is special
 	BEQ Y6
 	SUB	$4, R5	// stop one word early
 	MOVW	$32, R4
 	SUB	R3, R4
 	MOVW	$0, R7
 	// first word
 	MOVW.P	4(R2), R6
 	MOVW	R6>>R3, R7
 	MOVW	R6<<R4, R6
 	MOVW	R6, c+28(FP)
 	B E6
 	// word loop
 L6:
 	MOVW.P	4(R2), R6
 	ORR	R6<<R4, R7
 	MOVW.P	R7, 4(R1)
 	MOVW	R6>>R3, R7
 E6:
 	CMP	R1, R5
 	BNE	L6
 	MOVW	R7, 0(R1)
 	RET
 Y6:	// copy loop, because shift 0 == shift 32
 	MOVW.P	4(R2), R6
 	MOVW.P	R6, 4(R1)
 	CMP R1, R5
 	BNE Y6
 X6:
 	MOVW	$0, R1
 	MOVW	R1, c+28(FP)
 	RET
 TEXT ·mulAddVWW(SB),7,$0
-	B ·mulAddVWW_g(SB)
+	MOVW	$0, R0
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	r+28(FP), R4
 	MOVW	n+4(FP), R5
 	MOVW	R5<<2, R5
 	ADD	R1, R5
 	B E8
 	// word loop
 L8:
 	MOVW.P	4(R2), R6
 	MULLU	R6, R3, (R7, R6)
 	ADD.S	R4, R6
 	ADC	R0, R7
 	MOVW.P	R6, 4(R1)
 	MOVW	R7, R4
 E8:
 	CMP	R1, R5
 	BNE	L8
 	MOVW	R4, c+32(FP)
 	RET
 TEXT ·addMulVVW(SB),7,$0
-	B ·addMulVVW_g(SB)
+	MOVW	$0, R0
 	MOVW	z+0(FP), R1
 	MOVW	x+12(FP), R2
 	MOVW	y+24(FP), R3
 	MOVW	n+4(FP), R5
 	MOVW	R5<<2, R5
 	ADD	R1, R5
 	MOVW	$0, R4
 	B E9
 	// word loop
 L9:
 	MOVW.P	4(R2), R6
 	MULLU	R6, R3, (R7, R6)
 	ADD.S	R4, R6
 	ADC	R0, R7
 	MOVW	0(R1), R4
 	ADD.S	R4, R6
 	ADC	R0, R7
 	MOVW.P	R6, 4(R1)
 	MOVW	R7, R4
 E9:
 	CMP	R1, R5
 	BNE	L9
 	MOVW	R4, c+28(FP)
 	RET
 TEXT ·divWVW(SB),7,$0
 	// ARM has no multiword division, so use portable code.
 	B ·divWVW_g(SB)
 TEXT ·divWW(SB),7,$0
 	// ARM has no multiword division, so use portable code.
 	B ·divWW_g(SB)
 // func mulWW(x, y Word) (z1, z0 Word)
 TEXT ·mulWW(SB),7,$0
-	B ·mulWW_g(SB)
+	MOVW	x+0(FP), R1
 	MOVW	y+4(FP), R2
 	MULLU	R1, R2, (R4, R3)
 	MOVW	R4, z1+8(FP)
 	MOVW	R3, z0+12(FP)
 	RET
--- a/src/pkg/big/arith_test.go
+++ b/src/pkg/big/arith_test.go
@ -116,6 +116,7 @@ type argVW struct {
 var sumVW = []argVW{
 	{},
 	{nil, nil, 2, 2},
 	{nat{0}, nat{0}, 0, 0},
 	{nat{1}, nat{0}, 1, 0},
 	{nat{1}, nat{1}, 0, 0},