math/big: faster "pure Go" addition/subtraction for long vectors

(platforms w/o corresponding assembly kernels) For short vector adds there's some erradic slow-down, but overall these routines have become significantly faster. This only matters for platforms w/o native (assembly) versions of these kernels, so we are not concerned about the minor slow-down for short vectors. This code was already reviewed under Mercurial (golang.org/cl/172810043) but wasn't submitted before the switch to git. Benchmarks run on 2.3GHz Intel Core i7, running OS X 10.9.5, with the respective AddVV and AddVW assembly routines disabled. benchmark old ns/op new ns/op delta BenchmarkAddVV_1 6.59 7.09 +7.59% BenchmarkAddVV_2 10.3 10.1 -1.94% BenchmarkAddVV_3 10.9 12.6 +15.60% BenchmarkAddVV_4 13.9 15.6 +12.23% BenchmarkAddVV_5 16.8 17.3 +2.98% BenchmarkAddVV_1e1 29.5 29.9 +1.36% BenchmarkAddVV_1e2 246 232 -5.69% BenchmarkAddVV_1e3 2374 2185 -7.96% BenchmarkAddVV_1e4 58942 22292 -62.18% BenchmarkAddVV_1e5 668622 225279 -66.31% BenchmarkAddVW_1 6.81 5.58 -18.06% BenchmarkAddVW_2 7.69 6.86 -10.79% BenchmarkAddVW_3 9.56 8.32 -12.97% BenchmarkAddVW_4 12.1 9.53 -21.24% BenchmarkAddVW_5 13.2 10.9 -17.42% BenchmarkAddVW_1e1 23.4 18.0 -23.08% BenchmarkAddVW_1e2 175 141 -19.43% BenchmarkAddVW_1e3 1568 1266 -19.26% BenchmarkAddVW_1e4 15425 12596 -18.34% BenchmarkAddVW_1e5 156737 133539 -14.80% BenchmarkFibo 381678466 132958666 -65.16% benchmark old MB/s new MB/s speedup BenchmarkAddVV_1 9715.25 9028.30 0.93x BenchmarkAddVV_2 12461.72 12622.60 1.01x BenchmarkAddVV_3 17549.64 15243.82 0.87x BenchmarkAddVV_4 18392.54 16398.29 0.89x BenchmarkAddVV_5 18995.23 18496.57 0.97x BenchmarkAddVV_1e1 21708.98 21438.28 0.99x BenchmarkAddVV_1e2 25956.53 27506.88 1.06x BenchmarkAddVV_1e3 26947.93 29286.66 1.09x BenchmarkAddVV_1e4 10857.96 28709.46 2.64x BenchmarkAddVV_1e5 9571.91 28409.21 2.97x BenchmarkAddVW_1 1175.28 1433.98 1.22x BenchmarkAddVW_2 2080.01 2332.54 1.12x BenchmarkAddVW_3 2509.28 2883.97 1.15x BenchmarkAddVW_4 2646.09 3356.83 1.27x BenchmarkAddVW_5 3020.69 3671.07 1.22x BenchmarkAddVW_1e1 3425.76 4441.40 1.30x BenchmarkAddVW_1e2 4553.17 5642.96 1.24x BenchmarkAddVW_1e3 5100.14 6318.72 1.24x BenchmarkAddVW_1e4 5186.15 6350.96 1.22x BenchmarkAddVW_1e5 5104.07 5990.74 1.17x Change-Id: I7a62023b1105248a0e85e5b9819d3fd4266123d4 Reviewed-on: https://go-review.googlesource.com/2480 Reviewed-by: Russ Cox <rsc@golang.org> Reviewed-by: Alan Donovan <adonovan@google.com>
2024-11-19 03:54:42 -07:00 · 2015-01-07 11:11:57 -08:00 · 2015-01-07 11:11:57 -08:00 · 067acd51b0
commit 067acd51b0
parent 80b3ff9f82
3 changed files with 113 additions and 10 deletions
--- a/src/math/big/arith.go
+++ b/src/math/big/arith.go
@ -70,7 +70,7 @@ func mulWW_g(x, y Word) (z1, z0 Word) {

 // z1<<_W + z0 = x*y + c
 func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
-	z1, zz0 := mulWW(x, y)
+	z1, zz0 := mulWW_g(x, y)
 	if z0 = zz0 + c; z0 < zz0 {
 		z1++
 	}
@ -154,21 +154,52 @@ func divWW_g(u1, u0, v Word) (q, r Word) {
 	return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
 }

+// Keep for performance debugging.
+// Using addWW_g is likely slower.
+const use_addWW_g = false
+
+// The resulting carry c is either 0 or 1.
 func addVV_g(z, x, y []Word) (c Word) {
+	if use_addWW_g {
 		for i := range z {
 			c, z[i] = addWW_g(x[i], y[i], c)
 		}
 		return
 	}

+	for i, xi := range x[:len(z)] {
+		yi := y[i]
+		zi := xi + yi + c
+		z[i] = zi
+		// see "Hacker's Delight", section 2-12 (overflow detection)
+		c = (xi&yi | (xi|yi)&^zi) >> (_W - 1)
+	}
+	return
+}
+
+// The resulting carry c is either 0 or 1.
 func subVV_g(z, x, y []Word) (c Word) {
+	if use_addWW_g {
 		for i := range z {
 			c, z[i] = subWW_g(x[i], y[i], c)
 		}
 		return
 	}

+	for i, xi := range x[:len(z)] {
+		yi := y[i]
+		zi := xi - yi - c
+		z[i] = zi
+		// see "Hacker's Delight", section 2-12 (overflow detection)
+		c = (yi&^xi | (yi|^xi)&zi) >> (_W - 1)
+	}
+	return
+}
+
+// Argument y must be either 0 or 1.
+// The resulting carry c is either 0 or 1.
 func addVW_g(z, x []Word, y Word) (c Word) {
+	if use_addWW_g {
 		c = y
 		for i := range z {
 			c, z[i] = addWW_g(x[i], c, 0)
@ -176,7 +207,17 @@ func addVW_g(z, x []Word, y Word) (c Word) {
 		return
 	}

+	c = y
+	for i, xi := range x[:len(z)] {
+		zi := xi + c
+		z[i] = zi
+		c = xi &^ zi >> (_W - 1)
+	}
+	return
+}
+
 func subVW_g(z, x []Word, y Word) (c Word) {
+	if use_addWW_g {
 		c = y
 		for i := range z {
 			c, z[i] = subWW_g(x[i], c, 0)
@ -184,6 +225,15 @@ func subVW_g(z, x []Word, y Word) (c Word) {
 		return
 	}

+	c = y
+	for i, xi := range x[:len(z)] {
+		zi := xi - c
+		z[i] = zi
+		c = (zi &^ xi) >> (_W - 1)
+	}
+	return
+}
+
 func shlVU_g(z, x []Word, s uint) (c Word) {
 	if n := len(z); n > 0 {
 		ŝ := _W - s
@ -222,6 +272,7 @@ func mulAddVWW_g(z, x []Word, y, r Word) (c Word) {
 	return
 }

+// TODO(gri) Remove use of addWW_g here and then we can remove addWW_g and subWW_g.
 func addMulVVW_g(z, x []Word, y Word) (c Word) {
 	for i := range z {
 		z1, z0 := mulAddWWW_g(x[i], y, z[i])
--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@ -254,7 +254,7 @@ func benchmarkFunVW(b *testing.B, f funVW, n int) {
 	x := rndV(n)
 	y := rndW()
 	z := make([]Word, n)
-	b.SetBytes(int64(n * _W))
+	b.SetBytes(int64(n * _S))
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		f(z, x, y)
--- a/src/math/big/nat_test.go
+++ b/src/math/big/nat_test.go
@ -769,3 +769,55 @@ func BenchmarkExp3Power0x10000(b *testing.B)  { ExpHelper(b, 3, 0x10000) }
 func BenchmarkExp3Power0x40000(b *testing.B)  { ExpHelper(b, 3, 0x40000) }
 func BenchmarkExp3Power0x100000(b *testing.B) { ExpHelper(b, 3, 0x100000) }
 func BenchmarkExp3Power0x400000(b *testing.B) { ExpHelper(b, 3, 0x400000) }
+
+func fibo(n int) nat {
+	switch n {
+	case 0:
+		return nil
+	case 1:
+		return nat{1}
+	}
+	f0 := fibo(0)
+	f1 := fibo(1)
+	var f2 nat
+	for i := 1; i < n; i++ {
+		f2 = f2.add(f0, f1)
+		f0, f1, f2 = f1, f2, f0
+	}
+	return f1
+}
+
+var fiboNums = []string{
+	"0",
+	"55",
+	"6765",
+	"832040",
+	"102334155",
+	"12586269025",
+	"1548008755920",
+	"190392490709135",
+	"23416728348467685",
+	"2880067194370816120",
+	"354224848179261915075",
+}
+
+func TestFibo(t *testing.T) {
+	for i, want := range fiboNums {
+		n := i * 10
+		got := fibo(n).decimalString()
+		if got != want {
+			t.Errorf("fibo(%d) failed: got %s want %s", n, got, want)
+		}
+	}
+}
+
+func BenchmarkFibo(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		fibo(1e0)
+		fibo(1e1)
+		fibo(1e2)
+		fibo(1e3)
+		fibo(1e4)
+		fibo(1e5)
+	}
+}