math/big: use recursive subdivision for significant speedup

This change adds the second aspect to the conversion code, the use of large divisiors (powers of big base) to greatly speed up the divsion of large numbers. Speedups of 30x are common in the large cases. Also includes new tests and tuning code for the key internal parameters. R=gri CC=golang-dev https://golang.org/cl/5438058
2024-11-21 17:04:42 -07:00 · 2011-11-27 11:10:59 -08:00 · 2011-11-27 11:10:59 -08:00 · 4c113ffe16
commit 4c113ffe16
parent d859d7deee
2 changed files with 305 additions and 162 deletions
--- a/src/pkg/math/big/nat.go
+++ b/src/pkg/math/big/nat.go
@ -21,7 +21,9 @@ package big
 import (
 	"errors"
 	"io"
+	"math"
 	"math/rand"
+	"sync"
 )

 // An unsigned integer x of the form
@ -719,17 +721,17 @@ func (x nat) string(charset string) string {

 	// special cases
 	switch {
-	case b < 2 || b > 256:
+	case b < 2 || MaxBase < b:
 		panic("illegal base")
 	case len(x) == 0:
 		return string(charset[0])
 	}

 	// allocate buffer for conversion
-	i := x.bitLen()/log2(b) + 1 // +1: round up
+	i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
 	s := make([]byte, i)

-	// special case: power of two bases can avoid divisions completely
+	// convert power of two and non power of two bases separately
 	if b == b&-b {
 		// shift is base-b digit size in bits
 		shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
@ -771,65 +773,209 @@ func (x nat) string(charset string) string {
 			w >>= shift
 			nbits -= shift
 		}
+	} else {
+		// determine "big base" as in 10^19 for 19 decimal digits in a 64 bit Word
+		bb := Word(1) // big base is b**ndigits
+		ndigits := 0  // number of base b digits
+		for max := Word(_M / b); bb <= max; bb *= b {
+			ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
+		}

-		return string(s[i:])
+		// construct table of successive squares of bb*leafSize to use in subdivisions
+		table := divisors(len(x), b, ndigits, bb)
+
+		// preserve x, create local copy for use in divisions
+		q := nat(nil).set(x)
+
+		// convert q to string s in base b with index of MSD indicated by return value
+		i = q.convertWords(0, i, s, charset, b, ndigits, bb, table)
 	}

-	// general case: extract groups of digits by multiprecision division
+	return string(s[i:])
+}

-	// maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
-	bb := Word(1)
-	ndigits := 0
-	for max := Word(_M / b); bb <= max; bb *= b {
-		ndigits++
-	}
+// Convert words of q to base b digits in s directly using iterated nat/Word divison to extract
+// low-order Words and indirectly by recursive subdivision and nat/nat division by tabulated 
+// divisors. 
+//
+// The direct method processes n Words by n divW() calls, each of which visits every Word in the 
+// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s. 
+// Indirect conversion divides q by its approximate square root, yielding two parts, each half 
+// the size of q. Using the direct method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s plus 
+// the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and is 
+// made better by splitting the subblocks recursively. Best is to split blocks until one more 
+// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the 
+// direct approach. This threshold is represented by leafSize. Benchmarking of leafSize in the 
+// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and 
+// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for 
+// specfic hardware.
+//
+// lo and hi index character array s. conversion starts with the LSD at hi and moves down toward
+// the MSD, which will be at s[0] or s[1]. lo == 0 signals span includes the most significant word.
+//
+func (q nat) convertWords(lo, hi int, s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) int {
+	// indirect conversion: split larger blocks to reduce quadratic expense of iterated nat/W division
+	if leafSize > 0 && len(q) > leafSize && table != nil {
+		var r nat
+		index := len(table) - 1
+		for len(q) > leafSize {
+			// find divisor close to sqrt(q) if possible, but in any case < q
+			maxLength := q.bitLen()     // ~= log2 q, or at of least largest possible q of this bit length
+			minLength := maxLength >> 1 // ~= log2 sqrt(q)
+			for index > 0 && table[index-1].nbits > minLength {
+				index-- // desired
+			}
+			if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
+				index--
+				if index < 0 {
+					panic("internal inconsistency")
+				}
+			}

-	// preserve x, create local copy for use in repeated divisions
-	q := nat(nil).set(x)
+			// split q into the two digit number (q'*bbb + r) to form independent subblocks
+			q, r = q.div(r, q, table[index].bbb)
+
+			// convert subblocks and collect results in s[lo:partition] and s[partition:hi]
+			partition := hi - table[index].ndigits
+			r.convertWords(partition, hi, s, charset, b, ndigits, bb, table[0:index])
+			hi = partition // i.e., q.convertWords(lo, partition, s, charset, b, ndigits, bb, table[0:index+1])
+		}
+	} // having split any large blocks now process the remaining small block
+
+	// direct conversion: process smaller blocks monolithically to avoid overhead of nat/nat division
 	var r Word
-
-	// convert
-	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
+	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x (allows mod as mul vs div)
 		for len(q) > 0 {
 			// extract least significant, base bb "digit"
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
-			if len(q) == 0 {
+			q, r = q.divW(q, bb)
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
-					s[i] = charset[r%10]
-					r /= 10
+					hi--
+					t := r / 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+					r = t
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
-					i--
-					s[i] = charset[r%10]
-					r /= 10
+				for j := 0; j < ndigits && hi > lo; j++ {
+					hi--
+					t := r / 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+					r = t
 				}
 			}
 		}
 	} else {
 		for len(q) > 0 {
 			// extract least significant group of digits
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
-			if len(q) == 0 {
+			q, r = q.divW(q, bb)
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
-					s[i] = charset[r%b]
-					r /= b
+					hi--
+					s[hi] = charset[r%b]
+					r = r / b
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
-					i--
-					s[i] = charset[r%b]
-					r /= b
+				for j := 0; j < ndigits && hi > lo; j++ {
+					hi--
+					s[hi] = charset[r%b]
+					r = r / b
 				}
 			}
 		}
 	}

-	return string(s[i:])
+	// prepend high-order zeroes when q has been normalized to a short number of Words.
+	// however, do not prepend zeroes when converting the most dignificant digits.
+	if lo != 0 { // if not MSD
+		zero := charset[0]
+		for hi > lo { // while need more leading zeroes
+			hi--
+			s[hi] = zero
+		}
+	}
+
+	// return index of most significant output digit in s[] (stored in lowest index)
+	return hi
+}
+
+// Split blocks greater than leafSize Words (or set to 0 to disable indirect conversion)
+// Benchmark and configure leafSize using: gotest -test.bench="Leaf"
+//   8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
+//   8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
+var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
+
+type divisor struct {
+	bbb     nat // divisor
+	nbits   int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
+	ndigits int // digit length of divisor in terms of output base digits
+}
+
+const maxCache = 64               // maximum number of divisors in a single table
+var cacheBase10 [maxCache]divisor // cached divisors for base 10
+var cacheLock sync.Mutex          // defense against concurrent table extensions
+
+// construct table of powers of bb*leafSize to use in subdivisions
+func divisors(m int, b Word, ndigits int, bb Word) []divisor {
+	// only build table when indirect conversion is enabled and x is large
+	if leafSize == 0 || m <= leafSize {
+		return nil
+	}
+
+	// determine k where (bb**leafSize)**(2**k) >= sqrt(x)
+	k := 1
+	for words := leafSize; words < m>>1 && k < maxCache; words <<= 1 {
+		k++
+	}
+
+	// create new table of divisors or extend and reuse existing table as appropriate
+	var cached bool
+	var table []divisor
+	switch b {
+	case 10:
+		table = cacheBase10[0:k] // reuse old table for this conversion
+		cached = true
+	default:
+		table = make([]divisor, k) // new table for this conversion
+	}
+
+	// extend table
+	if table[k-1].ndigits == 0 {
+		if cached {
+			cacheLock.Lock() // begin critical section
+		}
+
+		var i int
+		var larger nat
+		for i < k && table[i].ndigits != 0 { // skip existing entries
+			i++
+		}
+		for ; i < k; i++ { // add new entries
+			if i == 0 {
+				table[i].bbb = nat(nil).expWW(bb, Word(leafSize))
+				table[i].ndigits = ndigits * leafSize
+			} else {
+				table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
+				table[i].ndigits = 2 * table[i-1].ndigits
+			}
+
+			// optimization: exploit aggregated extra bits in macro blocks
+			larger = nat(nil).set(table[i].bbb)
+			for mulAddVWW(larger, larger, b, 0) == 0 {
+				table[i].bbb = table[i].bbb.set(larger)
+				table[i].ndigits++
+			}
+
+			table[i].nbits = table[i].bbb.bitLen()
+		}
+
+		if cached {
+			cacheLock.Unlock() // end critical section
+		}
+	}
+
+	return table
 }

 const deBruijn32 = 0x077CB531
@ -1140,7 +1286,12 @@ func (z nat) expNN(x, y, m nat) nat {
 		}
 	}

-	return z
+	return z.norm()
+}
+
+// calculate x**y for Word arguments y and y
+func (z nat) expWW(x, y Word) nat {
+	return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
 }

 // probablyPrime performs reps Miller-Rabin tests to check whether n is prime.
--- a/src/pkg/math/big/nat_test.go
+++ b/src/pkg/math/big/nat_test.go
@ -370,86 +370,34 @@ func BenchmarkScanPi(b *testing.B) {
 	}
 }

-const (
-	// 314**271
-	// base  2: 2249 digits
-	// base  8:  751 digits
-	// base 10:  678 digits
-	// base 16:  563 digits
-	shortBase     = 314
-	shortExponent = 271
+func BenchmarkScan10Base2(b *testing.B)     { ScanHelper(b, 2, 10, 10) }
+func BenchmarkScan100Base2(b *testing.B)    { ScanHelper(b, 2, 10, 100) }
+func BenchmarkScan1000Base2(b *testing.B)   { ScanHelper(b, 2, 10, 1000) }
+func BenchmarkScan10000Base2(b *testing.B)  { ScanHelper(b, 2, 10, 10000) }
+func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }

-	// 3141**2178
-	// base  2: 31577 digits
-	// base  8: 10527 digits
-	// base 10:  9507 digits
-	// base 16:  7895 digits
-	mediumBase     = 3141
-	mediumExponent = 2718
+func BenchmarkScan10Base8(b *testing.B)     { ScanHelper(b, 8, 10, 10) }
+func BenchmarkScan100Base8(b *testing.B)    { ScanHelper(b, 8, 10, 100) }
+func BenchmarkScan1000Base8(b *testing.B)   { ScanHelper(b, 8, 10, 1000) }
+func BenchmarkScan10000Base8(b *testing.B)  { ScanHelper(b, 8, 10, 10000) }
+func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }

-	// 3141**2178
-	// base  2: 406078 digits
-	// base  8: 135360 digits
-	// base 10: 122243 digits
-	// base 16: 101521 digits
-	longBase     = 31415
-	longExponent = 27182
-)
+func BenchmarkScan10Base10(b *testing.B)     { ScanHelper(b, 10, 10, 10) }
+func BenchmarkScan100Base10(b *testing.B)    { ScanHelper(b, 10, 10, 100) }
+func BenchmarkScan1000Base10(b *testing.B)   { ScanHelper(b, 10, 10, 1000) }
+func BenchmarkScan10000Base10(b *testing.B)  { ScanHelper(b, 10, 10, 10000) }
+func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }

-func BenchmarkScanShort2(b *testing.B) {
-	ScanHelper(b, 2, shortBase, shortExponent)
-}
+func BenchmarkScan10Base16(b *testing.B)     { ScanHelper(b, 16, 10, 10) }
+func BenchmarkScan100Base16(b *testing.B)    { ScanHelper(b, 16, 10, 100) }
+func BenchmarkScan1000Base16(b *testing.B)   { ScanHelper(b, 16, 10, 1000) }
+func BenchmarkScan10000Base16(b *testing.B)  { ScanHelper(b, 16, 10, 10000) }
+func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }

-func BenchmarkScanShort8(b *testing.B) {
-	ScanHelper(b, 8, shortBase, shortExponent)
-}
-
-func BenchmarkScanSort10(b *testing.B) {
-	ScanHelper(b, 10, shortBase, shortExponent)
-}
-
-func BenchmarkScanShort16(b *testing.B) {
-	ScanHelper(b, 16, shortBase, shortExponent)
-}
-
-func BenchmarkScanMedium2(b *testing.B) {
-	ScanHelper(b, 2, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium8(b *testing.B) {
-	ScanHelper(b, 8, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium10(b *testing.B) {
-	ScanHelper(b, 10, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium16(b *testing.B) {
-	ScanHelper(b, 16, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanLong2(b *testing.B) {
-	ScanHelper(b, 2, longBase, longExponent)
-}
-
-func BenchmarkScanLong8(b *testing.B) {
-	ScanHelper(b, 8, longBase, longExponent)
-}
-
-func BenchmarkScanLong10(b *testing.B) {
-	ScanHelper(b, 10, longBase, longExponent)
-}
-
-func BenchmarkScanLong16(b *testing.B) {
-	ScanHelper(b, 16, longBase, longExponent)
-}
-
-func ScanHelper(b *testing.B, base int, xv, yv Word) {
+func ScanHelper(b *testing.B, base int, x, y Word) {
 	b.StopTimer()
-	var x, y, z nat
-	x = x.setWord(xv)
-	y = y.setWord(yv)
-	z = z.expNN(x, y, nil)
+	var z nat
+	z = z.expWW(x, y)

 	var s string
 	s = z.string(lowercaseDigits[0:base])
@ -459,68 +407,112 @@ func ScanHelper(b *testing.B, base int, xv, yv Word) {
 	b.StartTimer()

 	for i := 0; i < b.N; i++ {
-		x.scan(strings.NewReader(s), base)
+		z.scan(strings.NewReader(s), base)
 	}
 }

-func BenchmarkStringShort2(b *testing.B) {
-	StringHelper(b, 2, shortBase, shortExponent)
-}
+func BenchmarkString10Base2(b *testing.B)     { StringHelper(b, 2, 10, 10) }
+func BenchmarkString100Base2(b *testing.B)    { StringHelper(b, 2, 10, 100) }
+func BenchmarkString1000Base2(b *testing.B)   { StringHelper(b, 2, 10, 1000) }
+func BenchmarkString10000Base2(b *testing.B)  { StringHelper(b, 2, 10, 10000) }
+func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }

-func BenchmarkStringShort8(b *testing.B) {
-	StringHelper(b, 8, shortBase, shortExponent)
-}
+func BenchmarkString10Base8(b *testing.B)     { StringHelper(b, 8, 10, 10) }
+func BenchmarkString100Base8(b *testing.B)    { StringHelper(b, 8, 10, 100) }
+func BenchmarkString1000Base8(b *testing.B)   { StringHelper(b, 8, 10, 1000) }
+func BenchmarkString10000Base8(b *testing.B)  { StringHelper(b, 8, 10, 10000) }
+func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }

-func BenchmarkStringShort10(b *testing.B) {
-	StringHelper(b, 10, shortBase, shortExponent)
-}
+func BenchmarkString10Base10(b *testing.B)     { StringHelper(b, 10, 10, 10) }
+func BenchmarkString100Base10(b *testing.B)    { StringHelper(b, 10, 10, 100) }
+func BenchmarkString1000Base10(b *testing.B)   { StringHelper(b, 10, 10, 1000) }
+func BenchmarkString10000Base10(b *testing.B)  { StringHelper(b, 10, 10, 10000) }
+func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }

-func BenchmarkStringShort16(b *testing.B) {
-	StringHelper(b, 16, shortBase, shortExponent)
-}
+func BenchmarkString10Base16(b *testing.B)     { StringHelper(b, 16, 10, 10) }
+func BenchmarkString100Base16(b *testing.B)    { StringHelper(b, 16, 10, 100) }
+func BenchmarkString1000Base16(b *testing.B)   { StringHelper(b, 16, 10, 1000) }
+func BenchmarkString10000Base16(b *testing.B)  { StringHelper(b, 16, 10, 10000) }
+func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }

-func BenchmarkStringMedium2(b *testing.B) {
-	StringHelper(b, 2, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium8(b *testing.B) {
-	StringHelper(b, 8, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium10(b *testing.B) {
-	StringHelper(b, 10, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium16(b *testing.B) {
-	StringHelper(b, 16, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringLong2(b *testing.B) {
-	StringHelper(b, 2, longBase, longExponent)
-}
-
-func BenchmarkStringLong8(b *testing.B) {
-	StringHelper(b, 8, longBase, longExponent)
-}
-
-func BenchmarkStringLong10(b *testing.B) {
-	StringHelper(b, 10, longBase, longExponent)
-}
-
-func BenchmarkStringLong16(b *testing.B) {
-	StringHelper(b, 16, longBase, longExponent)
-}
-
-func StringHelper(b *testing.B, base int, xv, yv Word) {
+func StringHelper(b *testing.B, base int, x, y Word) {
 	b.StopTimer()
-	var x, y, z nat
-	x = x.setWord(xv)
-	y = y.setWord(yv)
-	z = z.expNN(x, y, nil)
+	var z nat
+	z = z.expWW(x, y)
+	z.string(lowercaseDigits[0:base]) // warm divisor cache
 	b.StartTimer()

 	for i := 0; i < b.N; i++ {
-		z.string(lowercaseDigits[0:base])
+		_ = z.string(lowercaseDigits[0:base])
+	}
+}
+
+func BenchmarkLeafSize0(b *testing.B)  { LeafSizeHelper(b, 10, 0) } // test without splitting
+func BenchmarkLeafSize1(b *testing.B)  { LeafSizeHelper(b, 10, 1) }
+func BenchmarkLeafSize2(b *testing.B)  { LeafSizeHelper(b, 10, 2) }
+func BenchmarkLeafSize3(b *testing.B)  { LeafSizeHelper(b, 10, 3) }
+func BenchmarkLeafSize4(b *testing.B)  { LeafSizeHelper(b, 10, 4) }
+func BenchmarkLeafSize5(b *testing.B)  { LeafSizeHelper(b, 10, 5) }
+func BenchmarkLeafSize6(b *testing.B)  { LeafSizeHelper(b, 10, 6) }
+func BenchmarkLeafSize7(b *testing.B)  { LeafSizeHelper(b, 10, 7) }
+func BenchmarkLeafSize8(b *testing.B)  { LeafSizeHelper(b, 10, 8) }
+func BenchmarkLeafSize9(b *testing.B)  { LeafSizeHelper(b, 10, 9) }
+func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
+func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
+func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
+func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
+func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
+func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
+func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
+func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths 
+func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
+
+func LeafSizeHelper(b *testing.B, base Word, size int) {
+	b.StopTimer()
+	originalLeafSize := leafSize
+	resetTable(cacheBase10[:])
+	leafSize = size
+	b.StartTimer()
+
+	for d := 1; d <= 10000; d *= 10 {
+		b.StopTimer()
+		var z nat
+		z = z.expWW(base, Word(d))            // build target number
+		_ = z.string(lowercaseDigits[0:base]) // warm divisor cache
+		b.StartTimer()
+
+		for i := 0; i < b.N; i++ {
+			_ = z.string(lowercaseDigits[0:base])
+		}
+	}
+
+	b.StopTimer()
+	resetTable(cacheBase10[:])
+	leafSize = originalLeafSize
+	b.StartTimer()
+}
+
+func resetTable(table []divisor) {
+	if table != nil && table[0].bbb != nil {
+		for i := 0; i < len(table); i++ {
+			table[i].bbb = nil
+			table[i].nbits = 0
+			table[i].ndigits = 0
+		}
+	}
+}
+
+func TestStringPowers(t *testing.T) {
+	var b, p Word
+	for b = 2; b <= 16; b++ {
+		for p = 0; p <= 512; p++ {
+			x := nat(nil).expWW(b, p)
+			xs := x.string(lowercaseDigits[0:b])
+			xs2 := toString(x, lowercaseDigits[0:b])
+			if xs != xs2 {
+				t.Errorf("failed at %d ** %d in base %d: %s != %s", b, p, b, xs, xs2)
+			}
+		}
 	}
 }