math/big: use recursive subdivision for significant speedup

This change adds the second aspect to the conversion code, the use of large divisiors (powers of big base) to greatly speed up the divsion of large numbers. Speedups of 30x are common in the large cases. Also includes new tests and tuning code for the key internal parameters. R=gri CC=golang-dev https://golang.org/cl/5438058
2024-11-25 02:17:57 -07:00 · 2011-11-27 11:10:59 -08:00 · 2011-11-27 11:10:59 -08:00 · 4c113ffe16
commit 4c113ffe16
parent d859d7deee
2 changed files with 305 additions and 162 deletions
--- a/src/pkg/math/big/nat.go
+++ b/src/pkg/math/big/nat.go
@ -21,7 +21,9 @@ package big
 import (
 	"errors"
 	"io"
 	"math"
 	"math/rand"
 	"sync"
 )
 // An unsigned integer x of the form
@ -719,17 +721,17 @@ func (x nat) string(charset string) string {
 	// special cases
 	switch {
-	case b < 2 || b > 256:
+	case b < 2 || MaxBase < b:
 		panic("illegal base")
 	case len(x) == 0:
 		return string(charset[0])
 	}
 	// allocate buffer for conversion
-	i := x.bitLen()/log2(b) + 1 // +1: round up
+	i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
 	s := make([]byte, i)
-	// special case: power of two bases can avoid divisions completely
+	// convert power of two and non power of two bases separately
 	if b == b&-b {
 		// shift is base-b digit size in bits
 		shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
@ -771,65 +773,209 @@ func (x nat) string(charset string) string {
 			w >>= shift
 			nbits -= shift
 		}
 	} else {
 		// determine "big base" as in 10^19 for 19 decimal digits in a 64 bit Word
 		bb := Word(1) // big base is b**ndigits
 		ndigits := 0  // number of base b digits
 		for max := Word(_M / b); bb <= max; bb *= b {
 			ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
 		}
 		// construct table of successive squares of bb*leafSize to use in subdivisions
 		table := divisors(len(x), b, ndigits, bb)
 		// preserve x, create local copy for use in divisions
 		q := nat(nil).set(x)
 		// convert q to string s in base b with index of MSD indicated by return value
 		i = q.convertWords(0, i, s, charset, b, ndigits, bb, table)
 	}
 	return string(s[i:])
 }
 // Convert words of q to base b digits in s directly using iterated nat/Word divison to extract
 // low-order Words and indirectly by recursive subdivision and nat/nat division by tabulated 
 // divisors. 
 //
 // The direct method processes n Words by n divW() calls, each of which visits every Word in the 
 // incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s. 
 // Indirect conversion divides q by its approximate square root, yielding two parts, each half 
 // the size of q. Using the direct method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s plus 
 // the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and is 
 // made better by splitting the subblocks recursively. Best is to split blocks until one more 
 // split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the 
 // direct approach. This threshold is represented by leafSize. Benchmarking of leafSize in the 
 // range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and 
 // ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for 
 // specfic hardware.
 //
 // lo and hi index character array s. conversion starts with the LSD at hi and moves down toward
 // the MSD, which will be at s[0] or s[1]. lo == 0 signals span includes the most significant word.
 //
 func (q nat) convertWords(lo, hi int, s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) int {
 	// indirect conversion: split larger blocks to reduce quadratic expense of iterated nat/W division
 	if leafSize > 0 && len(q) > leafSize && table != nil {
 		var r nat
 		index := len(table) - 1
 		for len(q) > leafSize {
 			// find divisor close to sqrt(q) if possible, but in any case < q
 			maxLength := q.bitLen()     // ~= log2 q, or at of least largest possible q of this bit length
 			minLength := maxLength >> 1 // ~= log2 sqrt(q)
 			for index > 0 && table[index-1].nbits > minLength {
 				index-- // desired
 			}
 			if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
 				index--
 				if index < 0 {
 					panic("internal inconsistency")
 				}
 			}
-	// general case: extract groups of digits by multiprecision division
+			// split q into the two digit number (q'*bbb + r) to form independent subblocks
 			q, r = q.div(r, q, table[index].bbb)
-	// maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
+			// convert subblocks and collect results in s[lo:partition] and s[partition:hi]
-	bb := Word(1)
+			partition := hi - table[index].ndigits
-	ndigits := 0
+			r.convertWords(partition, hi, s, charset, b, ndigits, bb, table[0:index])
-	for max := Word(_M / b); bb <= max; bb *= b {
+			hi = partition // i.e., q.convertWords(lo, partition, s, charset, b, ndigits, bb, table[0:index+1])
 		ndigits++
 		}
 	} // having split any large blocks now process the remaining small block
-	// preserve x, create local copy for use in repeated divisions
+	// direct conversion: process smaller blocks monolithically to avoid overhead of nat/nat division
 	q := nat(nil).set(x)
 	var r Word
-
+	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x (allows mod as mul vs div)
 	// convert
 	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
 		for len(q) > 0 {
 			// extract least significant, base bb "digit"
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
+			q, r = q.divW(q, bb)
-			if len(q) == 0 {
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
+					hi--
-					s[i] = charset[r%10]
+					t := r / 10
-					r /= 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
 					r = t
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
+				for j := 0; j < ndigits && hi > lo; j++ {
-					i--
+					hi--
-					s[i] = charset[r%10]
+					t := r / 10
-					r /= 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
 					r = t
 				}
 			}
 		}
 	} else {
 		for len(q) > 0 {
 			// extract least significant group of digits
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
+			q, r = q.divW(q, bb)
-			if len(q) == 0 {
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
+					hi--
-					s[i] = charset[r%b]
+					s[hi] = charset[r%b]
-					r /= b
+					r = r / b
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
+				for j := 0; j < ndigits && hi > lo; j++ {
-					i--
+					hi--
-					s[i] = charset[r%b]
+					s[hi] = charset[r%b]
-					r /= b
+					r = r / b
 				}
 			}
 		}
 	}
-	return string(s[i:])
+	// prepend high-order zeroes when q has been normalized to a short number of Words.
 	// however, do not prepend zeroes when converting the most dignificant digits.
 	if lo != 0 { // if not MSD
 		zero := charset[0]
 		for hi > lo { // while need more leading zeroes
 			hi--
 			s[hi] = zero
 		}
 	}
 	// return index of most significant output digit in s[] (stored in lowest index)
 	return hi
 }
 // Split blocks greater than leafSize Words (or set to 0 to disable indirect conversion)
 // Benchmark and configure leafSize using: gotest -test.bench="Leaf"
 //   8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
 //   8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
 var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
 type divisor struct {
 	bbb     nat // divisor
 	nbits   int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
 	ndigits int // digit length of divisor in terms of output base digits
 }
 const maxCache = 64               // maximum number of divisors in a single table
 var cacheBase10 [maxCache]divisor // cached divisors for base 10
 var cacheLock sync.Mutex          // defense against concurrent table extensions
 // construct table of powers of bb*leafSize to use in subdivisions
 func divisors(m int, b Word, ndigits int, bb Word) []divisor {
 	// only build table when indirect conversion is enabled and x is large
 	if leafSize == 0 || m <= leafSize {
 		return nil
 	}
 	// determine k where (bb**leafSize)**(2**k) >= sqrt(x)
 	k := 1
 	for words := leafSize; words < m>>1 && k < maxCache; words <<= 1 {
 		k++
 	}
 	// create new table of divisors or extend and reuse existing table as appropriate
 	var cached bool
 	var table []divisor
 	switch b {
 	case 10:
 		table = cacheBase10[0:k] // reuse old table for this conversion
 		cached = true
 	default:
 		table = make([]divisor, k) // new table for this conversion
 	}
 	// extend table
 	if table[k-1].ndigits == 0 {
 		if cached {
 			cacheLock.Lock() // begin critical section
 		}
 		var i int
 		var larger nat
 		for i < k && table[i].ndigits != 0 { // skip existing entries
 			i++
 		}
 		for ; i < k; i++ { // add new entries
 			if i == 0 {
 				table[i].bbb = nat(nil).expWW(bb, Word(leafSize))
 				table[i].ndigits = ndigits * leafSize
 			} else {
 				table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
 				table[i].ndigits = 2 * table[i-1].ndigits
 			}
 			// optimization: exploit aggregated extra bits in macro blocks
 			larger = nat(nil).set(table[i].bbb)
 			for mulAddVWW(larger, larger, b, 0) == 0 {
 				table[i].bbb = table[i].bbb.set(larger)
 				table[i].ndigits++
 			}
 			table[i].nbits = table[i].bbb.bitLen()
 		}
 		if cached {
 			cacheLock.Unlock() // end critical section
 		}
 	}
 	return table
 }
 const deBruijn32 = 0x077CB531
@ -1140,7 +1286,12 @@ func (z nat) expNN(x, y, m nat) nat {
 		}
 	}
-	return z
+	return z.norm()
 }
 // calculate x**y for Word arguments y and y
 func (z nat) expWW(x, y Word) nat {
 	return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
 }
 // probablyPrime performs reps Miller-Rabin tests to check whether n is prime.
--- a/src/pkg/math/big/nat_test.go
+++ b/src/pkg/math/big/nat_test.go
@ -370,86 +370,34 @@ func BenchmarkScanPi(b *testing.B) {
 	}
 }
-const (
+func BenchmarkScan10Base2(b *testing.B)     { ScanHelper(b, 2, 10, 10) }
-	// 314**271
+func BenchmarkScan100Base2(b *testing.B)    { ScanHelper(b, 2, 10, 100) }
-	// base  2: 2249 digits
+func BenchmarkScan1000Base2(b *testing.B)   { ScanHelper(b, 2, 10, 1000) }
-	// base  8:  751 digits
+func BenchmarkScan10000Base2(b *testing.B)  { ScanHelper(b, 2, 10, 10000) }
-	// base 10:  678 digits
+func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
 	// base 16:  563 digits
 	shortBase     = 314
 	shortExponent = 271
-	// 3141**2178
+func BenchmarkScan10Base8(b *testing.B)     { ScanHelper(b, 8, 10, 10) }
-	// base  2: 31577 digits
+func BenchmarkScan100Base8(b *testing.B)    { ScanHelper(b, 8, 10, 100) }
-	// base  8: 10527 digits
+func BenchmarkScan1000Base8(b *testing.B)   { ScanHelper(b, 8, 10, 1000) }
-	// base 10:  9507 digits
+func BenchmarkScan10000Base8(b *testing.B)  { ScanHelper(b, 8, 10, 10000) }
-	// base 16:  7895 digits
+func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
 	mediumBase     = 3141
 	mediumExponent = 2718
-	// 3141**2178
+func BenchmarkScan10Base10(b *testing.B)     { ScanHelper(b, 10, 10, 10) }
-	// base  2: 406078 digits
+func BenchmarkScan100Base10(b *testing.B)    { ScanHelper(b, 10, 10, 100) }
-	// base  8: 135360 digits
+func BenchmarkScan1000Base10(b *testing.B)   { ScanHelper(b, 10, 10, 1000) }
-	// base 10: 122243 digits
+func BenchmarkScan10000Base10(b *testing.B)  { ScanHelper(b, 10, 10, 10000) }
-	// base 16: 101521 digits
+func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
 	longBase     = 31415
 	longExponent = 27182
 )
-func BenchmarkScanShort2(b *testing.B) {
+func BenchmarkScan10Base16(b *testing.B)     { ScanHelper(b, 16, 10, 10) }
-	ScanHelper(b, 2, shortBase, shortExponent)
+func BenchmarkScan100Base16(b *testing.B)    { ScanHelper(b, 16, 10, 100) }
-}
+func BenchmarkScan1000Base16(b *testing.B)   { ScanHelper(b, 16, 10, 1000) }
 func BenchmarkScan10000Base16(b *testing.B)  { ScanHelper(b, 16, 10, 10000) }
 func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
-func BenchmarkScanShort8(b *testing.B) {
+func ScanHelper(b *testing.B, base int, x, y Word) {
 	ScanHelper(b, 8, shortBase, shortExponent)
 }
 func BenchmarkScanSort10(b *testing.B) {
 	ScanHelper(b, 10, shortBase, shortExponent)
 }
 func BenchmarkScanShort16(b *testing.B) {
 	ScanHelper(b, 16, shortBase, shortExponent)
 }
 func BenchmarkScanMedium2(b *testing.B) {
 	ScanHelper(b, 2, mediumBase, mediumExponent)
 }
 func BenchmarkScanMedium8(b *testing.B) {
 	ScanHelper(b, 8, mediumBase, mediumExponent)
 }
 func BenchmarkScanMedium10(b *testing.B) {
 	ScanHelper(b, 10, mediumBase, mediumExponent)
 }
 func BenchmarkScanMedium16(b *testing.B) {
 	ScanHelper(b, 16, mediumBase, mediumExponent)
 }
 func BenchmarkScanLong2(b *testing.B) {
 	ScanHelper(b, 2, longBase, longExponent)
 }
 func BenchmarkScanLong8(b *testing.B) {
 	ScanHelper(b, 8, longBase, longExponent)
 }
 func BenchmarkScanLong10(b *testing.B) {
 	ScanHelper(b, 10, longBase, longExponent)
 }
 func BenchmarkScanLong16(b *testing.B) {
 	ScanHelper(b, 16, longBase, longExponent)
 }
 func ScanHelper(b *testing.B, base int, xv, yv Word) {
 	b.StopTimer()
-	var x, y, z nat
+	var z nat
-	x = x.setWord(xv)
+	z = z.expWW(x, y)
 	y = y.setWord(yv)
 	z = z.expNN(x, y, nil)
 	var s string
 	s = z.string(lowercaseDigits[0:base])
@ -459,68 +407,112 @@ func ScanHelper(b *testing.B, base int, xv, yv Word) {
 	b.StartTimer()
 	for i := 0; i < b.N; i++ {
-		x.scan(strings.NewReader(s), base)
+		z.scan(strings.NewReader(s), base)
 	}
 }
-func BenchmarkStringShort2(b *testing.B) {
+func BenchmarkString10Base2(b *testing.B)     { StringHelper(b, 2, 10, 10) }
-	StringHelper(b, 2, shortBase, shortExponent)
+func BenchmarkString100Base2(b *testing.B)    { StringHelper(b, 2, 10, 100) }
-}
+func BenchmarkString1000Base2(b *testing.B)   { StringHelper(b, 2, 10, 1000) }
 func BenchmarkString10000Base2(b *testing.B)  { StringHelper(b, 2, 10, 10000) }
 func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
-func BenchmarkStringShort8(b *testing.B) {
+func BenchmarkString10Base8(b *testing.B)     { StringHelper(b, 8, 10, 10) }
-	StringHelper(b, 8, shortBase, shortExponent)
+func BenchmarkString100Base8(b *testing.B)    { StringHelper(b, 8, 10, 100) }
-}
+func BenchmarkString1000Base8(b *testing.B)   { StringHelper(b, 8, 10, 1000) }
 func BenchmarkString10000Base8(b *testing.B)  { StringHelper(b, 8, 10, 10000) }
 func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
-func BenchmarkStringShort10(b *testing.B) {
+func BenchmarkString10Base10(b *testing.B)     { StringHelper(b, 10, 10, 10) }
-	StringHelper(b, 10, shortBase, shortExponent)
+func BenchmarkString100Base10(b *testing.B)    { StringHelper(b, 10, 10, 100) }
-}
+func BenchmarkString1000Base10(b *testing.B)   { StringHelper(b, 10, 10, 1000) }
 func BenchmarkString10000Base10(b *testing.B)  { StringHelper(b, 10, 10, 10000) }
 func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
-func BenchmarkStringShort16(b *testing.B) {
+func BenchmarkString10Base16(b *testing.B)     { StringHelper(b, 16, 10, 10) }
-	StringHelper(b, 16, shortBase, shortExponent)
+func BenchmarkString100Base16(b *testing.B)    { StringHelper(b, 16, 10, 100) }
-}
+func BenchmarkString1000Base16(b *testing.B)   { StringHelper(b, 16, 10, 1000) }
 func BenchmarkString10000Base16(b *testing.B)  { StringHelper(b, 16, 10, 10000) }
 func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
-func BenchmarkStringMedium2(b *testing.B) {
+func StringHelper(b *testing.B, base int, x, y Word) {
 	StringHelper(b, 2, mediumBase, mediumExponent)
 }
 func BenchmarkStringMedium8(b *testing.B) {
 	StringHelper(b, 8, mediumBase, mediumExponent)
 }
 func BenchmarkStringMedium10(b *testing.B) {
 	StringHelper(b, 10, mediumBase, mediumExponent)
 }
 func BenchmarkStringMedium16(b *testing.B) {
 	StringHelper(b, 16, mediumBase, mediumExponent)
 }
 func BenchmarkStringLong2(b *testing.B) {
 	StringHelper(b, 2, longBase, longExponent)
 }
 func BenchmarkStringLong8(b *testing.B) {
 	StringHelper(b, 8, longBase, longExponent)
 }
 func BenchmarkStringLong10(b *testing.B) {
 	StringHelper(b, 10, longBase, longExponent)
 }
 func BenchmarkStringLong16(b *testing.B) {
 	StringHelper(b, 16, longBase, longExponent)
 }
 func StringHelper(b *testing.B, base int, xv, yv Word) {
 	b.StopTimer()
-	var x, y, z nat
+	var z nat
-	x = x.setWord(xv)
+	z = z.expWW(x, y)
-	y = y.setWord(yv)
+	z.string(lowercaseDigits[0:base]) // warm divisor cache
 	z = z.expNN(x, y, nil)
 	b.StartTimer()
 	for i := 0; i < b.N; i++ {
-		z.string(lowercaseDigits[0:base])
+		_ = z.string(lowercaseDigits[0:base])
 	}
 }
 func BenchmarkLeafSize0(b *testing.B)  { LeafSizeHelper(b, 10, 0) } // test without splitting
 func BenchmarkLeafSize1(b *testing.B)  { LeafSizeHelper(b, 10, 1) }
 func BenchmarkLeafSize2(b *testing.B)  { LeafSizeHelper(b, 10, 2) }
 func BenchmarkLeafSize3(b *testing.B)  { LeafSizeHelper(b, 10, 3) }
 func BenchmarkLeafSize4(b *testing.B)  { LeafSizeHelper(b, 10, 4) }
 func BenchmarkLeafSize5(b *testing.B)  { LeafSizeHelper(b, 10, 5) }
 func BenchmarkLeafSize6(b *testing.B)  { LeafSizeHelper(b, 10, 6) }
 func BenchmarkLeafSize7(b *testing.B)  { LeafSizeHelper(b, 10, 7) }
 func BenchmarkLeafSize8(b *testing.B)  { LeafSizeHelper(b, 10, 8) }
 func BenchmarkLeafSize9(b *testing.B)  { LeafSizeHelper(b, 10, 9) }
 func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
 func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
 func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
 func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
 func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
 func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
 func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
 func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths 
 func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
 func LeafSizeHelper(b *testing.B, base Word, size int) {
 	b.StopTimer()
 	originalLeafSize := leafSize
 	resetTable(cacheBase10[:])
 	leafSize = size
 	b.StartTimer()
 	for d := 1; d <= 10000; d *= 10 {
 		b.StopTimer()
 		var z nat
 		z = z.expWW(base, Word(d))            // build target number
 		_ = z.string(lowercaseDigits[0:base]) // warm divisor cache
 		b.StartTimer()
 		for i := 0; i < b.N; i++ {
 			_ = z.string(lowercaseDigits[0:base])
 		}
 	}
 	b.StopTimer()
 	resetTable(cacheBase10[:])
 	leafSize = originalLeafSize
 	b.StartTimer()
 }
 func resetTable(table []divisor) {
 	if table != nil && table[0].bbb != nil {
 		for i := 0; i < len(table); i++ {
 			table[i].bbb = nil
 			table[i].nbits = 0
 			table[i].ndigits = 0
 		}
 	}
 }
 func TestStringPowers(t *testing.T) {
 	var b, p Word
 	for b = 2; b <= 16; b++ {
 		for p = 0; p <= 512; p++ {
 			x := nat(nil).expWW(b, p)
 			xs := x.string(lowercaseDigits[0:b])
 			xs2 := toString(x, lowercaseDigits[0:b])
 			if xs != xs2 {
 				t.Errorf("failed at %d ** %d in base %d: %s != %s", b, p, b, xs, xs2)
 			}
 		}
 	}
 }