1
0
mirror of https://github.com/golang/go synced 2024-11-21 17:04:42 -07:00

math/big: use recursive subdivision for significant speedup

This change adds the second aspect to the conversion code, the
use of large divisiors (powers of big base) to greatly speed up
the divsion of large numbers. Speedups of 30x are common in the
large cases. Also includes new tests and tuning code for the
key internal parameters.

R=gri
CC=golang-dev
https://golang.org/cl/5438058
This commit is contained in:
Michael T. Jones 2011-11-27 11:10:59 -08:00 committed by Robert Griesemer
parent d859d7deee
commit 4c113ffe16
2 changed files with 305 additions and 162 deletions

View File

@ -21,7 +21,9 @@ package big
import (
"errors"
"io"
"math"
"math/rand"
"sync"
)
// An unsigned integer x of the form
@ -719,17 +721,17 @@ func (x nat) string(charset string) string {
// special cases
switch {
case b < 2 || b > 256:
case b < 2 || MaxBase < b:
panic("illegal base")
case len(x) == 0:
return string(charset[0])
}
// allocate buffer for conversion
i := x.bitLen()/log2(b) + 1 // +1: round up
i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
s := make([]byte, i)
// special case: power of two bases can avoid divisions completely
// convert power of two and non power of two bases separately
if b == b&-b {
// shift is base-b digit size in bits
shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
@ -771,65 +773,209 @@ func (x nat) string(charset string) string {
w >>= shift
nbits -= shift
}
} else {
// determine "big base" as in 10^19 for 19 decimal digits in a 64 bit Word
bb := Word(1) // big base is b**ndigits
ndigits := 0 // number of base b digits
for max := Word(_M / b); bb <= max; bb *= b {
ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
}
return string(s[i:])
// construct table of successive squares of bb*leafSize to use in subdivisions
table := divisors(len(x), b, ndigits, bb)
// preserve x, create local copy for use in divisions
q := nat(nil).set(x)
// convert q to string s in base b with index of MSD indicated by return value
i = q.convertWords(0, i, s, charset, b, ndigits, bb, table)
}
// general case: extract groups of digits by multiprecision division
return string(s[i:])
}
// maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
bb := Word(1)
ndigits := 0
for max := Word(_M / b); bb <= max; bb *= b {
ndigits++
}
// Convert words of q to base b digits in s directly using iterated nat/Word divison to extract
// low-order Words and indirectly by recursive subdivision and nat/nat division by tabulated
// divisors.
//
// The direct method processes n Words by n divW() calls, each of which visits every Word in the
// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s.
// Indirect conversion divides q by its approximate square root, yielding two parts, each half
// the size of q. Using the direct method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s plus
// the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and is
// made better by splitting the subblocks recursively. Best is to split blocks until one more
// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the
// direct approach. This threshold is represented by leafSize. Benchmarking of leafSize in the
// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and
// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for
// specfic hardware.
//
// lo and hi index character array s. conversion starts with the LSD at hi and moves down toward
// the MSD, which will be at s[0] or s[1]. lo == 0 signals span includes the most significant word.
//
func (q nat) convertWords(lo, hi int, s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) int {
// indirect conversion: split larger blocks to reduce quadratic expense of iterated nat/W division
if leafSize > 0 && len(q) > leafSize && table != nil {
var r nat
index := len(table) - 1
for len(q) > leafSize {
// find divisor close to sqrt(q) if possible, but in any case < q
maxLength := q.bitLen() // ~= log2 q, or at of least largest possible q of this bit length
minLength := maxLength >> 1 // ~= log2 sqrt(q)
for index > 0 && table[index-1].nbits > minLength {
index-- // desired
}
if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
index--
if index < 0 {
panic("internal inconsistency")
}
}
// preserve x, create local copy for use in repeated divisions
q := nat(nil).set(x)
// split q into the two digit number (q'*bbb + r) to form independent subblocks
q, r = q.div(r, q, table[index].bbb)
// convert subblocks and collect results in s[lo:partition] and s[partition:hi]
partition := hi - table[index].ndigits
r.convertWords(partition, hi, s, charset, b, ndigits, bb, table[0:index])
hi = partition // i.e., q.convertWords(lo, partition, s, charset, b, ndigits, bb, table[0:index+1])
}
} // having split any large blocks now process the remaining small block
// direct conversion: process smaller blocks monolithically to avoid overhead of nat/nat division
var r Word
// convert
if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
if b == 10 { // hard-coding for 10 here speeds this up by 1.25x (allows mod as mul vs div)
for len(q) > 0 {
// extract least significant, base bb "digit"
q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
if len(q) == 0 {
q, r = q.divW(q, bb)
if lo == 0 && len(q) == 0 {
// skip leading zeros in most-significant group of digits
for j := 0; j < ndigits && r != 0; j++ {
i--
s[i] = charset[r%10]
r /= 10
hi--
t := r / 10
s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
r = t
}
} else {
for j := 0; j < ndigits; j++ {
i--
s[i] = charset[r%10]
r /= 10
for j := 0; j < ndigits && hi > lo; j++ {
hi--
t := r / 10
s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
r = t
}
}
}
} else {
for len(q) > 0 {
// extract least significant group of digits
q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
if len(q) == 0 {
q, r = q.divW(q, bb)
if lo == 0 && len(q) == 0 {
// skip leading zeros in most-significant group of digits
for j := 0; j < ndigits && r != 0; j++ {
i--
s[i] = charset[r%b]
r /= b
hi--
s[hi] = charset[r%b]
r = r / b
}
} else {
for j := 0; j < ndigits; j++ {
i--
s[i] = charset[r%b]
r /= b
for j := 0; j < ndigits && hi > lo; j++ {
hi--
s[hi] = charset[r%b]
r = r / b
}
}
}
}
return string(s[i:])
// prepend high-order zeroes when q has been normalized to a short number of Words.
// however, do not prepend zeroes when converting the most dignificant digits.
if lo != 0 { // if not MSD
zero := charset[0]
for hi > lo { // while need more leading zeroes
hi--
s[hi] = zero
}
}
// return index of most significant output digit in s[] (stored in lowest index)
return hi
}
// Split blocks greater than leafSize Words (or set to 0 to disable indirect conversion)
// Benchmark and configure leafSize using: gotest -test.bench="Leaf"
// 8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
// 8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
type divisor struct {
bbb nat // divisor
nbits int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
ndigits int // digit length of divisor in terms of output base digits
}
const maxCache = 64 // maximum number of divisors in a single table
var cacheBase10 [maxCache]divisor // cached divisors for base 10
var cacheLock sync.Mutex // defense against concurrent table extensions
// construct table of powers of bb*leafSize to use in subdivisions
func divisors(m int, b Word, ndigits int, bb Word) []divisor {
// only build table when indirect conversion is enabled and x is large
if leafSize == 0 || m <= leafSize {
return nil
}
// determine k where (bb**leafSize)**(2**k) >= sqrt(x)
k := 1
for words := leafSize; words < m>>1 && k < maxCache; words <<= 1 {
k++
}
// create new table of divisors or extend and reuse existing table as appropriate
var cached bool
var table []divisor
switch b {
case 10:
table = cacheBase10[0:k] // reuse old table for this conversion
cached = true
default:
table = make([]divisor, k) // new table for this conversion
}
// extend table
if table[k-1].ndigits == 0 {
if cached {
cacheLock.Lock() // begin critical section
}
var i int
var larger nat
for i < k && table[i].ndigits != 0 { // skip existing entries
i++
}
for ; i < k; i++ { // add new entries
if i == 0 {
table[i].bbb = nat(nil).expWW(bb, Word(leafSize))
table[i].ndigits = ndigits * leafSize
} else {
table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
table[i].ndigits = 2 * table[i-1].ndigits
}
// optimization: exploit aggregated extra bits in macro blocks
larger = nat(nil).set(table[i].bbb)
for mulAddVWW(larger, larger, b, 0) == 0 {
table[i].bbb = table[i].bbb.set(larger)
table[i].ndigits++
}
table[i].nbits = table[i].bbb.bitLen()
}
if cached {
cacheLock.Unlock() // end critical section
}
}
return table
}
const deBruijn32 = 0x077CB531
@ -1140,7 +1286,12 @@ func (z nat) expNN(x, y, m nat) nat {
}
}
return z
return z.norm()
}
// calculate x**y for Word arguments y and y
func (z nat) expWW(x, y Word) nat {
return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
}
// probablyPrime performs reps Miller-Rabin tests to check whether n is prime.

View File

@ -370,86 +370,34 @@ func BenchmarkScanPi(b *testing.B) {
}
}
const (
// 314**271
// base 2: 2249 digits
// base 8: 751 digits
// base 10: 678 digits
// base 16: 563 digits
shortBase = 314
shortExponent = 271
func BenchmarkScan10Base2(b *testing.B) { ScanHelper(b, 2, 10, 10) }
func BenchmarkScan100Base2(b *testing.B) { ScanHelper(b, 2, 10, 100) }
func BenchmarkScan1000Base2(b *testing.B) { ScanHelper(b, 2, 10, 1000) }
func BenchmarkScan10000Base2(b *testing.B) { ScanHelper(b, 2, 10, 10000) }
func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
// 3141**2178
// base 2: 31577 digits
// base 8: 10527 digits
// base 10: 9507 digits
// base 16: 7895 digits
mediumBase = 3141
mediumExponent = 2718
func BenchmarkScan10Base8(b *testing.B) { ScanHelper(b, 8, 10, 10) }
func BenchmarkScan100Base8(b *testing.B) { ScanHelper(b, 8, 10, 100) }
func BenchmarkScan1000Base8(b *testing.B) { ScanHelper(b, 8, 10, 1000) }
func BenchmarkScan10000Base8(b *testing.B) { ScanHelper(b, 8, 10, 10000) }
func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
// 3141**2178
// base 2: 406078 digits
// base 8: 135360 digits
// base 10: 122243 digits
// base 16: 101521 digits
longBase = 31415
longExponent = 27182
)
func BenchmarkScan10Base10(b *testing.B) { ScanHelper(b, 10, 10, 10) }
func BenchmarkScan100Base10(b *testing.B) { ScanHelper(b, 10, 10, 100) }
func BenchmarkScan1000Base10(b *testing.B) { ScanHelper(b, 10, 10, 1000) }
func BenchmarkScan10000Base10(b *testing.B) { ScanHelper(b, 10, 10, 10000) }
func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
func BenchmarkScanShort2(b *testing.B) {
ScanHelper(b, 2, shortBase, shortExponent)
}
func BenchmarkScan10Base16(b *testing.B) { ScanHelper(b, 16, 10, 10) }
func BenchmarkScan100Base16(b *testing.B) { ScanHelper(b, 16, 10, 100) }
func BenchmarkScan1000Base16(b *testing.B) { ScanHelper(b, 16, 10, 1000) }
func BenchmarkScan10000Base16(b *testing.B) { ScanHelper(b, 16, 10, 10000) }
func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
func BenchmarkScanShort8(b *testing.B) {
ScanHelper(b, 8, shortBase, shortExponent)
}
func BenchmarkScanSort10(b *testing.B) {
ScanHelper(b, 10, shortBase, shortExponent)
}
func BenchmarkScanShort16(b *testing.B) {
ScanHelper(b, 16, shortBase, shortExponent)
}
func BenchmarkScanMedium2(b *testing.B) {
ScanHelper(b, 2, mediumBase, mediumExponent)
}
func BenchmarkScanMedium8(b *testing.B) {
ScanHelper(b, 8, mediumBase, mediumExponent)
}
func BenchmarkScanMedium10(b *testing.B) {
ScanHelper(b, 10, mediumBase, mediumExponent)
}
func BenchmarkScanMedium16(b *testing.B) {
ScanHelper(b, 16, mediumBase, mediumExponent)
}
func BenchmarkScanLong2(b *testing.B) {
ScanHelper(b, 2, longBase, longExponent)
}
func BenchmarkScanLong8(b *testing.B) {
ScanHelper(b, 8, longBase, longExponent)
}
func BenchmarkScanLong10(b *testing.B) {
ScanHelper(b, 10, longBase, longExponent)
}
func BenchmarkScanLong16(b *testing.B) {
ScanHelper(b, 16, longBase, longExponent)
}
func ScanHelper(b *testing.B, base int, xv, yv Word) {
func ScanHelper(b *testing.B, base int, x, y Word) {
b.StopTimer()
var x, y, z nat
x = x.setWord(xv)
y = y.setWord(yv)
z = z.expNN(x, y, nil)
var z nat
z = z.expWW(x, y)
var s string
s = z.string(lowercaseDigits[0:base])
@ -459,68 +407,112 @@ func ScanHelper(b *testing.B, base int, xv, yv Word) {
b.StartTimer()
for i := 0; i < b.N; i++ {
x.scan(strings.NewReader(s), base)
z.scan(strings.NewReader(s), base)
}
}
func BenchmarkStringShort2(b *testing.B) {
StringHelper(b, 2, shortBase, shortExponent)
}
func BenchmarkString10Base2(b *testing.B) { StringHelper(b, 2, 10, 10) }
func BenchmarkString100Base2(b *testing.B) { StringHelper(b, 2, 10, 100) }
func BenchmarkString1000Base2(b *testing.B) { StringHelper(b, 2, 10, 1000) }
func BenchmarkString10000Base2(b *testing.B) { StringHelper(b, 2, 10, 10000) }
func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
func BenchmarkStringShort8(b *testing.B) {
StringHelper(b, 8, shortBase, shortExponent)
}
func BenchmarkString10Base8(b *testing.B) { StringHelper(b, 8, 10, 10) }
func BenchmarkString100Base8(b *testing.B) { StringHelper(b, 8, 10, 100) }
func BenchmarkString1000Base8(b *testing.B) { StringHelper(b, 8, 10, 1000) }
func BenchmarkString10000Base8(b *testing.B) { StringHelper(b, 8, 10, 10000) }
func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
func BenchmarkStringShort10(b *testing.B) {
StringHelper(b, 10, shortBase, shortExponent)
}
func BenchmarkString10Base10(b *testing.B) { StringHelper(b, 10, 10, 10) }
func BenchmarkString100Base10(b *testing.B) { StringHelper(b, 10, 10, 100) }
func BenchmarkString1000Base10(b *testing.B) { StringHelper(b, 10, 10, 1000) }
func BenchmarkString10000Base10(b *testing.B) { StringHelper(b, 10, 10, 10000) }
func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
func BenchmarkStringShort16(b *testing.B) {
StringHelper(b, 16, shortBase, shortExponent)
}
func BenchmarkString10Base16(b *testing.B) { StringHelper(b, 16, 10, 10) }
func BenchmarkString100Base16(b *testing.B) { StringHelper(b, 16, 10, 100) }
func BenchmarkString1000Base16(b *testing.B) { StringHelper(b, 16, 10, 1000) }
func BenchmarkString10000Base16(b *testing.B) { StringHelper(b, 16, 10, 10000) }
func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
func BenchmarkStringMedium2(b *testing.B) {
StringHelper(b, 2, mediumBase, mediumExponent)
}
func BenchmarkStringMedium8(b *testing.B) {
StringHelper(b, 8, mediumBase, mediumExponent)
}
func BenchmarkStringMedium10(b *testing.B) {
StringHelper(b, 10, mediumBase, mediumExponent)
}
func BenchmarkStringMedium16(b *testing.B) {
StringHelper(b, 16, mediumBase, mediumExponent)
}
func BenchmarkStringLong2(b *testing.B) {
StringHelper(b, 2, longBase, longExponent)
}
func BenchmarkStringLong8(b *testing.B) {
StringHelper(b, 8, longBase, longExponent)
}
func BenchmarkStringLong10(b *testing.B) {
StringHelper(b, 10, longBase, longExponent)
}
func BenchmarkStringLong16(b *testing.B) {
StringHelper(b, 16, longBase, longExponent)
}
func StringHelper(b *testing.B, base int, xv, yv Word) {
func StringHelper(b *testing.B, base int, x, y Word) {
b.StopTimer()
var x, y, z nat
x = x.setWord(xv)
y = y.setWord(yv)
z = z.expNN(x, y, nil)
var z nat
z = z.expWW(x, y)
z.string(lowercaseDigits[0:base]) // warm divisor cache
b.StartTimer()
for i := 0; i < b.N; i++ {
z.string(lowercaseDigits[0:base])
_ = z.string(lowercaseDigits[0:base])
}
}
func BenchmarkLeafSize0(b *testing.B) { LeafSizeHelper(b, 10, 0) } // test without splitting
func BenchmarkLeafSize1(b *testing.B) { LeafSizeHelper(b, 10, 1) }
func BenchmarkLeafSize2(b *testing.B) { LeafSizeHelper(b, 10, 2) }
func BenchmarkLeafSize3(b *testing.B) { LeafSizeHelper(b, 10, 3) }
func BenchmarkLeafSize4(b *testing.B) { LeafSizeHelper(b, 10, 4) }
func BenchmarkLeafSize5(b *testing.B) { LeafSizeHelper(b, 10, 5) }
func BenchmarkLeafSize6(b *testing.B) { LeafSizeHelper(b, 10, 6) }
func BenchmarkLeafSize7(b *testing.B) { LeafSizeHelper(b, 10, 7) }
func BenchmarkLeafSize8(b *testing.B) { LeafSizeHelper(b, 10, 8) }
func BenchmarkLeafSize9(b *testing.B) { LeafSizeHelper(b, 10, 9) }
func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths
func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
func LeafSizeHelper(b *testing.B, base Word, size int) {
b.StopTimer()
originalLeafSize := leafSize
resetTable(cacheBase10[:])
leafSize = size
b.StartTimer()
for d := 1; d <= 10000; d *= 10 {
b.StopTimer()
var z nat
z = z.expWW(base, Word(d)) // build target number
_ = z.string(lowercaseDigits[0:base]) // warm divisor cache
b.StartTimer()
for i := 0; i < b.N; i++ {
_ = z.string(lowercaseDigits[0:base])
}
}
b.StopTimer()
resetTable(cacheBase10[:])
leafSize = originalLeafSize
b.StartTimer()
}
func resetTable(table []divisor) {
if table != nil && table[0].bbb != nil {
for i := 0; i < len(table); i++ {
table[i].bbb = nil
table[i].nbits = 0
table[i].ndigits = 0
}
}
}
func TestStringPowers(t *testing.T) {
var b, p Word
for b = 2; b <= 16; b++ {
for p = 0; p <= 512; p++ {
x := nat(nil).expWW(b, p)
xs := x.string(lowercaseDigits[0:b])
xs2 := toString(x, lowercaseDigits[0:b])
if xs != xs2 {
t.Errorf("failed at %d ** %d in base %d: %s != %s", b, p, b, xs, xs2)
}
}
}
}