diff --git a/src/pkg/math/big/nat.go b/src/pkg/math/big/nat.go
index eee8ee3f66..9fba2d2a06 100644
--- a/src/pkg/math/big/nat.go
+++ b/src/pkg/math/big/nat.go
@@ -21,7 +21,9 @@ package big
 import (
 	"errors"
 	"io"
+	"math"
 	"math/rand"
+	"sync"
 )
 
 // An unsigned integer x of the form
@@ -719,17 +721,17 @@ func (x nat) string(charset string) string {
 
 	// special cases
 	switch {
-	case b < 2 || b > 256:
+	case b < 2 || MaxBase < b:
 		panic("illegal base")
 	case len(x) == 0:
 		return string(charset[0])
 	}
 
 	// allocate buffer for conversion
-	i := x.bitLen()/log2(b) + 1 // +1: round up
+	i := int(float64(x.bitLen())/math.Log2(float64(b))) + 1 // off by one at most
 	s := make([]byte, i)
 
-	// special case: power of two bases can avoid divisions completely
+	// convert power of two and non power of two bases separately
 	if b == b&-b {
 		// shift is base-b digit size in bits
 		shift := uint(trailingZeroBits(b)) // shift > 0 because b >= 2
@@ -771,65 +773,209 @@ func (x nat) string(charset string) string {
 			w >>= shift
 			nbits -= shift
 		}
+	} else {
+		// determine "big base" as in 10^19 for 19 decimal digits in a 64 bit Word
+		bb := Word(1) // big base is b**ndigits
+		ndigits := 0  // number of base b digits
+		for max := Word(_M / b); bb <= max; bb *= b {
+			ndigits++ // maximize ndigits where bb = b**ndigits, bb <= _M
+		}
 
-		return string(s[i:])
+		// construct table of successive squares of bb*leafSize to use in subdivisions
+		table := divisors(len(x), b, ndigits, bb)
+
+		// preserve x, create local copy for use in divisions
+		q := nat(nil).set(x)
+
+		// convert q to string s in base b with index of MSD indicated by return value
+		i = q.convertWords(0, i, s, charset, b, ndigits, bb, table)
 	}
 
-	// general case: extract groups of digits by multiprecision division
+	return string(s[i:])
+}
 
-	// maximize ndigits where b**ndigits < 2^_W; bb (big base) is b**ndigits
-	bb := Word(1)
-	ndigits := 0
-	for max := Word(_M / b); bb <= max; bb *= b {
-		ndigits++
-	}
+// Convert words of q to base b digits in s directly using iterated nat/Word divison to extract
+// low-order Words and indirectly by recursive subdivision and nat/nat division by tabulated 
+// divisors. 
+//
+// The direct method processes n Words by n divW() calls, each of which visits every Word in the 
+// incrementally shortened q for a total of n + (n-1) + (n-2) ... + 2 + 1, or n(n+1)/2 divW()'s. 
+// Indirect conversion divides q by its approximate square root, yielding two parts, each half 
+// the size of q. Using the direct method on both halves means 2 * (n/2)(n/2 + 1)/2 divW()'s plus 
+// the expensive long div(). Asymptotically, the ratio is favorable at 1/2 the divW()'s, and is 
+// made better by splitting the subblocks recursively. Best is to split blocks until one more 
+// split would take longer (because of the nat/nat div()) than the twice as many divW()'s of the 
+// direct approach. This threshold is represented by leafSize. Benchmarking of leafSize in the 
+// range 2..64 shows that values of 8 and 16 work well, with a 4x speedup at medium lengths and 
+// ~30x for 20000 digits. Use nat_test.go's BenchmarkLeafSize tests to optimize leafSize for 
+// specfic hardware.
+//
+// lo and hi index character array s. conversion starts with the LSD at hi and moves down toward
+// the MSD, which will be at s[0] or s[1]. lo == 0 signals span includes the most significant word.
+//
+func (q nat) convertWords(lo, hi int, s []byte, charset string, b Word, ndigits int, bb Word, table []divisor) int {
+	// indirect conversion: split larger blocks to reduce quadratic expense of iterated nat/W division
+	if leafSize > 0 && len(q) > leafSize && table != nil {
+		var r nat
+		index := len(table) - 1
+		for len(q) > leafSize {
+			// find divisor close to sqrt(q) if possible, but in any case < q
+			maxLength := q.bitLen()     // ~= log2 q, or at of least largest possible q of this bit length
+			minLength := maxLength >> 1 // ~= log2 sqrt(q)
+			for index > 0 && table[index-1].nbits > minLength {
+				index-- // desired
+			}
+			if table[index].nbits >= maxLength && table[index].bbb.cmp(q) >= 0 {
+				index--
+				if index < 0 {
+					panic("internal inconsistency")
+				}
+			}
 
-	// preserve x, create local copy for use in repeated divisions
-	q := nat(nil).set(x)
+			// split q into the two digit number (q'*bbb + r) to form independent subblocks
+			q, r = q.div(r, q, table[index].bbb)
+
+			// convert subblocks and collect results in s[lo:partition] and s[partition:hi]
+			partition := hi - table[index].ndigits
+			r.convertWords(partition, hi, s, charset, b, ndigits, bb, table[0:index])
+			hi = partition // i.e., q.convertWords(lo, partition, s, charset, b, ndigits, bb, table[0:index+1])
+		}
+	} // having split any large blocks now process the remaining small block
+
+	// direct conversion: process smaller blocks monolithically to avoid overhead of nat/nat division
 	var r Word
-
-	// convert
-	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x
+	if b == 10 { // hard-coding for 10 here speeds this up by 1.25x (allows mod as mul vs div)
 		for len(q) > 0 {
 			// extract least significant, base bb "digit"
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
-			if len(q) == 0 {
+			q, r = q.divW(q, bb)
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
-					s[i] = charset[r%10]
-					r /= 10
+					hi--
+					t := r / 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+					r = t
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
-					i--
-					s[i] = charset[r%10]
-					r /= 10
+				for j := 0; j < ndigits && hi > lo; j++ {
+					hi--
+					t := r / 10
+					s[hi] = charset[r-(t<<3+t<<1)] // 8*t + 2*t = 10*t; r - 10*int(r/10) = r mod 10
+					r = t
 				}
 			}
 		}
 	} else {
 		for len(q) > 0 {
 			// extract least significant group of digits
-			q, r = q.divW(q, bb) // N.B. >82% of time is here. Optimize divW
-			if len(q) == 0 {
+			q, r = q.divW(q, bb)
+			if lo == 0 && len(q) == 0 {
 				// skip leading zeros in most-significant group of digits
 				for j := 0; j < ndigits && r != 0; j++ {
-					i--
-					s[i] = charset[r%b]
-					r /= b
+					hi--
+					s[hi] = charset[r%b]
+					r = r / b
 				}
 			} else {
-				for j := 0; j < ndigits; j++ {
-					i--
-					s[i] = charset[r%b]
-					r /= b
+				for j := 0; j < ndigits && hi > lo; j++ {
+					hi--
+					s[hi] = charset[r%b]
+					r = r / b
 				}
 			}
 		}
 	}
 
-	return string(s[i:])
+	// prepend high-order zeroes when q has been normalized to a short number of Words.
+	// however, do not prepend zeroes when converting the most dignificant digits.
+	if lo != 0 { // if not MSD
+		zero := charset[0]
+		for hi > lo { // while need more leading zeroes
+			hi--
+			s[hi] = zero
+		}
+	}
+
+	// return index of most significant output digit in s[] (stored in lowest index)
+	return hi
+}
+
+// Split blocks greater than leafSize Words (or set to 0 to disable indirect conversion)
+// Benchmark and configure leafSize using: gotest -test.bench="Leaf"
+//   8 and 16 effective on 3.0 GHz Xeon "Clovertown" CPU (128 byte cache lines)
+//   8 and 16 effective on 2.66 GHz Core 2 Duo "Penryn" CPU
+var leafSize int = 8 // number of Word-size binary values treat as a monolithic block
+
+type divisor struct {
+	bbb     nat // divisor
+	nbits   int // bit length of divisor (discounting leading zeroes) ~= log2(bbb)
+	ndigits int // digit length of divisor in terms of output base digits
+}
+
+const maxCache = 64               // maximum number of divisors in a single table
+var cacheBase10 [maxCache]divisor // cached divisors for base 10
+var cacheLock sync.Mutex          // defense against concurrent table extensions
+
+// construct table of powers of bb*leafSize to use in subdivisions
+func divisors(m int, b Word, ndigits int, bb Word) []divisor {
+	// only build table when indirect conversion is enabled and x is large
+	if leafSize == 0 || m <= leafSize {
+		return nil
+	}
+
+	// determine k where (bb**leafSize)**(2**k) >= sqrt(x)
+	k := 1
+	for words := leafSize; words < m>>1 && k < maxCache; words <<= 1 {
+		k++
+	}
+
+	// create new table of divisors or extend and reuse existing table as appropriate
+	var cached bool
+	var table []divisor
+	switch b {
+	case 10:
+		table = cacheBase10[0:k] // reuse old table for this conversion
+		cached = true
+	default:
+		table = make([]divisor, k) // new table for this conversion
+	}
+
+	// extend table
+	if table[k-1].ndigits == 0 {
+		if cached {
+			cacheLock.Lock() // begin critical section
+		}
+
+		var i int
+		var larger nat
+		for i < k && table[i].ndigits != 0 { // skip existing entries
+			i++
+		}
+		for ; i < k; i++ { // add new entries
+			if i == 0 {
+				table[i].bbb = nat(nil).expWW(bb, Word(leafSize))
+				table[i].ndigits = ndigits * leafSize
+			} else {
+				table[i].bbb = nat(nil).mul(table[i-1].bbb, table[i-1].bbb)
+				table[i].ndigits = 2 * table[i-1].ndigits
+			}
+
+			// optimization: exploit aggregated extra bits in macro blocks
+			larger = nat(nil).set(table[i].bbb)
+			for mulAddVWW(larger, larger, b, 0) == 0 {
+				table[i].bbb = table[i].bbb.set(larger)
+				table[i].ndigits++
+			}
+
+			table[i].nbits = table[i].bbb.bitLen()
+		}
+
+		if cached {
+			cacheLock.Unlock() // end critical section
+		}
+	}
+
+	return table
 }
 
 const deBruijn32 = 0x077CB531
@@ -1140,7 +1286,12 @@ func (z nat) expNN(x, y, m nat) nat {
 		}
 	}
 
-	return z
+	return z.norm()
+}
+
+// calculate x**y for Word arguments y and y
+func (z nat) expWW(x, y Word) nat {
+	return z.expNN(nat(nil).setWord(x), nat(nil).setWord(y), nil)
 }
 
 // probablyPrime performs reps Miller-Rabin tests to check whether n is prime.
diff --git a/src/pkg/math/big/nat_test.go b/src/pkg/math/big/nat_test.go
index b208646f2f..e3c6552d9f 100644
--- a/src/pkg/math/big/nat_test.go
+++ b/src/pkg/math/big/nat_test.go
@@ -370,86 +370,34 @@ func BenchmarkScanPi(b *testing.B) {
 	}
 }
 
-const (
-	// 314**271
-	// base  2: 2249 digits
-	// base  8:  751 digits
-	// base 10:  678 digits
-	// base 16:  563 digits
-	shortBase     = 314
-	shortExponent = 271
+func BenchmarkScan10Base2(b *testing.B)     { ScanHelper(b, 2, 10, 10) }
+func BenchmarkScan100Base2(b *testing.B)    { ScanHelper(b, 2, 10, 100) }
+func BenchmarkScan1000Base2(b *testing.B)   { ScanHelper(b, 2, 10, 1000) }
+func BenchmarkScan10000Base2(b *testing.B)  { ScanHelper(b, 2, 10, 10000) }
+func BenchmarkScan100000Base2(b *testing.B) { ScanHelper(b, 2, 10, 100000) }
 
-	// 3141**2178
-	// base  2: 31577 digits
-	// base  8: 10527 digits
-	// base 10:  9507 digits
-	// base 16:  7895 digits
-	mediumBase     = 3141
-	mediumExponent = 2718
+func BenchmarkScan10Base8(b *testing.B)     { ScanHelper(b, 8, 10, 10) }
+func BenchmarkScan100Base8(b *testing.B)    { ScanHelper(b, 8, 10, 100) }
+func BenchmarkScan1000Base8(b *testing.B)   { ScanHelper(b, 8, 10, 1000) }
+func BenchmarkScan10000Base8(b *testing.B)  { ScanHelper(b, 8, 10, 10000) }
+func BenchmarkScan100000Base8(b *testing.B) { ScanHelper(b, 8, 10, 100000) }
 
-	// 3141**2178
-	// base  2: 406078 digits
-	// base  8: 135360 digits
-	// base 10: 122243 digits
-	// base 16: 101521 digits
-	longBase     = 31415
-	longExponent = 27182
-)
+func BenchmarkScan10Base10(b *testing.B)     { ScanHelper(b, 10, 10, 10) }
+func BenchmarkScan100Base10(b *testing.B)    { ScanHelper(b, 10, 10, 100) }
+func BenchmarkScan1000Base10(b *testing.B)   { ScanHelper(b, 10, 10, 1000) }
+func BenchmarkScan10000Base10(b *testing.B)  { ScanHelper(b, 10, 10, 10000) }
+func BenchmarkScan100000Base10(b *testing.B) { ScanHelper(b, 10, 10, 100000) }
 
-func BenchmarkScanShort2(b *testing.B) {
-	ScanHelper(b, 2, shortBase, shortExponent)
-}
+func BenchmarkScan10Base16(b *testing.B)     { ScanHelper(b, 16, 10, 10) }
+func BenchmarkScan100Base16(b *testing.B)    { ScanHelper(b, 16, 10, 100) }
+func BenchmarkScan1000Base16(b *testing.B)   { ScanHelper(b, 16, 10, 1000) }
+func BenchmarkScan10000Base16(b *testing.B)  { ScanHelper(b, 16, 10, 10000) }
+func BenchmarkScan100000Base16(b *testing.B) { ScanHelper(b, 16, 10, 100000) }
 
-func BenchmarkScanShort8(b *testing.B) {
-	ScanHelper(b, 8, shortBase, shortExponent)
-}
-
-func BenchmarkScanSort10(b *testing.B) {
-	ScanHelper(b, 10, shortBase, shortExponent)
-}
-
-func BenchmarkScanShort16(b *testing.B) {
-	ScanHelper(b, 16, shortBase, shortExponent)
-}
-
-func BenchmarkScanMedium2(b *testing.B) {
-	ScanHelper(b, 2, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium8(b *testing.B) {
-	ScanHelper(b, 8, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium10(b *testing.B) {
-	ScanHelper(b, 10, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanMedium16(b *testing.B) {
-	ScanHelper(b, 16, mediumBase, mediumExponent)
-}
-
-func BenchmarkScanLong2(b *testing.B) {
-	ScanHelper(b, 2, longBase, longExponent)
-}
-
-func BenchmarkScanLong8(b *testing.B) {
-	ScanHelper(b, 8, longBase, longExponent)
-}
-
-func BenchmarkScanLong10(b *testing.B) {
-	ScanHelper(b, 10, longBase, longExponent)
-}
-
-func BenchmarkScanLong16(b *testing.B) {
-	ScanHelper(b, 16, longBase, longExponent)
-}
-
-func ScanHelper(b *testing.B, base int, xv, yv Word) {
+func ScanHelper(b *testing.B, base int, x, y Word) {
 	b.StopTimer()
-	var x, y, z nat
-	x = x.setWord(xv)
-	y = y.setWord(yv)
-	z = z.expNN(x, y, nil)
+	var z nat
+	z = z.expWW(x, y)
 
 	var s string
 	s = z.string(lowercaseDigits[0:base])
@@ -459,68 +407,112 @@ func ScanHelper(b *testing.B, base int, xv, yv Word) {
 	b.StartTimer()
 
 	for i := 0; i < b.N; i++ {
-		x.scan(strings.NewReader(s), base)
+		z.scan(strings.NewReader(s), base)
 	}
 }
 
-func BenchmarkStringShort2(b *testing.B) {
-	StringHelper(b, 2, shortBase, shortExponent)
-}
+func BenchmarkString10Base2(b *testing.B)     { StringHelper(b, 2, 10, 10) }
+func BenchmarkString100Base2(b *testing.B)    { StringHelper(b, 2, 10, 100) }
+func BenchmarkString1000Base2(b *testing.B)   { StringHelper(b, 2, 10, 1000) }
+func BenchmarkString10000Base2(b *testing.B)  { StringHelper(b, 2, 10, 10000) }
+func BenchmarkString100000Base2(b *testing.B) { StringHelper(b, 2, 10, 100000) }
 
-func BenchmarkStringShort8(b *testing.B) {
-	StringHelper(b, 8, shortBase, shortExponent)
-}
+func BenchmarkString10Base8(b *testing.B)     { StringHelper(b, 8, 10, 10) }
+func BenchmarkString100Base8(b *testing.B)    { StringHelper(b, 8, 10, 100) }
+func BenchmarkString1000Base8(b *testing.B)   { StringHelper(b, 8, 10, 1000) }
+func BenchmarkString10000Base8(b *testing.B)  { StringHelper(b, 8, 10, 10000) }
+func BenchmarkString100000Base8(b *testing.B) { StringHelper(b, 8, 10, 100000) }
 
-func BenchmarkStringShort10(b *testing.B) {
-	StringHelper(b, 10, shortBase, shortExponent)
-}
+func BenchmarkString10Base10(b *testing.B)     { StringHelper(b, 10, 10, 10) }
+func BenchmarkString100Base10(b *testing.B)    { StringHelper(b, 10, 10, 100) }
+func BenchmarkString1000Base10(b *testing.B)   { StringHelper(b, 10, 10, 1000) }
+func BenchmarkString10000Base10(b *testing.B)  { StringHelper(b, 10, 10, 10000) }
+func BenchmarkString100000Base10(b *testing.B) { StringHelper(b, 10, 10, 100000) }
 
-func BenchmarkStringShort16(b *testing.B) {
-	StringHelper(b, 16, shortBase, shortExponent)
-}
+func BenchmarkString10Base16(b *testing.B)     { StringHelper(b, 16, 10, 10) }
+func BenchmarkString100Base16(b *testing.B)    { StringHelper(b, 16, 10, 100) }
+func BenchmarkString1000Base16(b *testing.B)   { StringHelper(b, 16, 10, 1000) }
+func BenchmarkString10000Base16(b *testing.B)  { StringHelper(b, 16, 10, 10000) }
+func BenchmarkString100000Base16(b *testing.B) { StringHelper(b, 16, 10, 100000) }
 
-func BenchmarkStringMedium2(b *testing.B) {
-	StringHelper(b, 2, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium8(b *testing.B) {
-	StringHelper(b, 8, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium10(b *testing.B) {
-	StringHelper(b, 10, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringMedium16(b *testing.B) {
-	StringHelper(b, 16, mediumBase, mediumExponent)
-}
-
-func BenchmarkStringLong2(b *testing.B) {
-	StringHelper(b, 2, longBase, longExponent)
-}
-
-func BenchmarkStringLong8(b *testing.B) {
-	StringHelper(b, 8, longBase, longExponent)
-}
-
-func BenchmarkStringLong10(b *testing.B) {
-	StringHelper(b, 10, longBase, longExponent)
-}
-
-func BenchmarkStringLong16(b *testing.B) {
-	StringHelper(b, 16, longBase, longExponent)
-}
-
-func StringHelper(b *testing.B, base int, xv, yv Word) {
+func StringHelper(b *testing.B, base int, x, y Word) {
 	b.StopTimer()
-	var x, y, z nat
-	x = x.setWord(xv)
-	y = y.setWord(yv)
-	z = z.expNN(x, y, nil)
+	var z nat
+	z = z.expWW(x, y)
+	z.string(lowercaseDigits[0:base]) // warm divisor cache
 	b.StartTimer()
 
 	for i := 0; i < b.N; i++ {
-		z.string(lowercaseDigits[0:base])
+		_ = z.string(lowercaseDigits[0:base])
+	}
+}
+
+func BenchmarkLeafSize0(b *testing.B)  { LeafSizeHelper(b, 10, 0) } // test without splitting
+func BenchmarkLeafSize1(b *testing.B)  { LeafSizeHelper(b, 10, 1) }
+func BenchmarkLeafSize2(b *testing.B)  { LeafSizeHelper(b, 10, 2) }
+func BenchmarkLeafSize3(b *testing.B)  { LeafSizeHelper(b, 10, 3) }
+func BenchmarkLeafSize4(b *testing.B)  { LeafSizeHelper(b, 10, 4) }
+func BenchmarkLeafSize5(b *testing.B)  { LeafSizeHelper(b, 10, 5) }
+func BenchmarkLeafSize6(b *testing.B)  { LeafSizeHelper(b, 10, 6) }
+func BenchmarkLeafSize7(b *testing.B)  { LeafSizeHelper(b, 10, 7) }
+func BenchmarkLeafSize8(b *testing.B)  { LeafSizeHelper(b, 10, 8) }
+func BenchmarkLeafSize9(b *testing.B)  { LeafSizeHelper(b, 10, 9) }
+func BenchmarkLeafSize10(b *testing.B) { LeafSizeHelper(b, 10, 10) }
+func BenchmarkLeafSize11(b *testing.B) { LeafSizeHelper(b, 10, 11) }
+func BenchmarkLeafSize12(b *testing.B) { LeafSizeHelper(b, 10, 12) }
+func BenchmarkLeafSize13(b *testing.B) { LeafSizeHelper(b, 10, 13) }
+func BenchmarkLeafSize14(b *testing.B) { LeafSizeHelper(b, 10, 14) }
+func BenchmarkLeafSize15(b *testing.B) { LeafSizeHelper(b, 10, 15) }
+func BenchmarkLeafSize16(b *testing.B) { LeafSizeHelper(b, 10, 16) }
+func BenchmarkLeafSize32(b *testing.B) { LeafSizeHelper(b, 10, 32) } // try some large lengths 
+func BenchmarkLeafSize64(b *testing.B) { LeafSizeHelper(b, 10, 64) }
+
+func LeafSizeHelper(b *testing.B, base Word, size int) {
+	b.StopTimer()
+	originalLeafSize := leafSize
+	resetTable(cacheBase10[:])
+	leafSize = size
+	b.StartTimer()
+
+	for d := 1; d <= 10000; d *= 10 {
+		b.StopTimer()
+		var z nat
+		z = z.expWW(base, Word(d))            // build target number
+		_ = z.string(lowercaseDigits[0:base]) // warm divisor cache
+		b.StartTimer()
+
+		for i := 0; i < b.N; i++ {
+			_ = z.string(lowercaseDigits[0:base])
+		}
+	}
+
+	b.StopTimer()
+	resetTable(cacheBase10[:])
+	leafSize = originalLeafSize
+	b.StartTimer()
+}
+
+func resetTable(table []divisor) {
+	if table != nil && table[0].bbb != nil {
+		for i := 0; i < len(table); i++ {
+			table[i].bbb = nil
+			table[i].nbits = 0
+			table[i].ndigits = 0
+		}
+	}
+}
+
+func TestStringPowers(t *testing.T) {
+	var b, p Word
+	for b = 2; b <= 16; b++ {
+		for p = 0; p <= 512; p++ {
+			x := nat(nil).expWW(b, p)
+			xs := x.string(lowercaseDigits[0:b])
+			xs2 := toString(x, lowercaseDigits[0:b])
+			if xs != xs2 {
+				t.Errorf("failed at %d ** %d in base %d: %s != %s", b, p, b, xs, xs2)
+			}
+		}
 	}
 }