runtime: iterate over set bits in adjustpointers

There are several things combined in this change. First, eliminate the gobitvector type in favor of adding a ptrbit method to bitvector. In non-performance-critical code, use that method. In performance critical code, though, load the bitvector data one byte at a time and iterate only over set bits. To support that, add and use sys.Ctz8. name old time/op new time/op delta StackCopyPtr-8 81.8ms ± 5% 78.9ms ± 3% -3.58% (p=0.000 n=97+96) StackCopy-8 65.9ms ± 3% 62.8ms ± 3% -4.67% (p=0.000 n=96+92) StackCopyNoCache-8 105ms ± 3% 102ms ± 3% -3.38% (p=0.000 n=96+95) Change-Id: I00b80f45612708bd440b1a411a57fa6dfa24aa74 Reviewed-on: https://go-review.googlesource.com/109716 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com>
2024-11-26 17:56:55 -07:00 · 2018-04-01 11:01:36 -07:00 · 2018-04-01 11:01:36 -07:00 · 5af0b28a73
commit 5af0b28a73
parent 13cd006139
7 changed files with 76 additions and 47 deletions
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@ -2980,6 +2980,8 @@ func init() {
 	alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas", p4...)
 	alias("runtime/internal/atomic", "Casp1", "runtime/internal/atomic", "Cas64", p8...)

+	alias("runtime/internal/sys", "Ctz8", "math/bits", "TrailingZeros8", all...)
+
 	/******** math ********/
 	addF("math", "Sqrt",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@ -233,9 +233,8 @@ type childInfo struct {

 // dump kinds & offsets of interesting fields in bv
 func dumpbv(cbv *bitvector, offset uintptr) {
-	bv := gobv(*cbv)
-	for i := uintptr(0); i < bv.n; i++ {
-		if bv.bytedata[i/8]>>(i%8)&1 == 1 {
+	for i := uintptr(0); i < uintptr(cbv.n); i++ {
+		if cbv.ptrbit(i) == 1 {
 			dumpint(fieldKindPtr)
 			dumpint(uint64(offset + i*sys.PtrSize))
 		}
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@ -50,6 +50,30 @@ func Ctz32(x uint32) int {
 	return i + z
 }

+// Ctz8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
+func Ctz8(x uint8) int {
+	return int(ntz8tab[x])
+}
+
+var ntz8tab = [256]uint8{
+	0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+}
+
 // Bswap64 returns its input with byte order reversed
 // 0x0102030405060708 -> 0x0807060504030201
 func Bswap64(x uint64) uint64 {
--- a/src/runtime/internal/sys/intrinsics_386.s
+++ b/src/runtime/internal/sys/intrinsics_386.s
@ -34,6 +34,14 @@ TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
 	MOVL	AX, ret+4(FP)
 	RET

+TEXT runtime∕internal∕sys·Ctz8(SB), NOSPLIT, $0-8
+	MOVBLZX	x+0(FP), AX
+	BSFL	AX, AX
+	JNZ	2(PC)
+	MOVL	$8, AX
+	MOVL	AX, ret+4(FP)
+	RET
+
 TEXT runtime∕internal∕sys·Bswap64(SB), NOSPLIT, $0-16
 	MOVL	x_lo+0(FP), AX
 	MOVL	x_hi+4(FP), BX
--- a/src/runtime/internal/sys/intrinsics_stubs.go
+++ b/src/runtime/internal/sys/intrinsics_stubs.go
@ -8,5 +8,6 @@ package sys

 func Ctz64(x uint64) int
 func Ctz32(x uint32) int
+func Ctz8(x uint8) int
 func Bswap64(x uint64) uint64
 func Bswap32(x uint32) uint32
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@ -2021,9 +2021,8 @@ func getgcmask(ep interface{}) (mask []byte) {
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/sys.PtrSize)
 			for i := uintptr(0); i < n; i += sys.PtrSize {
-				bitmap := bv.bytedata
 				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
-				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
+				mask[i/sys.PtrSize] = bv.ptrbit(off)
 			}
 		}
 		return
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@ -544,64 +544,60 @@ type bitvector struct {
 	bytedata *uint8
 }

-type gobitvector struct {
-	n        uintptr
-	bytedata []uint8
-}
-
-func gobv(bv bitvector) gobitvector {
-	return gobitvector{
-		uintptr(bv.n),
-		(*[1 << 30]byte)(unsafe.Pointer(bv.bytedata))[:(bv.n+7)/8],
-	}
-}
-
-func ptrbit(bv *gobitvector, i uintptr) uint8 {
-	return (bv.bytedata[i/8] >> (i % 8)) & 1
+// ptrbit returns the i'th bit in bv.
+// ptrbit is less efficient than iterating directly over bitvector bits,
+// and should only be used in non-performance-critical code.
+// See adjustpointers for an example of a high-efficiency walk of a bitvector.
+func (bv *bitvector) ptrbit(i uintptr) uint8 {
+	b := *(addb(bv.bytedata, i/8))
+	return (b >> (i % 8)) & 1
 }

 // bv describes the memory starting at address scanp.
 // Adjust any pointers contained therein.
-func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f funcInfo) {
-	bv := gobv(*cbv)
+func adjustpointers(scanp unsafe.Pointer, bv *bitvector, adjinfo *adjustinfo, f funcInfo) {
 	minp := adjinfo.old.lo
 	maxp := adjinfo.old.hi
 	delta := adjinfo.delta
-	num := bv.n
+	num := uintptr(bv.n)
 	// If this frame might contain channel receive slots, use CAS
 	// to adjust pointers. If the slot hasn't been received into
 	// yet, it may contain stack pointers and a concurrent send
 	// could race with adjusting those pointers. (The sent value
 	// itself can never contain stack pointers.)
 	useCAS := uintptr(scanp) < adjinfo.sghi
-	for i := uintptr(0); i < num; i++ {
+	for i := uintptr(0); i < num; i += 8 {
 		if stackDebug >= 4 {
-			print("        ", add(scanp, i*sys.PtrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*sys.PtrSize))), " # ", i, " ", bv.bytedata[i/8], "\n")
-		}
-		if ptrbit(&bv, i) != 1 {
-			continue
-		}
-		pp := (*uintptr)(add(scanp, i*sys.PtrSize))
-	retry:
-		p := *pp
-		if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 {
-			// Looks like a junk value in a pointer slot.
-			// Live analysis wrong?
-			getg().m.traceback = 2
-			print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n")
-			throw("invalid pointer found on stack")
-		}
-		if minp <= p && p < maxp {
-			if stackDebug >= 3 {
-				print("adjust ptr ", hex(p), " ", funcname(f), "\n")
+			for j := uintptr(0); j < 8; j++ {
+				print("        ", add(scanp, (i+j)*sys.PtrSize), ":", ptrnames[bv.ptrbit(i+j)], ":", hex(*(*uintptr)(add(scanp, (i+j)*sys.PtrSize))), " # ", i, " ", *addb(bv.bytedata, i/8), "\n")
 			}
-			if useCAS {
-				ppu := (*unsafe.Pointer)(unsafe.Pointer(pp))
-				if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) {
-					goto retry
+		}
+		b := *(addb(bv.bytedata, i/8))
+		for b != 0 {
+			j := uintptr(sys.Ctz8(b))
+			b &= b - 1
+			pp := (*uintptr)(add(scanp, (i+j)*sys.PtrSize))
+		retry:
+			p := *pp
+			if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 {
+				// Looks like a junk value in a pointer slot.
+				// Live analysis wrong?
+				getg().m.traceback = 2
+				print("runtime: bad pointer in frame ", funcname(f), " at ", pp, ": ", hex(p), "\n")
+				throw("invalid pointer found on stack")
+			}
+			if minp <= p && p < maxp {
+				if stackDebug >= 3 {
+					print("adjust ptr ", hex(p), " ", funcname(f), "\n")
+				}
+				if useCAS {
+					ppu := (*unsafe.Pointer)(unsafe.Pointer(pp))
+					if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) {
+						goto retry
+					}
+				} else {
+					*pp = p + delta
 				}
-			} else {
-				*pp = p + delta
 			}
 		}
 	}