From 54af9a3ba5f3e656077eab2e9305cbbb41d7b154 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Mon, 4 May 2015 22:53:54 -0400
Subject: [PATCH] runtime: reintroduce ``dead'' space during GC scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reintroduce an optimization discarded during the initial conversion
from 4-bit heap bitmaps to 2-bit heap bitmaps: when we reach the
place in the bitmap where there are no more pointers, mark that
position for the GC so that it can avoid scanning past that place.

During heapBitsSetType we can also avoid initializing heap bitmap
beyond that location, which gives a bit of a win compared to Go 1.4.
This particular optimization (not initializing the heap bitmap) may
not last: we might change typedmemmove to use the heap bitmap, in
which case it would all need to be initialized. The early stop in
the GC scan will stay no matter what.

Compared to Go 1.4 (github.com/rsc/go, branch go14bench):

name                    old mean               new mean               delta
SetTypeNode64           80.7ns × (1.00,1.01)   57.4ns × (1.00,1.01)  -28.83% (p=0.000)
SetTypeNode64Dead       80.5ns × (1.00,1.01)   13.1ns × (0.99,1.02)  -83.77% (p=0.000)
SetTypeNode64Slice      2.16µs × (1.00,1.01)   1.54µs × (1.00,1.01)  -28.75% (p=0.000)
SetTypeNode64DeadSlice  2.16µs × (1.00,1.01)   1.52µs × (1.00,1.00)  -29.74% (p=0.000)

Compared to previous CL:

name                    old mean               new mean               delta
SetTypeNode64           56.7ns × (1.00,1.00)   57.4ns × (1.00,1.01)   +1.19% (p=0.000)
SetTypeNode64Dead       57.2ns × (1.00,1.00)   13.1ns × (0.99,1.02)  -77.15% (p=0.000)
SetTypeNode64Slice      1.56µs × (1.00,1.01)   1.54µs × (1.00,1.01)   -0.89% (p=0.000)
SetTypeNode64DeadSlice  1.55µs × (1.00,1.01)   1.52µs × (1.00,1.00)   -2.23% (p=0.000)

This is the last CL in the sequence converting from the 4-bit heap
to the 2-bit heap, with all the same optimizations reenabled.
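The dead-marker rule is easiest to see in code. Below is a standalone toy of
the early-stop scan, not the runtime's actual marker loop: the real bitmap
packs four 2-bit entries per byte, while here each entry gets a whole byte and
all names are invented for illustration.

    package main

    import "fmt"

    const bitPointer = 1 // low bit of each 2-bit entry: "this word holds a pointer"

    // scanWords returns how many words the GC looks at before the all-zero
    // "dead" entry (legal only from word 2 onward) stops the scan.
    func scanWords(bitmap []byte, nwords int) int {
        for i := 0; i < nwords; i++ {
            bits := bitmap[i]
            if i >= 2 && bits == 0 {
                return i // dead marker: no more pointers in this object
            }
            if bits&bitPointer != 0 {
                // a real GC would enqueue the pointer at word i here
            }
        }
        return nwords
    }

    func main() {
        // 8-word object, single pointer in word 0, dead entry at word 2:
        bm := []byte{bitPointer, 0, 0, 0, 0, 0, 0, 0}
        fmt.Println(scanWords(bm, 8)) // 2, not 8
    }

The SetTypeNode64Dead numbers above are the writing-side twin of this cutoff:
heapBitsSetType stops initializing bitmap entries at the dead marker instead
of filling in the whole scalar tail.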
Compared to before that process began (compared to CL 9701 patch set 1):

name                    old mean               new mean               delta
BinaryTree17             5.87s × (0.94,1.09)    5.91s × (0.96,1.06)     ~    (p=0.578)
Fannkuch11               4.32s × (1.00,1.00)    4.32s × (1.00,1.00)     ~    (p=0.474)
FmtFprintfEmpty         89.1ns × (0.95,1.16)   89.0ns × (0.93,1.10)     ~    (p=0.942)
FmtFprintfString         283ns × (0.98,1.02)    298ns × (0.98,1.06)   +5.33% (p=0.000)
FmtFprintfInt            284ns × (0.98,1.04)    286ns × (0.98,1.03)     ~    (p=0.208)
FmtFprintfIntInt         486ns × (0.98,1.03)    498ns × (0.97,1.06)   +2.48% (p=0.000)
FmtFprintfPrefixedInt    400ns × (0.99,1.02)    408ns × (0.98,1.02)   +2.23% (p=0.000)
FmtFprintfFloat          566ns × (0.99,1.01)    587ns × (0.98,1.01)   +3.69% (p=0.000)
FmtManyArgs             1.91µs × (0.99,1.02)   1.94µs × (0.99,1.02)   +1.81% (p=0.000)
GobDecode               15.5ms × (0.98,1.05)   15.8ms × (0.98,1.03)   +1.94% (p=0.002)
GobEncode               11.9ms × (0.97,1.03)   12.0ms × (0.96,1.09)     ~    (p=0.263)
Gzip                     648ms × (0.99,1.01)    648ms × (0.99,1.01)     ~    (p=0.992)
Gunzip                   143ms × (1.00,1.00)    143ms × (1.00,1.01)     ~    (p=0.585)
HTTPClientServer        89.2µs × (0.99,1.02)   90.3µs × (0.98,1.01)   +1.24% (p=0.000)
JSONEncode              32.3ms × (0.97,1.06)   31.6ms × (0.99,1.01)   -2.29% (p=0.000)
JSONDecode               106ms × (0.99,1.01)    107ms × (1.00,1.01)   +0.62% (p=0.000)
Mandelbrot200           6.02ms × (1.00,1.00)   6.03ms × (1.00,1.01)     ~    (p=0.250)
GoParse                 6.57ms × (0.97,1.06)   6.53ms × (0.99,1.03)     ~    (p=0.243)
RegexpMatchEasy0_32      162ns × (1.00,1.00)    161ns × (1.00,1.01)   -0.80% (p=0.000)
RegexpMatchEasy0_1K      561ns × (0.99,1.02)    541ns × (0.99,1.01)   -3.67% (p=0.000)
RegexpMatchEasy1_32      145ns × (0.95,1.04)    138ns × (1.00,1.00)   -5.04% (p=0.000)
RegexpMatchEasy1_1K      864ns × (0.99,1.04)    887ns × (0.99,1.01)   +2.57% (p=0.000)
RegexpMatchMedium_32     255ns × (0.99,1.04)    253ns × (0.99,1.01)   -1.05% (p=0.012)
RegexpMatchMedium_1K    73.9µs × (0.98,1.04)   72.8µs × (1.00,1.00)   -1.51% (p=0.005)
RegexpMatchHard_32      3.92µs × (0.98,1.04)   3.85µs × (1.00,1.01)   -1.88% (p=0.002)
RegexpMatchHard_1K       120µs × (0.98,1.04)    117µs × (1.00,1.01)   -2.02% (p=0.001)
Revcomp                  936ms × (0.95,1.08)    922ms × (0.97,1.08)     ~    (p=0.234)
Template                 130ms × (0.98,1.04)    126ms × (0.99,1.01)   -2.99% (p=0.000)
TimeParse                638ns × (0.98,1.05)    628ns × (0.99,1.01)   -1.54% (p=0.004)
TimeFormat               674ns × (0.99,1.01)    668ns × (0.99,1.01)   -0.80% (p=0.001)

The slowdown of the first few benchmarks seems to be due to the new
atomic operations for certain small size allocations. But the larger
benchmarks mostly improve, probably due to the decreased memory
pressure from having half as much heap bitmap.

CL 9706, which removes the (never used anymore) wbshadow mode,
gets back what is lost in the early microbenchmarks.

Change-Id: I37423a209e8ec2a2e92538b45cac5422a6acd32d
Reviewed-on: https://go-review.googlesource.com/9705
Reviewed-by: Rick Hudson <rlh@golang.org>
---
 src/cmd/internal/gc/reflect.go   |  31 +++---
 src/cmd/internal/ld/data.go      |  20 ++--
 src/cmd/internal/ld/decodesym.go |   5 +
 src/runtime/gcinfo_test.go       |  43 ++++----
 src/runtime/mbitmap.go           | 168 +++++++++++++++++++++----------
 5 files changed, 163 insertions(+), 104 deletions(-)
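To put a number on "half as much heap bitmap": the bitmap cost per heap word
went from 4 bits to 2 bits, so the bitmap halves for any heap size. A
back-of-the-envelope sketch (the 1 GB figure is an arbitrary example, not a
measurement from this CL):

    package main

    import "fmt"

    func main() {
        const heapBytes = 1 << 30 // say, a 1 GB heap
        const wordBytes = 8       // one word per 8 bytes on 64-bit
        words := heapBytes / wordBytes
        fmt.Println("4 bits/word (Go 1.4):  ", words*4/8, "bytes of bitmap") // 64 MB
        fmt.Println("2 bits/word (this CL): ", words*2/8, "bytes of bitmap") // 32 MB
    }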
diff --git a/src/cmd/internal/gc/reflect.go b/src/cmd/internal/gc/reflect.go
index 6ff9df2cfc..061b17b3ae 100644
--- a/src/cmd/internal/gc/reflect.go
+++ b/src/cmd/internal/gc/reflect.go
@@ -687,7 +687,7 @@ func haspointers(t *Type) bool {
 
 // typeptrdata returns the length in bytes of the prefix of t
 // containing pointer data. Anything after this offset is scalar data.
-func typeptrdata(t *Type) uint64 {
+func typeptrdata(t *Type) int64 {
 	if !haspointers(t) {
 		return 0
 	}
@@ -699,24 +699,24 @@ func typeptrdata(t *Type) uint64 {
 		TFUNC,
 		TCHAN,
 		TMAP:
-		return uint64(Widthptr)
+		return int64(Widthptr)
 
 	case TSTRING:
 		// struct { byte *str; intgo len; }
-		return uint64(Widthptr)
+		return int64(Widthptr)
 
 	case TINTER:
 		// struct { Itab *tab; void *data; } or
 		// struct { Type *type; void *data; }
-		return 2 * uint64(Widthptr)
+		return 2 * int64(Widthptr)
 
 	case TARRAY:
 		if Isslice(t) {
 			// struct { byte *array; uintgo len; uintgo cap; }
-			return uint64(Widthptr)
+			return int64(Widthptr)
 		}
 		// haspointers already eliminated t.Bound == 0.
-		return uint64(t.Bound-1)*uint64(t.Type.Width) + typeptrdata(t.Type)
+		return (t.Bound-1)*t.Type.Width + typeptrdata(t.Type)
 
 	case TSTRUCT:
 		// Find the last field that has pointers.
@@ -726,7 +726,7 @@ func typeptrdata(t *Type) uint64 {
 				lastPtrField = t1
 			}
 		}
-		return uint64(lastPtrField.Width) + typeptrdata(lastPtrField.Type)
+		return lastPtrField.Width + typeptrdata(lastPtrField.Type)
 
 	default:
 		Fatal("typeptrdata: unexpected type, %v", t)
@@ -794,7 +794,7 @@ func dcommontype(s *Sym, ot int, t *Type) int {
 	//		zero          unsafe.Pointer
 	//	}
 	ot = duintptr(s, ot, uint64(t.Width))
-	ot = duintptr(s, ot, typeptrdata(t))
+	ot = duintptr(s, ot, uint64(typeptrdata(t)))
 	ot = duint32(s, ot, typehash(t))
 
 	ot = duint8(s, ot, 0) // unused
@@ -1428,17 +1428,12 @@ func usegcprog(t *Type) bool {
 	}
 
 	// Calculate size of the unrolled GC mask.
-	nptr := (t.Width + int64(Widthptr) - 1) / int64(Widthptr)
-
-	size := (nptr + 7) / 8
+	nptr := typeptrdata(t) / int64(Widthptr)
 
 	// Decide whether to use unrolled GC mask or GC program.
 	// We could use a more elaborate condition, but this seems to work well in practice.
-	// For small objects GC program can't give significant reduction.
-	// While large objects usually contain arrays; and even if it don't
-	// the program uses 2-bits per word while mask uses 4-bits per word,
-	// so the program is still smaller.
-	return size > int64(2*Widthptr)
+	// For small objects, the GC program can't give significant reduction.
+	return nptr > int64(2*Widthptr*8)
 }
 
 // Generates GC bitmask (1 bit per word).
@@ -1450,11 +1445,11 @@ func gengcmask(t *Type, gcmask []byte) {
 		return
 	}
 
-	vec := bvalloc(2 * int32(Widthptr) * 8)
+	vec := bvalloc(int32(2 * Widthptr * 8))
 	xoffset := int64(0)
 	onebitwalktype1(t, &xoffset, vec)
 
-	nptr := (t.Width + int64(Widthptr) - 1) / int64(Widthptr)
+	nptr := typeptrdata(t) / int64(Widthptr)
 	for i := int64(0); i < nptr; i++ {
 		if bvget(vec, int32(i)) == 1 {
 			gcmask[i/8] |= 1 << (uint(i) % 8)
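The new ptrdata plumbing pays off for header-plus-buffer types. A
freestanding illustration (Node and its sizes are invented; the real
computation runs over the compiler's *Type, but plain size arithmetic shows
the same split):

    package main

    import (
        "fmt"
        "unsafe"
    )

    // Node mimics the shape the CL optimizes for: a one-word pointer
    // prefix followed by a large scalar tail.
    type Node struct {
        next *Node      // pointer data: first 8 bytes on 64-bit
        buf  [1024]byte // scalar tail: never needs scanning
    }

    func main() {
        size := unsafe.Sizeof(Node{}) // 1032 on 64-bit
        // typeptrdata-style result: offset of the last pointer field
        // plus the pointer data of its type.
        ptrdata := unsafe.Offsetof(Node{}.next) + unsafe.Sizeof(&Node{}) // 0 + 8
        fmt.Println(size, ptrdata) // 1032 8
    }

With ptrdata = 8, the new usegcprog condition computes nptr = 1, far below
the 2*Widthptr*8 = 128 cutoff, so this type gets a tiny unrolled mask; the
old size-based test computed nptr = 129, size = (129+7)/8 = 17 > 16, and
would have chosen a GC program for the same type.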
diff --git a/src/cmd/internal/ld/data.go b/src/cmd/internal/ld/data.go
index 676c8856de..37d458802f 100644
--- a/src/cmd/internal/ld/data.go
+++ b/src/cmd/internal/ld/data.go
@@ -1109,8 +1109,7 @@ func proggenaddsym(g *ProgGen, s *LSym) {
 
 	// Skip alignment hole from the previous symbol.
 	proggenskip(g, g.pos, s.Value-g.pos)
-
-	g.pos += s.Value - g.pos
+	g.pos = s.Value
 
 	// The test for names beginning with . here is meant
 	// to keep .dynamic and .dynsym from turning up as
@@ -1142,16 +1141,16 @@ func proggenaddsym(g *ProgGen, s *LSym) {
 			proggendata(g, 0)
 			proggenarrayend(g)
 		}
-		g.pos = s.Value + s.Size
 	} else if decodetype_usegcprog(s.Gotype) != 0 {
 		// gc program, copy directly
+		// TODO(rsc): Maybe someday the gc program will only describe
+		// the first decodetype_ptrdata(s.Gotype) bytes instead of the full size.
 		proggendataflush(g)
-
 		gcprog := decodetype_gcprog(s.Gotype)
 		size := decodetype_size(s.Gotype)
 		if (size%int64(Thearch.Ptrsize) != 0) || (g.pos%int64(Thearch.Ptrsize) != 0) {
-			Diag("proggenaddsym: unaligned gcprog symbol %s: size=%d pos=%d", s.Name, s.Size, g.pos)
+			Diag("proggenaddsym: unaligned gcprog symbol %s: size=%d pos=%d", s.Name, size, g.pos)
 		}
 		for i := int64(0); i < int64(len(gcprog.P)-1); i++ {
 			proggenemit(g, uint8(gcprog.P[i]))
@@ -1160,16 +1159,15 @@ func proggenaddsym(g *ProgGen, s *LSym) {
 	} else {
 		// gc mask, it's small so emit as data
 		mask := decodetype_gcmask(s.Gotype)
-
-		size := decodetype_size(s.Gotype)
-		if (size%int64(Thearch.Ptrsize) != 0) || (g.pos%int64(Thearch.Ptrsize) != 0) {
-			Diag("proggenaddsym: unaligned gcmask symbol %s: size=%d pos=%d", s.Name, s.Size, g.pos)
+		ptrdata := decodetype_ptrdata(s.Gotype)
+		if (ptrdata%int64(Thearch.Ptrsize) != 0) || (g.pos%int64(Thearch.Ptrsize) != 0) {
+			Diag("proggenaddsym: unaligned gcmask symbol %s: size=%d pos=%d", s.Name, ptrdata, g.pos)
 		}
-		for i := int64(0); i < size; i += int64(Thearch.Ptrsize) {
+		for i := int64(0); i < ptrdata; i += int64(Thearch.Ptrsize) {
 			word := uint(i / int64(Thearch.Ptrsize))
 			proggendata(g, (mask[word/8]>>(word%8))&1)
 		}
-		g.pos = s.Value + size
+		g.pos = s.Value + ptrdata
 	}
 }
 
diff --git a/src/cmd/internal/ld/decodesym.go b/src/cmd/internal/ld/decodesym.go
index 754c89f12b..b9333857fd 100644
--- a/src/cmd/internal/ld/decodesym.go
+++ b/src/cmd/internal/ld/decodesym.go
@@ -67,6 +67,11 @@ func decodetype_size(s *LSym) int64 {
 	return int64(decode_inuxi(s.P, Thearch.Ptrsize)) // 0x8 / 0x10
 }
 
+// Type.commonType.ptrdata
+func decodetype_ptrdata(s *LSym) int64 {
+	return int64(decode_inuxi(s.P[Thearch.Ptrsize:], Thearch.Ptrsize)) // 0x8 / 0x10
+}
+
 // Type.commonType.gc
 func decodetype_gcprog(s *LSym) *LSym {
 	if s.Type == obj.SDYNIMPORT {
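The gcmask branch's indexing, word := i/Ptrsize followed by
(mask[word/8]>>(word%8))&1, now runs only over the ptrdata prefix. Pulled out
into a runnable toy (mask contents and sizes are made up for illustration):

    package main

    import "fmt"

    func main() {
        const ptrSize = 8
        // 1-bit-per-word pointer mask for a 10-word prefix, LSB first:
        // byte 0 covers words 0-7, byte 1 covers words 8-15.
        mask := []byte{0x01, 0x02}          // pointers at words 0 and 9
        ptrdata := int64(10 * ptrSize)       // bytes of pointer-bearing prefix

        for i := int64(0); i < ptrdata; i += ptrSize {
            word := uint(i / ptrSize)
            bit := (mask[word/8] >> (word % 8)) & 1
            fmt.Printf("word %d: %d\n", word, bit) // 1 only for words 0 and 9
        }
    }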
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index dd5c25e0b1..7618d86a45 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -34,23 +34,23 @@ func TestGCInfo(t *testing.T) {
 	verifyGCInfo(t, "data eface", &dataEface, infoEface)
 	verifyGCInfo(t, "data iface", &dataIface, infoIface)
 
-	verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), nonStackInfo(infoScalarPtr))
-	verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), nonStackInfo(infoPtrScalar))
-	verifyGCInfo(t, "stack BigStruct", new(BigStruct), nonStackInfo(infoBigStruct()))
-	verifyGCInfo(t, "stack string", new(string), nonStackInfo(infoString))
-	verifyGCInfo(t, "stack slice", new([]string), nonStackInfo(infoSlice))
-	verifyGCInfo(t, "stack eface", new(interface{}), nonStackInfo(infoEface))
-	verifyGCInfo(t, "stack iface", new(Iface), nonStackInfo(infoIface))
+	verifyGCInfo(t, "stack ScalarPtr", new(ScalarPtr), infoScalarPtr)
+	verifyGCInfo(t, "stack PtrScalar", new(PtrScalar), infoPtrScalar)
+	verifyGCInfo(t, "stack BigStruct", new(BigStruct), infoBigStruct())
+	verifyGCInfo(t, "stack string", new(string), infoString)
+	verifyGCInfo(t, "stack slice", new([]string), infoSlice)
+	verifyGCInfo(t, "stack eface", new(interface{}), infoEface)
+	verifyGCInfo(t, "stack iface", new(Iface), infoIface)
 
 	for i := 0; i < 10; i++ {
-		verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), infoPtr10)
-		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr)
-		verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), infoScalarPtr4)
-		verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar)
-		verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), infoBigStruct())
-		verifyGCInfo(t, "heap string", escape(new(string)), infoString)
-		verifyGCInfo(t, "heap eface", escape(new(interface{})), infoEface)
-		verifyGCInfo(t, "heap iface", escape(new(Iface)), infoIface)
+		verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), trimDead(infoPtr10))
+		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), trimDead(infoScalarPtr))
+		verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), trimDead(infoScalarPtr4))
+		verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), trimDead(infoPtrScalar))
+		verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), trimDead(infoBigStruct()))
+		verifyGCInfo(t, "heap string", escape(new(string)), trimDead(infoString))
+		verifyGCInfo(t, "heap eface", escape(new(interface{})), trimDead(infoEface))
+		verifyGCInfo(t, "heap iface", escape(new(Iface)), trimDead(infoIface))
 	}
 }
 
@@ -67,16 +67,11 @@ func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
 	}
 }
 
-func nonStackInfo(mask []byte) []byte {
-	// typeDead is replaced with typeScalar everywhere except stacks.
-	mask1 := make([]byte, len(mask))
-	for i, v := range mask {
-		if v == typeDead {
-			v = typeScalar
-		}
-		mask1[i] = v
+func trimDead(mask []byte) []byte {
+	for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
+		mask = mask[:len(mask)-1]
 	}
-	return mask1
+	return mask
 }
 
 var gcinfoSink interface{}
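trimDead is small enough to exercise on its own. A toy with stand-in
constants (the real typeScalar/typePointer values live in the runtime's test
support; these bytes are illustrative only):

    package main

    import "fmt"

    // Stand-ins for the runtime test's per-word mask values.
    const (
        typeScalar  = 1
        typePointer = 2
    )

    // trimDead, as in the test above: drop the trailing scalar words
    // that the heap bitmap no longer records, but keep at least the
    // first two entries, since the dead marker starts at word 2.
    func trimDead(mask []byte) []byte {
        for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
            mask = mask[:len(mask)-1]
        }
        return mask
    }

    func main() {
        info := []byte{typePointer, typeScalar, typeScalar, typeScalar}
        fmt.Println(trimDead(info)) // [2 1]: heap mask ends at the dead marker
    }

The len(mask) > 2 floor matches the bitmap encoding: the first two words of
an object can never carry a dead marker, so the reported heap mask always has
at least two entries.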
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index b866d7f732..61e1254bed 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -491,6 +491,8 @@ func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 // but if the start or end of x shares a bitmap byte with an adjacent
 // object, the GC marker is racing with updates to those object's mark bits.
 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
+	const doubleCheck = false // slow but helpful; enable to test modifications to this function
+
 	// From here till marked label marking the object as allocated
 	// and storing type info in the GC bitmap.
 	h := heapBitsForAddr(x)
@@ -518,7 +520,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	ptrmask := (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
 
 	if typ.kind&kindGCProg != 0 {
-		nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+		nptr := typ.ptrdata / ptrSize
 		masksize := (nptr + 7) / 8
 		masksize++ // unroll flag in the beginning
 		if masksize > maxGCMask && typ.gc[1] != 0 {
@@ -568,21 +570,56 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// In general, one load can supply two bitmap byte writes.
 	// This is a lot of lines of code, but it compiles into relatively few
 	// machine instructions.
+
+	// Ptrmask buffer.
 	var (
 		p     *byte   // last ptrmask byte read
 		b     uintptr // ptrmask bits already loaded
-		nb    uint32  // number of bits in b at next read
+		nb    uintptr // number of bits in b at next read
 		endp  *byte   // final ptrmask byte to read (then repeat)
-		endnb uint32  // number of valid bits in *endp
+		endnb uintptr // number of valid bits in *endp
 		pbits uintptr // alternate source of bits
 	)
+
+	// Note about sizes:
+	//
+	// typ.size is the number of words in the object,
+	// and typ.ptrdata is the number of words in the prefix
+	// of the object that contains pointers. That is, the final
+	// typ.size - typ.ptrdata words contain no pointers.
+	// This allows optimization of a common pattern where
+	// an object has a small header followed by a large scalar
+	// buffer. If we know the pointers are over, we don't have
+	// to scan the buffer's heap bitmap at all.
+	// The 1-bit ptrmasks are sized to contain only bits for
+	// the typ.ptrdata prefix, zero padded out to a full byte
+	// of bitmap. This code sets nw (below) so that heap bitmap
+	// bits are only written for the typ.ptrdata prefix; if there is
+	// more room in the allocated object, the next heap bitmap
+	// entry is a 00, indicating that there are no more pointers
+	// to scan. So only the ptrmask for the ptrdata bytes is needed.
+	//
+	// Replicated copies are not as nice: if there is an array of
+	// objects with scalar tails, all but the last tail does have to
+	// be initialized, because there is no way to say "skip forward".
+	// However, because of the possibility of a repeated type with
+	// size not a multiple of 4 pointers (one heap bitmap byte),
+	// the code already must handle the last ptrmask byte specially
+	// by treating it as containing only the bits for endnb pointers,
+	// where endnb <= 4. We represent large scalar tails that must
+	// be expanded in the replication by setting endnb larger than 4.
+	// This will have the effect of reading many bits out of b,
+	// but once the real bits are shifted out, b will supply as many
+	// zero bits as we try to read, which is exactly what we need.
+
 	p = ptrmask
 	if typ.size < dataSize {
 		// Filling in bits for an array of typ.
 		// Set up for repetition of ptrmask during main loop.
-		if typ.size/ptrSize+7 <= ptrSize*8 {
-			// Entire ptrmask + a leftover fragment fits in uintptr.
+		// Note that ptrmask describes only a prefix of
+		const maxBits = ptrSize*8 - 7
+		if typ.ptrdata/ptrSize <= maxBits {
+			// Entire ptrmask fits in uintptr with room for a byte fragment.
 			// Load into pbits and never read from ptrmask again.
 			// This is especially important when the ptrmask has
 			// fewer than 8 bits in it; otherwise the reload in the middle
 			// of the Phase 2 loop would itself need to loop to gather
 			// at least 8 bits.
 
 			// Accumulate ptrmask into b.
-			nb = uint32(typ.size / ptrSize)
-			for i := uint32(0); i < nb; i += 8 {
+			// ptrmask is sized to describe only typ.ptrdata, but we record
+			// it as describing typ.size bytes, since all the high bits are zero.
+			nb = typ.ptrdata / ptrSize
+			for i := uintptr(0); i < nb; i += 8 {
 				b |= uintptr(*p) << i
 				p = addb(p, 1)
 			}
+			nb = typ.size / ptrSize
 
 			// Replicate ptrmask to fill entire pbits uintptr.
 			// Doubling and truncating is fewer steps than
 			// iterating by nb each time. (nb could be 1.)
+			// Since we loaded typ.ptrdata/ptrSize bits
+			// but are pretending to have typ.size/ptrSize,
+			// there might be no replication necessary/possible.
 			pbits = b
 			endnb = nb
-			for endnb <= ptrSize*8 {
-				pbits |= pbits << endnb
-				endnb += endnb
+			if nb+nb <= maxBits {
+				for endnb <= ptrSize*8 {
+					pbits |= pbits << endnb
+					endnb += endnb
+				}
+				// Truncate to a multiple of original ptrmask.
+				endnb = maxBits / nb * nb
+				pbits &= 1<<endnb - 1
 			}
 			b >>= 4
 			nb -= 4
-		case 4:
+		case ptrSize == 8 && h.shift == 4:
 			// Ptrmask and heap bitmap are misaligned.
 			// The bits for the first two words are in a byte shared with another object
 			// and must be updated atomically.
@@ -679,17 +741,13 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			// Note: no bitMarker in hb because the first two words don't get markers from us.
 			atomicor8(hbitp, uint8(hb))
 			hbitp = subtractb(hbitp, 1)
-
-			// Expand 8-bit chunks of ptrmask into pairs of heap bitmap bytes.
-			// We know the object size is a multiple of 2 words but not 4, so the
-			// object size minus the 2 words we just handled is a multiple of 4,
-			// so we can use non-atomic writes to the heap bitmap for the
-			// rest of this code, even for the final fragment or a trailing dead marker byte.
-
-			// Loop prepares bits for final byte but stops before writing them,
-			// so that in the case where we need to write only part of a byte,
-			// the code below the loop can truncate the bitMarked.
-			w += 2
+			if w += 2; w >= nw {
+				// We know that there is more data, because we handled 2-word objects above.
+				// This must be at least a 6-word object. If we're out of pointer words,
+				// mark no scan in next bitmap byte and finish.
+				*hbitp = 0
+				goto Phase4
+			}
 		}
 
 		// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
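The doubling-and-truncating replication in the setup above is worth seeing
concretely. A standalone sketch with an invented 3-bit pattern, using uint64
in place of uintptr:

    package main

    import "fmt"

    func main() {
        const ptrSize = 8
        var b uint64 = 0b101 // 3-bit ptrmask pattern to replicate
        nb := uint(3)

        // Double the valid bits until they overfill the word,
        // instead of appending nb bits at a time (nb could be 1).
        pbits, endnb := b, nb
        for endnb <= ptrSize*8 {
            pbits |= pbits << endnb
            endnb += endnb
        }
        // Truncate back to a whole number of copies of the pattern.
        const maxBits = ptrSize*8 - 7 // 57
        endnb = maxBits / nb * nb     // 57
        pbits &= 1<<endnb - 1
        fmt.Printf("%b\n", pbits) // 19 back-to-back copies of 101
    }

Each pass doubles the number of valid bits, so filling a 64-bit word takes
about six shifts even when nb is 1, and the final mask holds a whole number
of copies of the original ptrmask.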
@@ -792,24 +850,27 @@ Phase3:
 		}
 	}
 
-	const test = false // slow but helpful
-	if test {
+Phase4:
+	// Phase 4: all done (goto target).
+
+	if doubleCheck {
 		// Double-check that bits to be written were written correctly.
 		// Does not check that other bits were not written, unfortunately.
 		h := heapBitsForAddr(x)
-		nptr := typ.size / ptrSize
+		nptr := typ.ptrdata / ptrSize
+		ndata := typ.size / ptrSize
+		count := dataSize / typ.size
 		for i := uintptr(0); i <= dataSize/ptrSize; i++ {
-			j := i % nptr
+			j := i % ndata
 			var have, want uint8
-			if i == dataSize/ptrSize {
-				if dataSize >= size {
-					break
-				}
-				have = (*h.bitp >> h.shift) & 3
-				want = 0 // dead bits
+			if i == dataSize/ptrSize && dataSize >= size {
+				break
+			}
+			have = (*h.bitp >> h.shift) & 3
+			if i == dataSize/ptrSize || i/ndata == count-1 && j >= nptr {
+				want = 0 // dead marker
 			} else {
-				have = (*h.bitp >> h.shift) & 3
-				if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
 					want |= bitPointer
 				}
 				if i >= 2 {
@@ -820,13 +881,18 @@ Phase3:
 				}
 			}
 			if have != want {
 				println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
-				print("typ.size=", typ.size, " dataSize=", dataSize, " size=", size, "\n")
+				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
+				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 				h = heapBitsForAddr(x)
 				print("initial bits h.bitp=", h.bitp, " h.shift=", h.shift, "\n")
-				print("p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
+				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
 				println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
 				throw("bad heapBitsSetType")
 			}
+			if i >= 2 && want == 0 {
+				// found dead marker; the rest is uninitialized
+				break
+			}
 			h = h.next()
 		}
 	}
@@ -1076,7 +1142,7 @@ func getgcmask(ep interface{}) (mask []byte) {
 				mask[i/ptrSize] = 1
 			}
 			if i >= 2*ptrSize && !hbits.isMarked() {
-				mask[i/ptrSize] = 255
+				mask = mask[:i/ptrSize]
 				break
 			}
 		}
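The getgcmask change at the end is what callers observe: the reported mask
now simply ends at the dead marker rather than carrying a 255 sentinel. A toy
re-creation, with one byte per 2-bit entry and a simplified stand-in for the
runtime's heapBits encoding:

    package main

    import "fmt"

    const ptrSize = 8

    // heapBits is a toy bitmap: one byte per word, bit 0 = pointer,
    // bit 1 = marked/continue. An all-zero entry at word >= 2 is the
    // dead marker: no more pointers in this object.
    type heapBits []byte

    func (h heapBits) isPointer(w int) bool { return h[w]&1 != 0 }
    func (h heapBits) isMarked(w int) bool  { return h[w]&2 != 0 }

    // toyGCMask mirrors the updated getgcmask loop: build one mask byte
    // per word, then truncate at the dead marker (the old code instead
    // wrote a 255 sentinel into the mask and kept its full length).
    func toyGCMask(h heapBits, size int) []byte {
        mask := make([]byte, size/ptrSize)
        for i := 0; i < size; i += ptrSize {
            if h.isPointer(i / ptrSize) {
                mask[i/ptrSize] = 1
            }
            if i >= 2*ptrSize && !h.isMarked(i/ptrSize) {
                mask = mask[:i/ptrSize] // was: mask[i/ptrSize] = 255
                break
            }
        }
        return mask
    }

    func main() {
        // 8-word object: pointer in word 0, dead marker at word 2.
        h := heapBits{1 | 2, 2, 0, 0, 0, 0, 0, 0}
        fmt.Println(toyGCMask(h, 8*ptrSize)) // [1 0]
    }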