From 721c04ae4ef8406f169f7e32d131a8667fc75a33 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek
Date: Wed, 18 Sep 2024 02:38:45 +0000
Subject: [PATCH] runtime: optimize 8-byte allocation pointer data writing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This change brings back a minor optimization that was lost in the
Go 1.22 cycle: spans in the 8-byte pointer-ful size class have their
pointer bitmaps written ahead of time in bulk, because only one bit
pattern is possible for them.

                    │   before    │                after                │
                    │   sec/op    │   sec/op     vs base                │
MallocTypeInfo8-4     25.13n ± 1%   23.59n ± 2%  -6.15% (p=0.002 n=6)

Change-Id: I135b84bb1d5b7e678b841b56430930bc73c0a038
Reviewed-on: https://go-review.googlesource.com/c/go/+/614256
Reviewed-by: Keith Randall
Auto-Submit: Michael Knyszek
Reviewed-by: Keith Randall
LUCI-TryBot-Result: Go LUCI
---
 src/runtime/arena.go    |  2 +-
 src/runtime/mbitmap.go  | 23 ++++++++++++++---------
 src/runtime/mcache.go   |  2 +-
 src/runtime/mcentral.go |  2 +-
 4 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/runtime/arena.go b/src/runtime/arena.go
index 936e3604bf..9ba6c68f80 100644
--- a/src/runtime/arena.go
+++ b/src/runtime/arena.go
@@ -1088,7 +1088,7 @@ func (h *mheap) allocUserArenaChunk() *mspan {
 
 	// This must clear the entire heap bitmap so that it's safe
 	// to allocate noscan data without writing anything out.
-	s.initHeapBits(true)
+	s.initHeapBits()
 
 	// Clear the span preemptively. It's an arena chunk, so let's assume
 	// everything is going to be used.
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index bae008b432..65590f5c22 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -535,12 +535,13 @@ func bulkBarrierPreWriteSrcOnly(dst, src, size uintptr, typ *abi.Type) {
 }
 
 // initHeapBits initializes the heap bitmap for a span.
-//
-// TODO(mknyszek): This should set the heap bits for single pointer
-// allocations eagerly to avoid calling heapSetType at allocation time,
-// just to write one bit.
-func (s *mspan) initHeapBits(forceClear bool) {
-	if (!s.spanclass.noscan() && heapBitsInSpan(s.elemsize)) || s.isUserArenaChunk {
+func (s *mspan) initHeapBits() {
+	if goarch.PtrSize == 8 && !s.spanclass.noscan() && s.spanclass.sizeclass() == 1 {
+		b := s.heapBits()
+		for i := range b {
+			b[i] = ^uintptr(0)
+		}
+	} else if (!s.spanclass.noscan() && heapBitsInSpan(s.elemsize)) || s.isUserArenaChunk {
 		b := s.heapBits()
 		clear(b)
 	}
@@ -639,16 +640,20 @@ func (span *mspan) heapBitsSmallForAddr(addr uintptr) uintptr {
 //
 //go:nosplit
 func (span *mspan) writeHeapBitsSmall(x, dataSize uintptr, typ *_type) (scanSize uintptr) {
+	if goarch.PtrSize == 8 && dataSize == goarch.PtrSize {
+		// Already set by initHeapBits.
+		return
+	}
+
 	// The objects here are always really small, so a single load is sufficient.
 	src0 := readUintptr(typ.GCData)
 
 	// Create repetitions of the bitmap if we have a small slice backing store.
 	scanSize = typ.PtrBytes
 	src := src0
-	switch typ.Size_ {
-	case goarch.PtrSize:
+	if typ.Size_ == goarch.PtrSize {
 		src = (1 << (dataSize / goarch.PtrSize)) - 1
-	default:
+	} else {
 		for i := typ.Size_; i < dataSize; i += typ.Size_ {
 			src |= src0 << (i / goarch.PtrSize)
 			scanSize += typ.Size_
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 97a5f70e9c..51c496fed3 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -252,7 +252,7 @@ func (c *mcache) allocLarge(size uintptr, noscan bool) *mspan {
 	// Put the large span in the mcentral swept list so that it's
 	// visible to the background sweeper.
 	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
 	s.limit = s.base() + size
-	s.initHeapBits(false)
+	s.initHeapBits()
 	return s
 }
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 1a4819bc2c..08ff0a5c5d 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -260,6 +260,6 @@ func (c *mcentral) grow() *mspan {
 	// n := (npages << _PageShift) / size
 	n := s.divideByElemSize(npages << _PageShift)
 	s.limit = s.base() + size*n
-	s.initHeapBits(false)
+	s.initHeapBits()
 	return s
 }
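
For context, here is a minimal standalone sketch (not part of the CL) of why a
single bulk pattern suffices: on a 64-bit platform, every 8-byte object in a
pointer-ful size-class-1 span is exactly one pointer word, so every bit of the
span's heap bitmap ends up set. The span size and constants below are
illustrative assumptions, not the runtime's internal values.

// Standalone illustration of the bulk heap-bitmap pre-fill idea.
// Assumes 8 KiB spans of 8-byte objects purely for demonstration.
package main

import "fmt"

const (
	ptrSize      = 8                   // pointer size on a 64-bit platform
	spanBytes    = 8192                // assumed span size for the 8-byte size class
	wordsPerSpan = spanBytes / ptrSize // 1024 pointer words per span
)

func main() {
	// Per-allocation path: set one heap bit per 8-byte object as it is allocated.
	perObject := make([]uint64, wordsPerSpan/64)
	for obj := 0; obj < wordsPerSpan; obj++ {
		perObject[obj/64] |= 1 << (obj % 64)
	}

	// Bulk path: every object is a single pointer word, so every bit must be 1
	// and the bitmap can be filled with all-ones words up front.
	bulk := make([]uint64, wordsPerSpan/64)
	for i := range bulk {
		bulk[i] = ^uint64(0)
	}

	for i := range bulk {
		if bulk[i] != perObject[i] {
			fmt.Printf("mismatch at bitmap word %d\n", i)
			return
		}
	}
	fmt.Printf("bulk pre-fill matches per-object bits for all %d bitmap words\n", len(bulk))
}

In the CL itself this corresponds to the ^uintptr(0) fill in initHeapBits and
the matching early return in writeHeapBitsSmall, which skips the per-object
bitmap write for 8-byte pointer-ful allocations.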