From 6c2e327e35b6fe838b496ec1e8fb2f597c5bd59a Mon Sep 17 00:00:00 2001
From: Austin Clements <austin@google.com>
Date: Thu, 21 Jul 2022 15:43:11 -0400
Subject: [PATCH] runtime: only acquire trace.lock on the system stack

Currently, trace.lock can be acquired while on a user G and stack
splits can happen while holding trace.lock. That means every lock used
by the stack allocator must be okay to acquire while holding
trace.lock, including various locks related to span allocation. In
turn, we cannot safely emit trace events while holding any
allocation-related locks because this would cause a cycle in the lock
rank graph.

To fix this, require that trace.lock only be acquired on the system
stack, like mheap.lock. This pushes it into the "bottom half" and
eliminates the lock rank relationship between tracing and stack
allocation, making it safe to emit trace events in many more places.

One subtlety is that the trace code has race annotations and uses
maps, which have race annotations. By default, we can't have race
annotations on the system stack, so we borrow the user race context
for these situations.

We'll update the lock graph itself in the next CL.

For #53979. This CL technically fixes the problem, but the lock rank
checker doesn't know that yet.

Change-Id: I9f5187a9c52a67bee4f7064db124b1ad53e5178f
Reviewed-on: https://go-review.googlesource.com/c/go/+/418956
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
---
 src/runtime/trace.go | 200 +++++++++++++++++++++++++++----------------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 0f661493ce6..56fd1ba37b6 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -109,6 +109,8 @@ const (
 
 // trace is global tracing context.
 var trace struct {
+	// trace.lock must only be acquired on the system stack where
+	// stack splits cannot happen while it is held.
 	lock          mutex       // protects the following members
 	lockOwner     *g          // to avoid deadlocks during recursive lock locks
 	enabled       bool        // when set runtime traces events
@@ -385,31 +387,33 @@ func StopTrace() {
 		raceacquire(unsafe.Pointer(&trace.shutdownSema))
 	}
 
-	// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
-	lock(&trace.lock)
-	for _, p := range allp[:cap(allp)] {
-		if p.tracebuf != 0 {
-			throw("trace: non-empty trace buffer in proc")
+	systemstack(func() {
+		// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
+		lock(&trace.lock)
+		for _, p := range allp[:cap(allp)] {
+			if p.tracebuf != 0 {
+				throw("trace: non-empty trace buffer in proc")
+			}
 		}
-	}
-	if trace.buf != 0 {
-		throw("trace: non-empty global trace buffer")
-	}
-	if trace.fullHead != 0 || trace.fullTail != 0 {
-		throw("trace: non-empty full trace buffer")
-	}
-	if trace.reading != 0 || trace.reader.Load() != nil {
-		throw("trace: reading after shutdown")
-	}
-	for trace.empty != 0 {
-		buf := trace.empty
-		trace.empty = buf.ptr().link
-		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
-	}
-	trace.strings = nil
-	trace.shutdown = false
-	trace.cpuLogRead = nil
-	unlock(&trace.lock)
+		if trace.buf != 0 {
+			throw("trace: non-empty global trace buffer")
+		}
+		if trace.fullHead != 0 || trace.fullTail != 0 {
+			throw("trace: non-empty full trace buffer")
+		}
+		if trace.reading != 0 || trace.reader.Load() != nil {
+			throw("trace: reading after shutdown")
+		}
+		for trace.empty != 0 {
+			buf := trace.empty
+			trace.empty = buf.ptr().link
+			sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
+		}
+		trace.strings = nil
+		trace.shutdown = false
+		trace.cpuLogRead = nil
+		unlock(&trace.lock)
+	})
 }
 
 // ReadTrace returns the next chunk of binary tracing data, blocking until data
@@ -419,48 +423,12 @@ func StopTrace() {
 // ReadTrace must be called from one goroutine at a time.
 func ReadTrace() []byte {
 top:
-	// This function may need to lock trace.lock recursively
-	// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
-	// To allow this we use trace.lockOwner.
-	// Also this function must not allocate while holding trace.lock:
-	// allocation can call heap allocate, which will try to emit a trace
-	// event while holding heap lock.
-	lock(&trace.lock)
-	trace.lockOwner = getg()
-
-	if trace.reader.Load() != nil {
-		// More than one goroutine reads trace. This is bad.
-		// But we rather do not crash the program because of tracing,
-		// because tracing can be enabled at runtime on prod servers.
-		trace.lockOwner = nil
-		unlock(&trace.lock)
-		println("runtime: ReadTrace called from multiple goroutines simultaneously")
-		return nil
-	}
-	// Recycle the old buffer.
-	if buf := trace.reading; buf != 0 {
-		buf.ptr().link = trace.empty
-		trace.empty = buf
-		trace.reading = 0
-	}
-	// Write trace header.
-	if !trace.headerWritten {
-		trace.headerWritten = true
-		trace.lockOwner = nil
-		unlock(&trace.lock)
-		return []byte("go 1.19 trace\x00\x00\x00")
-	}
-	// Optimistically look for CPU profile samples. This may write new stack
-	// records, and may write new tracing buffers.
-	if !trace.footerWritten && !trace.shutdown {
-		traceReadCPU()
-	}
-	// Wait for new data.
-	if trace.fullHead == 0 && !trace.shutdown {
-		// We don't simply use a note because the scheduler
-		// executes this goroutine directly when it wakes up
-		// (also a note would consume an M).
-		unlock(&trace.lock)
+	var buf []byte
+	var park bool
+	systemstack(func() {
+		buf, park = readTrace0()
+	})
+	if park {
 		gopark(func(gp *g, _ unsafe.Pointer) bool {
 			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
 				// We're racing with another reader.
@@ -484,6 +452,70 @@ top:
 		goto top
 	}
 
+	return buf
+}
+
+// readTrace0 is ReadTrace's continuation on g0. This must run on the
+// system stack because it acquires trace.lock.
+//
+//go:systemstack
+func readTrace0() (buf []byte, park bool) {
+	if raceenabled {
+		// g0 doesn't have a race context. Borrow the user G's.
+		if getg().racectx != 0 {
+			throw("expected racectx == 0")
+		}
+		getg().racectx = getg().m.curg.racectx
+		// (This defer should get open-coded, which is safe on
+		// the system stack.)
+		defer func() { getg().racectx = 0 }()
+	}
+
+	// This function may need to lock trace.lock recursively
+	// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
+	// To allow this we use trace.lockOwner.
+	// Also this function must not allocate while holding trace.lock:
+	// allocation can call heap allocate, which will try to emit a trace
+	// event while holding heap lock.
+	lock(&trace.lock)
+	trace.lockOwner = getg().m.curg
+
+	if trace.reader.Load() != nil {
+		// More than one goroutine reads trace. This is bad.
+		// But we rather do not crash the program because of tracing,
+		// because tracing can be enabled at runtime on prod servers.
+		trace.lockOwner = nil
+		unlock(&trace.lock)
+		println("runtime: ReadTrace called from multiple goroutines simultaneously")
+		return nil, false
+	}
+	// Recycle the old buffer.
+	if buf := trace.reading; buf != 0 {
+		buf.ptr().link = trace.empty
+		trace.empty = buf
+		trace.reading = 0
+	}
+	// Write trace header.
+	if !trace.headerWritten {
+		trace.headerWritten = true
+		trace.lockOwner = nil
+		unlock(&trace.lock)
+		return []byte("go 1.19 trace\x00\x00\x00"), false
+	}
+	// Optimistically look for CPU profile samples. This may write new stack
+	// records, and may write new tracing buffers.
+	if !trace.footerWritten && !trace.shutdown {
+		traceReadCPU()
+	}
+	// Wait for new data.
+	if trace.fullHead == 0 && !trace.shutdown {
+		// We don't simply use a note because the scheduler
+		// executes this goroutine directly when it wakes up
+		// (also a note would consume an M).
+		trace.lockOwner = nil
+		unlock(&trace.lock)
+		return nil, true
+	}
 newFull:
 	assertLockHeld(&trace.lock)
 	// Write a buffer.
@@ -492,7 +524,7 @@ newFull:
 		trace.reading = buf
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return buf.ptr().arr[:buf.ptr().pos]
+		return buf.ptr().arr[:buf.ptr().pos], false
 	}
 
 	// Write footer with timer frequency.
@@ -534,17 +566,21 @@ newFull:
 		}
 		// trace.enabled is already reset, so can call traceable functions.
 		semrelease(&trace.shutdownSema)
-		return nil
+		return nil, false
 	}
 	// Also bad, but see the comment above.
 	trace.lockOwner = nil
 	unlock(&trace.lock)
 	println("runtime: spurious wakeup of trace reader")
-	return nil
+	return nil, false
 }
 
 // traceReader returns the trace reader that should be woken up, if any.
 // Callers should first check that trace.enabled or trace.shutdown is set.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceReader() *g {
 	// Optimistic check first
 	if traceReaderAvailable() == nil {
@@ -571,6 +607,10 @@ func traceReaderAvailable() *g {
 }
 
 // traceProcFree frees trace buffer associated with pp.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceProcFree(pp *p) {
 	buf := pp.tracebuf
 	pp.tracebuf = 0
@@ -661,7 +701,9 @@ func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev by
 	// TODO: test on non-zero extraBytes param.
 	maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params
 	if buf == nil || len(buf.arr)-buf.pos < maxSize {
-		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+		systemstack(func() {
+			buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+		})
 		bufp.set(buf)
 	}
 
@@ -802,7 +844,9 @@ func traceReadCPU() {
 
 			buf := bufp.ptr()
 			if buf == nil {
-				*bufp = traceFlush(*bufp, 0)
+				systemstack(func() {
+					*bufp = traceFlush(*bufp, 0)
+				})
 				buf = bufp.ptr()
 			}
 			for i := range stk {
@@ -856,6 +900,10 @@ func traceReleaseBuffer(pid int32) {
 }
 
 // traceFlush puts buf onto stack of full buffers and returns an empty buffer.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
 	owner := trace.lockOwner
 	dolock := owner == nil || owner != getg().m.curg
@@ -933,8 +981,10 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr)
 	buf := bufp.ptr()
 	size := 1 + 2*traceBytesPerNumber + len(s)
 	if buf == nil || len(buf.arr)-buf.pos < size {
-		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
-		bufp.set(buf)
+		systemstack(func() {
+			buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+			bufp.set(buf)
+		})
 	}
 	buf.byte(traceEvString)
 	buf.varint(id)
@@ -1088,6 +1138,10 @@ func traceFrames(bufp traceBufPtr, pcs []uintptr) ([]traceFrame, traceBufPtr) {
 
 // dump writes all previously cached stacks to trace buffers,
 // releases all memory and resets state.
+//
+// This must run on the system stack because it calls traceFlush.
+//
+//go:systemstack
 func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
 	for i, _ := range tab.tab {
 		stk := tab.tab[i].ptr()