cmd/link: faster algorithm for nosplit stack checking, better errors

The linker performs a global analysis of all nosplit call chains to check they fit in the stack space ensured by splittable functions. That analysis has two problems right now: 1. It's inefficient. It performs a top-down analysis, starting with every nosplit function and the nosplit stack limit and walking *down* the call graph to compute how much stack remains at every call. As a result, it visits the same functions over and over, often with different remaining stack depths. This approach is historical: this check was originally written in C and this approach avoided the need for any interesting data structures. 2. If some call chain is over the limit, it only reports a single call chain. As a result, if the check does fail, you often wind up playing whack-a-mole by guessing where the problem is in the one chain, trying to reduce the stack size, and then seeing if the link works or reports a different path. This CL completely rewrites the nosplit stack check. It now uses a bottom-up analysis, computing the maximum stack height required by every function's call tree. This visits every function exactly once, making it much more efficient. It uses slightly more heap space for intermediate storage, but still very little in the scheme of the overall link. For example, when linking cmd/go, the new algorithm virtually eliminates the time spent in this pass, and reduces overall link time: │ before │ after │ │ sec/op │ sec/op vs base │ Dostkcheck 7.926m ± 4% 1.831m ± 6% -76.90% (p=0.000 n=20) TotalTime 301.3m ± 1% 296.4m ± 3% -1.62% (p=0.040 n=20) │ before │ after │ │ B/op │ B/op vs base │ Dostkcheck 40.00Ki ± 0% 212.15Ki ± 0% +430.37% (p=0.000 n=20) Most of this time is spent analyzing the runtime, so for larger binaries, the total time saved is roughly the same, and proportionally less of the overall link. If the new implementation finds an error, it redoes the analysis, switching to preferring quality of error reporting over performance. For error reporting, it computes stack depths top-down (like the old algorithm), and reports *all* paths that are over the stack limit, presented as a tree for compactness. For example, this is the output from a simple test case from test/nosplit with two over-limit paths from f1: main.f1: nosplit stack overflow main.f1 grows 768 bytes, calls main.f2 grows 56 bytes, calls main.f4 grows 48 bytes 80 bytes over limit grows 768 bytes, calls main.f3 grows 104 bytes 80 bytes over limit While we're here, we do a few nice cleanups: - We add a debug output flag, which will be useful for understanding what our nosplit chains look like and which ones are close to running over. - We move the implementation out of the fog of lib.go to its own file. - The implementation is generally more Go-like and less C-like. Change-Id: If1ab31197f5215475559b93695c44a01bd16e276 Reviewed-on: https://go-review.googlesource.com/c/go/+/398176 Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Than McIntosh <thanm@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org>
2024-11-23 01:50:04 -07:00 · 2022-04-01 15:51:12 -04:00 · 2022-04-01 15:51:12 -04:00 · e25f46e596
commit e25f46e596
parent 7a06243205
7 changed files with 575 additions and 221 deletions
--- a/src/cmd/link/internal/ld/lib.go
+++ b/src/cmd/link/internal/ld/lib.go
@ -34,7 +34,6 @@ import (
 	"bytes"
 	"cmd/internal/bio"
 	"cmd/internal/goobj"
-	"cmd/internal/obj"
 	"cmd/internal/objabi"
 	"cmd/internal/sys"
 	"cmd/link/internal/loadelf"
@ -2343,223 +2342,6 @@ func addsection(ldr *loader.Loader, arch *sys.Arch, seg *sym.Segment, name strin
 	return sect
 }

-type chain struct {
-	sym   loader.Sym
-	up    *chain
-	limit int // limit on entry to sym
-}
-
-func callsize(ctxt *Link) int {
-	if ctxt.Arch.HasLR {
-		return 0
-	}
-	return ctxt.Arch.RegSize
-}
-
-type stkChk struct {
-	ldr       *loader.Loader
-	ctxt      *Link
-	morestack loader.Sym
-	done      loader.Bitmap
-}
-
-// Walk the call tree and check that there is always enough stack space
-// for the call frames, especially for a chain of nosplit functions.
-func (ctxt *Link) dostkcheck() {
-	ldr := ctxt.loader
-	sc := stkChk{
-		ldr:       ldr,
-		ctxt:      ctxt,
-		morestack: ldr.Lookup("runtime.morestack", 0),
-		done:      loader.MakeBitmap(ldr.NSym()),
-	}
-
-	// Every splitting function ensures that there are at least StackLimit
-	// bytes available below SP when the splitting prologue finishes.
-	// If the splitting function calls F, then F begins execution with
-	// at least StackLimit - callsize() bytes available.
-	// Check that every function behaves correctly with this amount
-	// of stack, following direct calls in order to piece together chains
-	// of non-splitting functions.
-	var ch chain
-	ch.limit = objabi.StackLimit - callsize(ctxt)
-	if buildcfg.GOARCH == "arm64" {
-		// need extra 8 bytes below SP to save FP
-		ch.limit -= 8
-	}
-
-	// Check every function, but do the nosplit functions in a first pass,
-	// to make the printed failure chains as short as possible.
-	for _, s := range ctxt.Textp {
-		if ldr.IsNoSplit(s) {
-			ch.sym = s
-			sc.check(&ch, 0)
-		}
-	}
-
-	for _, s := range ctxt.Textp {
-		if !ldr.IsNoSplit(s) {
-			ch.sym = s
-			sc.check(&ch, 0)
-		}
-	}
-}
-
-func (sc *stkChk) check(up *chain, depth int) int {
-	limit := up.limit
-	s := up.sym
-	ldr := sc.ldr
-	ctxt := sc.ctxt
-
-	// Don't duplicate work: only need to consider each
-	// function at top of safe zone once.
-	top := limit == objabi.StackLimit-callsize(ctxt)
-	if top {
-		if sc.done.Has(s) {
-			return 0
-		}
-		sc.done.Set(s)
-	}
-
-	if depth > 500 {
-		sc.ctxt.Errorf(s, "nosplit stack check too deep")
-		sc.broke(up, 0)
-		return -1
-	}
-
-	if ldr.AttrExternal(s) {
-		// external function.
-		// should never be called directly.
-		// onlyctxt.Diagnose the direct caller.
-		// TODO(mwhudson): actually think about this.
-		// TODO(khr): disabled for now. Calls to external functions can only happen on the g0 stack.
-		// See the trampolines in src/runtime/sys_darwin_$ARCH.go.
-		//if depth == 1 && ldr.SymType(s) != sym.SXREF && !ctxt.DynlinkingGo() &&
-		//	ctxt.BuildMode != BuildModeCArchive && ctxt.BuildMode != BuildModePIE && ctxt.BuildMode != BuildModeCShared && ctxt.BuildMode != BuildModePlugin {
-		//	Errorf(s, "call to external function")
-		//}
-		return -1
-	}
-	info := ldr.FuncInfo(s)
-	if !info.Valid() { // external function. see above.
-		return -1
-	}
-
-	if limit < 0 {
-		sc.broke(up, limit)
-		return -1
-	}
-
-	// morestack looks like it calls functions,
-	// but it switches the stack pointer first.
-	if s == sc.morestack {
-		return 0
-	}
-
-	var ch chain
-	ch.up = up
-
-	if !ldr.IsNoSplit(s) {
-		// Ensure we have enough stack to call morestack.
-		ch.limit = limit - callsize(ctxt)
-		ch.sym = sc.morestack
-		if sc.check(&ch, depth+1) < 0 {
-			return -1
-		}
-		if !top {
-			return 0
-		}
-		// Raise limit to allow frame.
-		locals := info.Locals()
-		limit = objabi.StackLimit + int(locals) + int(ctxt.Arch.FixedFrameSize)
-	}
-
-	// Walk through sp adjustments in function, consuming relocs.
-	relocs := ldr.Relocs(s)
-	var ch1 chain
-	pcsp := obj.NewPCIter(uint32(ctxt.Arch.MinLC))
-	ri := 0
-	for pcsp.Init(ldr.Data(ldr.Pcsp(s))); !pcsp.Done; pcsp.Next() {
-		// pcsp.value is in effect for [pcsp.pc, pcsp.nextpc).
-
-		// Check stack size in effect for this span.
-		if int32(limit)-pcsp.Value < 0 {
-			sc.broke(up, int(int32(limit)-pcsp.Value))
-			return -1
-		}
-
-		// Process calls in this span.
-		for ; ri < relocs.Count(); ri++ {
-			r := relocs.At(ri)
-			if uint32(r.Off()) >= pcsp.NextPC {
-				break
-			}
-			t := r.Type()
-			switch {
-			case t.IsDirectCall():
-				ch.limit = int(int32(limit) - pcsp.Value - int32(callsize(ctxt)))
-				ch.sym = r.Sym()
-				if sc.check(&ch, depth+1) < 0 {
-					return -1
-				}
-
-			// Indirect call. Assume it is a call to a splitting function,
-			// so we have to make sure it can call morestack.
-			// Arrange the data structures to report both calls, so that
-			// if there is an error, stkprint shows all the steps involved.
-			case t == objabi.R_CALLIND:
-				ch.limit = int(int32(limit) - pcsp.Value - int32(callsize(ctxt)))
-				ch.sym = 0
-				ch1.limit = ch.limit - callsize(ctxt) // for morestack in called prologue
-				ch1.up = &ch
-				ch1.sym = sc.morestack
-				if sc.check(&ch1, depth+2) < 0 {
-					return -1
-				}
-			}
-		}
-	}
-
-	return 0
-}
-
-func (sc *stkChk) broke(ch *chain, limit int) {
-	sc.ctxt.Errorf(ch.sym, "nosplit stack overflow")
-	sc.print(ch, limit)
-}
-
-func (sc *stkChk) print(ch *chain, limit int) {
-	ldr := sc.ldr
-	ctxt := sc.ctxt
-	var name string
-	if ch.sym != 0 {
-		name = fmt.Sprintf("%s<%d>", ldr.SymName(ch.sym), ldr.SymVersion(ch.sym))
-		if ldr.IsNoSplit(ch.sym) {
-			name += " (nosplit)"
-		}
-	} else {
-		name = "function pointer"
-	}
-
-	if ch.up == nil {
-		// top of chain. ch.sym != 0.
-		if ldr.IsNoSplit(ch.sym) {
-			fmt.Printf("\t%d\tassumed on entry to %s\n", ch.limit, name)
-		} else {
-			fmt.Printf("\t%d\tguaranteed after split check in %s\n", ch.limit, name)
-		}
-	} else {
-		sc.print(ch.up, ch.limit+callsize(ctxt))
-		if !ctxt.Arch.HasLR {
-			fmt.Printf("\t%d\ton entry to %s\n", ch.limit, name)
-		}
-	}
-
-	if ch.limit != limit {
-		fmt.Printf("\t%d\tafter %s uses %d\n", limit, name, ch.limit-limit)
-	}
-}
-
 func usage() {
 	fmt.Fprintf(os.Stderr, "usage: link [options] main.o\n")
 	objabi.Flagprint(os.Stderr)
--- a/src/cmd/link/internal/ld/main.go
+++ b/src/cmd/link/internal/ld/main.go
@ -93,6 +93,7 @@ var (
 	flagInterpreter   = flag.String("I", "", "use `linker` as ELF dynamic linker")
 	FlagDebugTramp    = flag.Int("debugtramp", 0, "debug trampolines")
 	FlagDebugTextSize = flag.Int("debugtextsize", 0, "debug text section max size")
+	flagDebugNosplit  = flag.Bool("debugnosplit", false, "dump nosplit call graph")
 	FlagStrictDups    = flag.Int("strictdups", 0, "sanity check duplicate symbol contents during object file reading (1=warn 2=err).")
 	FlagRound         = flag.Int("R", -1, "set address rounding `quantum`")
 	FlagTextAddr      = flag.Int64("T", -1, "set text segment `address`")
@ -283,8 +284,8 @@ func Main(arch *sys.Arch, theArch Arch) {
 	bench.Start("callgraph")
 	ctxt.callgraph()

-	bench.Start("dostkcheck")
-	ctxt.dostkcheck()
+	bench.Start("doStackCheck")
+	ctxt.doStackCheck()

 	bench.Start("mangleTypeSym")
 	ctxt.mangleTypeSym()
--- a/src/cmd/link/internal/ld/stackcheck.go
+++ b/src/cmd/link/internal/ld/stackcheck.go
@ -0,0 +1,421 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ld
+
+import (
+	"cmd/internal/obj"
+	"cmd/internal/objabi"
+	"cmd/link/internal/loader"
+	"fmt"
+	"internal/buildcfg"
+	"sort"
+	"strings"
+)
+
+type stackCheck struct {
+	ctxt      *Link
+	ldr       *loader.Loader
+	morestack loader.Sym
+	callSize  int // The number of bytes added by a CALL
+
+	// height records the maximum number of bytes a function and
+	// its callees can add to the stack without a split check.
+	height map[loader.Sym]int16
+
+	// graph records the out-edges from each symbol. This is only
+	// populated on a second pass if the first pass reveals an
+	// over-limit function.
+	graph map[loader.Sym][]stackCheckEdge
+}
+
+type stackCheckEdge struct {
+	growth int        // Stack growth in bytes at call to target
+	target loader.Sym // 0 for stack growth without a call
+}
+
+// stackCheckCycle is a sentinel stored in the height map to detect if
+// we've found a cycle. This is effectively an "infinite" stack
+// height, so we use the closest value to infinity that we can.
+const stackCheckCycle int16 = 1<<15 - 1
+
+// stackCheckIndirect is a sentinel Sym value used to represent the
+// target of an indirect/closure call.
+const stackCheckIndirect loader.Sym = -1
+
+// doStackCheck walks the call tree to check that there is always
+// enough stack space for call frames, especially for a chain of
+// nosplit functions.
+//
+// It walks all functions to accumulate the number of bytes they can
+// grow the stack by without a split check and checks this against the
+// limit.
+func (ctxt *Link) doStackCheck() {
+	sc := newStackCheck(ctxt, false)
+
+	// limit is number of bytes a splittable function ensures are
+	// available on the stack. If any call chain exceeds this
+	// depth, the stack check test fails.
+	//
+	// The call to morestack in every splittable function ensures
+	// that there are at least StackLimit bytes available below SP
+	// when morestack returns.
+	limit := objabi.StackLimit - sc.callSize
+	if buildcfg.GOARCH == "arm64" {
+		// Need an extra 8 bytes below SP to save FP.
+		limit -= 8
+	}
+
+	// Compute stack heights without any back-tracking information.
+	// This will almost certainly succeed and we can simply
+	// return. If it fails, we do a second pass with back-tracking
+	// to produce a good error message.
+	//
+	// This accumulates stack heights bottom-up so it only has to
+	// visit every function once.
+	var failed []loader.Sym
+	for _, s := range ctxt.Textp {
+		if sc.check(s) > limit {
+			failed = append(failed, s)
+		}
+	}
+
+	if len(failed) > 0 {
+		// Something was over-limit, so now we do the more
+		// expensive work to report a good error. First, for
+		// the over-limit functions, redo the stack check but
+		// record the graph this time.
+		sc = newStackCheck(ctxt, true)
+		for _, s := range failed {
+			sc.check(s)
+		}
+
+		// Find the roots of the graph (functions that are not
+		// called by any other function).
+		roots := sc.findRoots()
+
+		// Find and report all paths that go over the limit.
+		// This accumulates stack depths top-down. This is
+		// much less efficient because we may have to visit
+		// the same function multiple times at different
+		// depths, but lets us find all paths.
+		for _, root := range roots {
+			ctxt.Errorf(root, "nosplit stack overflow")
+			chain := []stackCheckChain{{stackCheckEdge{0, root}, false}}
+			sc.report(root, limit, &chain)
+		}
+	}
+}
+
+func newStackCheck(ctxt *Link, graph bool) *stackCheck {
+	sc := &stackCheck{
+		ctxt:      ctxt,
+		ldr:       ctxt.loader,
+		morestack: ctxt.loader.Lookup("runtime.morestack", 0),
+		height:    make(map[loader.Sym]int16, len(ctxt.Textp)),
+	}
+	// Compute stack effect of a CALL operation. 0 on LR machines.
+	// 1 register pushed on non-LR machines.
+	if !ctxt.Arch.HasLR {
+		sc.callSize = ctxt.Arch.RegSize
+	}
+
+	if graph {
+		// We're going to record the call graph.
+		sc.graph = make(map[loader.Sym][]stackCheckEdge)
+	}
+
+	return sc
+}
+
+func (sc *stackCheck) symName(sym loader.Sym) string {
+	switch sym {
+	case stackCheckIndirect:
+		return "indirect"
+	case 0:
+		return "leaf"
+	}
+	return fmt.Sprintf("%s<%d>", sc.ldr.SymName(sym), sc.ldr.SymVersion(sym))
+}
+
+// check returns the stack height of sym. It populates sc.height and
+// sc.graph for sym and every function in its call tree.
+func (sc *stackCheck) check(sym loader.Sym) int {
+	if h, ok := sc.height[sym]; ok {
+		// We've already visited this symbol or we're in a cycle.
+		return int(h)
+	}
+	// Store the sentinel so we can detect cycles.
+	sc.height[sym] = stackCheckCycle
+	// Compute and record the height and optionally edges.
+	h, edges := sc.computeHeight(sym, *flagDebugNosplit || sc.graph != nil)
+	if h > int(stackCheckCycle) { // Prevent integer overflow
+		h = int(stackCheckCycle)
+	}
+	sc.height[sym] = int16(h)
+	if sc.graph != nil {
+		sc.graph[sym] = edges
+	}
+
+	if *flagDebugNosplit {
+		for _, edge := range edges {
+			fmt.Printf("nosplit: %s +%d", sc.symName(sym), edge.growth)
+			if edge.target == 0 {
+				// Local stack growth or leaf function.
+				fmt.Printf("\n")
+			} else {
+				fmt.Printf(" -> %s\n", sc.symName(edge.target))
+			}
+		}
+	}
+
+	return h
+}
+
+// computeHeight returns the stack height of sym. If graph is true, it
+// also returns the out-edges of sym.
+//
+// Caching is applied to this in check. Call check instead of calling
+// this directly.
+func (sc *stackCheck) computeHeight(sym loader.Sym, graph bool) (int, []stackCheckEdge) {
+	ldr := sc.ldr
+
+	// Check special cases.
+	if sym == sc.morestack {
+		// morestack looks like it calls functions, but they
+		// either happen only when already on the system stack
+		// (where there is ~infinite space), or after
+		// switching to the system stack. Hence, its stack
+		// height on the user stack is 0.
+		return 0, nil
+	}
+	if sym == stackCheckIndirect {
+		// Assume that indirect/closure calls are always to
+		// splittable functions, so they just need enough room
+		// to call morestack.
+		return sc.callSize, []stackCheckEdge{{sc.callSize, sc.morestack}}
+	}
+
+	// Ignore calls to external functions. Assume that these calls
+	// are only ever happening on the system stack, where there's
+	// plenty of room.
+	if ldr.AttrExternal(sym) {
+		return 0, nil
+	}
+	if info := ldr.FuncInfo(sym); !info.Valid() { // also external
+		return 0, nil
+	}
+
+	// Track the maximum height of this function and, if we're
+	// recording the graph, its out-edges.
+	var edges []stackCheckEdge
+	maxHeight := 0
+	ctxt := sc.ctxt
+	// addEdge adds a stack growth out of this function to
+	// function "target" or, if target == 0, a local stack growth
+	// within the function.
+	addEdge := func(growth int, target loader.Sym) {
+		if graph {
+			edges = append(edges, stackCheckEdge{growth, target})
+		}
+		height := growth
+		if target != 0 { // Don't walk into the leaf "edge"
+			height += sc.check(target)
+		}
+		if height > maxHeight {
+			maxHeight = height
+		}
+	}
+
+	if !ldr.IsNoSplit(sym) {
+		// Splittable functions start with a call to
+		// morestack, after which their height is 0. Account
+		// for the height of the call to morestack.
+		addEdge(sc.callSize, sc.morestack)
+		return maxHeight, edges
+	}
+
+	// This function is nosplit, so it adjusts SP without a split
+	// check.
+	//
+	// Walk through SP adjustments in function, consuming relocs
+	// and following calls.
+	maxLocalHeight := 0
+	relocs, ri := ldr.Relocs(sym), 0
+	pcsp := obj.NewPCIter(uint32(ctxt.Arch.MinLC))
+	for pcsp.Init(ldr.Data(ldr.Pcsp(sym))); !pcsp.Done; pcsp.Next() {
+		// pcsp.value is in effect for [pcsp.pc, pcsp.nextpc).
+		height := int(pcsp.Value)
+		if height > maxLocalHeight {
+			maxLocalHeight = height
+		}
+
+		// Process calls in this span.
+		for ; ri < relocs.Count(); ri++ {
+			r := relocs.At(ri)
+			if uint32(r.Off()) >= pcsp.NextPC {
+				break
+			}
+			t := r.Type()
+			if t.IsDirectCall() || t == objabi.R_CALLIND {
+				growth := height + sc.callSize
+				var target loader.Sym
+				if t == objabi.R_CALLIND {
+					target = stackCheckIndirect
+				} else {
+					target = r.Sym()
+				}
+				addEdge(growth, target)
+			}
+		}
+	}
+	if maxLocalHeight > maxHeight {
+		// This is either a leaf function, or the function
+		// grew its stack to larger than the maximum call
+		// height between calls. Either way, record that local
+		// stack growth.
+		addEdge(maxLocalHeight, 0)
+	}
+
+	return maxHeight, edges
+}
+
+func (sc *stackCheck) findRoots() []loader.Sym {
+	// Collect all nodes.
+	nodes := make(map[loader.Sym]struct{})
+	for k := range sc.graph {
+		nodes[k] = struct{}{}
+	}
+
+	// Start a DFS from each node and delete all reachable
+	// children. If we encounter an unrooted cycle, this will
+	// delete everything in that cycle, so we detect this case and
+	// track the lowest-numbered node encountered in the cycle and
+	// put that node back as a root.
+	var walk func(origin, sym loader.Sym) (cycle bool, lowest loader.Sym)
+	walk = func(origin, sym loader.Sym) (cycle bool, lowest loader.Sym) {
+		if _, ok := nodes[sym]; !ok {
+			// We already deleted this node.
+			return false, 0
+		}
+		delete(nodes, sym)
+
+		if origin == sym {
+			// We found an unrooted cycle. We already
+			// deleted all children of this node. Walk
+			// back up, tracking the lowest numbered
+			// symbol in this cycle.
+			return true, sym
+		}
+
+		// Delete children of this node.
+		for _, out := range sc.graph[sym] {
+			if c, l := walk(origin, out.target); c {
+				cycle = true
+				if lowest == 0 {
+					// On first cycle detection,
+					// add sym to the set of
+					// lowest-numbered candidates.
+					lowest = sym
+				}
+				if l < lowest {
+					lowest = l
+				}
+			}
+		}
+		return
+	}
+	for k := range nodes {
+		// Delete all children of k.
+		for _, out := range sc.graph[k] {
+			if cycle, lowest := walk(k, out.target); cycle {
+				// This is an unrooted cycle so we
+				// just deleted everything. Put back
+				// the lowest-numbered symbol.
+				nodes[lowest] = struct{}{}
+			}
+		}
+	}
+
+	// Sort roots by height. This makes the result deterministic
+	// and also improves the error reporting.
+	var roots []loader.Sym
+	for k := range nodes {
+		roots = append(roots, k)
+	}
+	sort.Slice(roots, func(i, j int) bool {
+		h1, h2 := sc.height[roots[i]], sc.height[roots[j]]
+		if h1 != h2 {
+			return h1 > h2
+		}
+		// Secondary sort by Sym.
+		return roots[i] < roots[j]
+	})
+	return roots
+}
+
+type stackCheckChain struct {
+	stackCheckEdge
+	printed bool
+}
+
+func (sc *stackCheck) report(sym loader.Sym, depth int, chain *[]stackCheckChain) {
+	// Walk the out-edges of sym. We temporarily pull the edges
+	// out of the graph to detect cycles and prevent infinite
+	// recursion.
+	edges, ok := sc.graph[sym]
+	isCycle := !(ok || sym == 0)
+	delete(sc.graph, sym)
+	for _, out := range edges {
+		*chain = append(*chain, stackCheckChain{out, false})
+		sc.report(out.target, depth-out.growth, chain)
+		*chain = (*chain)[:len(*chain)-1]
+	}
+	sc.graph[sym] = edges
+
+	// If we've reached the end of a chain and it went over the
+	// stack limit or was a cycle that would eventually go over,
+	// print the whole chain.
+	//
+	// We should either be in morestack (which has no out-edges)
+	// or the sentinel 0 Sym "called" from a leaf function (which
+	// has no out-edges), or we came back around a cycle (possibly
+	// to ourselves) and edges was temporarily nil'd.
+	if len(edges) == 0 && (depth < 0 || isCycle) {
+		var indent string
+		for i := range *chain {
+			ent := &(*chain)[i]
+			if ent.printed {
+				// Already printed on an earlier part
+				// of this call tree.
+				continue
+			}
+			ent.printed = true
+
+			if i == 0 {
+				// chain[0] is just the root function,
+				// not a stack growth.
+				fmt.Printf("%s\n", sc.symName(ent.target))
+				continue
+			}
+
+			indent = strings.Repeat("    ", i)
+			fmt.Print(indent)
+			// Grows the stack X bytes and (maybe) calls Y.
+			fmt.Printf("grows %d bytes", ent.growth)
+			if ent.target == 0 {
+				// Not a call, just a leaf. Print nothing.
+			} else {
+				fmt.Printf(", calls %s", sc.symName(ent.target))
+			}
+			fmt.Printf("\n")
+		}
+		// Print how far over this chain went.
+		if isCycle {
+			fmt.Printf("%sinfinite cycle\n", indent)
+		} else {
+			fmt.Printf("%s%d bytes over limit\n", indent, -depth)
+		}
+	}
+}
--- a/src/cmd/link/internal/ld/stackcheck_test.go
+++ b/src/cmd/link/internal/ld/stackcheck_test.go
@ -0,0 +1,89 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ld
+
+import (
+	"cmd/internal/objabi"
+	"cmd/internal/sys"
+	"fmt"
+	"internal/testenv"
+	"os"
+	"os/exec"
+	"regexp"
+	"testing"
+)
+
+// See also $GOROOT/test/nosplit.go for multi-platform edge case tests.
+
+func TestStackCheckOutput(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	t.Parallel()
+
+	cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", os.DevNull, "./testdata/stackcheck")
+	// The rules for computing frame sizes on all of the
+	// architectures are complicated, so just do this on amd64.
+	cmd.Env = append(os.Environ(), "GOARCH=amd64")
+	outB, err := cmd.CombinedOutput()
+
+	if err == nil {
+		t.Fatalf("expected link to fail")
+	}
+	out := string(outB)
+
+	t.Logf("linker output:\n%s", out)
+
+	// Construct expected stanzas
+	arch := sys.ArchAMD64
+	call := 0
+	if !arch.HasLR {
+		call = arch.RegSize
+	}
+	limit := objabi.StackLimit - call
+
+	wantMap := map[string]string{
+		"main.startSelf": fmt.Sprintf(
+			`main.startSelf<0>
+    grows 1008 bytes
+    %d bytes over limit
+`, 1008-limit),
+		"main.startChain": fmt.Sprintf(
+			`main.startChain<0>
+    grows 32 bytes, calls main.chain0<0>
+        grows 48 bytes, calls main.chainEnd<0>
+            grows 1008 bytes
+            %d bytes over limit
+    grows 32 bytes, calls main.chain2<0>
+        grows 80 bytes, calls main.chainEnd<0>
+            grows 1008 bytes
+            %d bytes over limit
+`, 32+48+1008-limit, 32+80+1008-limit),
+		"main.startRec": `main.startRec<0>
+    grows 8 bytes, calls main.startRec0<0>
+        grows 8 bytes, calls main.startRec<0>
+        infinite cycle
+`,
+	}
+
+	// Parse stanzas
+	stanza := regexp.MustCompile(`^(.*): nosplit stack overflow\n(.*\n(?: .*\n)*)`)
+	// Strip comments from cmd/go
+	out = regexp.MustCompile(`(?m)^#.*\n`).ReplaceAllString(out, "")
+	for len(out) > 0 {
+		m := stanza.FindStringSubmatch(out)
+		if m == nil {
+			t.Fatalf("unexpected output:\n%s", out)
+		}
+		out = out[len(m[0]):]
+		fn := m[1]
+		got := m[2]
+
+		want, ok := wantMap[fn]
+		if !ok {
+			t.Errorf("unexpected function: %s", fn)
+		} else if want != got {
+			t.Errorf("want:\n%sgot:\n%s", want, got)
+		}
+	}
+}
--- a/src/cmd/link/internal/ld/testdata/stackcheck/main.go
+++ b/src/cmd/link/internal/ld/testdata/stackcheck/main.go
@ -0,0 +1,20 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+func main() { asmMain() }
+
+func asmMain()
+
+func startSelf()
+
+func startChain()
+func chain0()
+func chain1()
+func chain2()
+func chainEnd()
+
+func startRec()
+func startRec0()
--- a/src/cmd/link/internal/ld/testdata/stackcheck/main.s
+++ b/src/cmd/link/internal/ld/testdata/stackcheck/main.s
@ -0,0 +1,40 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#define NOSPLIT 7
+
+TEXT ·asmMain(SB),0,$0-0
+	CALL ·startSelf(SB)
+	CALL ·startChain(SB)
+	CALL ·startRec(SB)
+	RET
+
+// Test reporting of basic over-the-limit
+TEXT ·startSelf(SB),NOSPLIT,$1000-0
+	RET
+
+// Test reporting of multiple over-the-limit chains
+TEXT ·startChain(SB),NOSPLIT,$16-0
+	CALL ·chain0(SB)
+	CALL ·chain1(SB)
+	CALL ·chain2(SB)
+	RET
+TEXT ·chain0(SB),NOSPLIT,$32-0
+	CALL ·chainEnd(SB)
+	RET
+TEXT ·chain1(SB),NOSPLIT,$48-0 // Doesn't go over
+	RET
+TEXT ·chain2(SB),NOSPLIT,$64-0
+	CALL ·chainEnd(SB)
+	RET
+TEXT ·chainEnd(SB),NOSPLIT,$1000-0 // Should be reported twice
+	RET
+
+// Test reporting of rootless recursion
+TEXT ·startRec(SB),NOSPLIT,$0-0
+	CALL ·startRec0(SB)
+	RET
+TEXT ·startRec0(SB),NOSPLIT,$0-0
+	CALL ·startRec(SB)
+	RET
--- a/test/nosplit.go
+++ b/test/nosplit.go
@ -51,7 +51,8 @@ var tests = `
 start 0

 # Large frame marked nosplit is always wrong.
-start 10000 nosplit
+# Frame is so large it overflows cmd/link's int16.
+start 100000 nosplit
 REJECT

 # Calling a large frame is okay.