diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 6c51517522..b6c241f141 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -378,7 +378,6 @@ type workType struct {
 	// markDoneSema protects transitions from mark to mark termination.
 	markDoneSema uint32
 
-	bgMarkReady note   // signal background mark worker has started
 	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
 	// Background mark completion signaling
 
@@ -1230,11 +1229,34 @@ func gcBgMarkStartWorkers() {
 	//
 	// Worker Gs don't exit if gomaxprocs is reduced. If it is raised
 	// again, we can reuse the old workers; no need to create new workers.
-	for gcBgMarkWorkerCount < gomaxprocs {
-		go gcBgMarkWorker()
+	if gcBgMarkWorkerCount >= gomaxprocs {
+		return
+	}
 
-		notetsleepg(&work.bgMarkReady, -1)
-		noteclear(&work.bgMarkReady)
+	// Increment mp.locks when allocating. We are called within gcStart,
+	// and thus must not trigger another gcStart via an allocation. gcStart
+	// bails when allocating with locks held, so simulate that for these
+	// allocations.
+	//
+	// TODO(prattmic): clean up gcStart to use a more explicit "in gcStart"
+	// check for bailing.
+	mp := acquirem()
+	ready := make(chan struct{}, 1)
+	releasem(mp)
+
+	for gcBgMarkWorkerCount < gomaxprocs {
+		mp := acquirem() // See above, we allocate a closure here.
+		go gcBgMarkWorker(ready)
+		releasem(mp)
+
+		// N.B. we intentionally wait on each goroutine individually
+		// rather than starting all in a batch and then waiting once
+		// afterwards. By running one goroutine at a time, we can take
+		// advantage of runnext to bounce back and forth between
+		// workers and this goroutine. In an overloaded application,
+		// this can reduce GC start latency by prioritizing these
+		// goroutines rather than waiting at the end of the run queue.
+		<-ready
 		// The worker is now guaranteed to be added to the pool before
 		// its P's next findRunnableGCWorker.
 
@@ -1273,7 +1295,7 @@ type gcBgMarkWorkerNode struct {
 	m muintptr
 }
 
-func gcBgMarkWorker() {
+func gcBgMarkWorker(ready chan struct{}) {
 	gp := getg()
 
 	// We pass node to a gopark unlock function, so it can't be on
@@ -1286,7 +1308,8 @@ func gcBgMarkWorker() {
 	node.gp.set(gp)
 
 	node.m.set(acquirem())
-	notewakeup(&work.bgMarkReady)
+
+	ready <- struct{}{}
 	// After this point, the background mark worker is generally scheduled
 	// cooperatively by gcController.findRunnableGCWorker. While performing
 	// work on the P, preemption is disabled because we are working on
@@ -1299,10 +1322,10 @@ func gcBgMarkWorker() {
 	// fine; it will eventually gopark again for further scheduling via
 	// findRunnableGCWorker.
 	//
-	// Since we disable preemption before notifying bgMarkReady, we
-	// guarantee that this G will be in the worker pool for the next
-	// findRunnableGCWorker. This isn't strictly necessary, but it reduces
-	// latency between _GCmark starting and the workers starting.
+	// Since we disable preemption before notifying ready, we guarantee that
+	// this G will be in the worker pool for the next findRunnableGCWorker.
+	// This isn't strictly necessary, but it reduces latency between
+	// _GCmark starting and the workers starting.
 
 	for {
 		// Go to sleep until woken by