runtime: allocate goroutine ids in batches

Helps reduce contention on sched.goidgen.

benchmark                               old ns/op    new ns/op    delta
BenchmarkCreateGoroutines-16            259          237          -8.49%
BenchmarkCreateGoroutinesParallel-16    127          43           -66.06%

R=golang-codereviews, dave, bradfitz, khr
CC=golang-codereviews, rsc
https://golang.org/cl/46970043
2024-11-12 10:20:27 -07:00 · 2014-01-22 10:34:36 +04:00 · 2014-01-22 10:34:36 +04:00 · 98b50b89a8
commit 98b50b89a8
parent 8a3c587dc1
2 changed files with 23 additions and 6 deletions
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -58,9 +58,16 @@ struct Sched {
 	int32	profilehz;	// cpu profiling rate
 };

+enum
+{
 	// The max value of GOMAXPROCS.
 	// There are no fundamental restrictions on the value.
-enum { MaxGomaxprocs = 1<<8 };
+	MaxGomaxprocs = 1<<8,
+
+	// Number of goroutine ids to grab from runtime·sched.goidgen to local per-P cache at once.
+	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
+	GoidCacheBatch = 16,
+};

 Sched	runtime·sched;
 int32	runtime·gomaxprocs;
@@ -1752,6 +1759,7 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp
 {
 	byte *sp;
 	G *newg;
+	P *p;
 	int32 siz;

 //runtime·printf("newproc1 %p %p narg=%d nret=%d\n", fn->fn, argp, narg, nret);
@@ -1766,7 +1774,8 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp
 	if(siz > StackMin - 1024)
 		runtime·throw("runtime.newproc: function arguments too large for new goroutine");

-	if((newg = gfget(m->p)) != nil) {
+	p = m->p;
+	if((newg = gfget(p)) != nil) {
 		if(newg->stackguard - StackGuard != newg->stack0)
 			runtime·throw("invalid stack in newg");
 	} else {
@@ -1790,11 +1799,15 @@ runtime·newproc1(FuncVal *fn, byte *argp, int32 narg, int32 nret, void *callerp
 	runtime·gostartcallfn(&newg->sched, fn);
 	newg->gopc = (uintptr)callerpc;
 	newg->status = Grunnable;
-	newg->goid = runtime·xadd64(&runtime·sched.goidgen, 1);
+	if(p->goidcache == p->goidcacheend) {
+		p->goidcache = runtime·xadd64(&runtime·sched.goidgen, GoidCacheBatch);
+		p->goidcacheend = p->goidcache + GoidCacheBatch;
+	}
+	newg->goid = p->goidcache++;
 	newg->panicwrap = 0;
 	if(raceenabled)
 		newg->racectx = runtime·racegostart((void*)callerpc);
-	runqput(m->p, newg);
+	runqput(p, newg);

 	if(runtime·atomicload(&runtime·sched.npidle) != 0 && runtime·atomicload(&runtime·sched.nmspinning) == 0 && fn->fn != runtime·main)  // TODO: fast atomic
 		wakep();
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@@ -385,6 +385,10 @@ struct P
 	MCache*	mcache;
 	Defer*	deferpool[5];	// pool of available Defer structs of different sizes (see panic.c)

+	// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
+	uint64	goidcache;
+	uint64	goidcacheend;
+
 	// Queue of runnable goroutines.
 	uint32	runqhead;
 	uint32	runqtail;