From ba134539c58d8b157a9d5646de91dbd0616b33c4 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Sat, 23 Jul 2011 12:22:55 -0400
Subject: [PATCH] runtime: faster entersyscall/exitsyscall

Replace cas with xadd in scheduler.
Suggested by Dmitriy in last code review.
Verified with Promela model.

When there's actual contention for the atomic word,
this avoids the looping that compare-and-swap requires.

benchmark                              old ns/op    new ns/op    delta
runtime_test.BenchmarkSyscall                 32           26  -17.08%
runtime_test.BenchmarkSyscall-2              155           59  -61.81%
runtime_test.BenchmarkSyscall-3              112           52  -52.95%
runtime_test.BenchmarkSyscall-4               94           48  -48.57%
runtime_test.BenchmarkSyscallWork            871          872   +0.11%
runtime_test.BenchmarkSyscallWork-2          481          477   -0.83%
runtime_test.BenchmarkSyscallWork-3          338          335   -0.89%
runtime_test.BenchmarkSyscallWork-4          263          256   -2.66%

R=golang-dev, iant
CC=golang-dev
https://golang.org/cl/4800047
---
 src/pkg/runtime/proc.c | 57 ++++++++++++------------------------
 src/pkg/runtime/proc.p | 42 +++++++++++++++++++++++--------
 2 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/src/pkg/runtime/proc.c b/src/pkg/runtime/proc.c
index 6416651ce56..13bc52bb682 100644
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@@ -773,7 +773,7 @@ runtime·gosched(void)
 void
 runtime·entersyscall(void)
 {
-	uint32 v, w;
+	uint32 v;
 
 	if(runtime·sched.predawn)
 		return;
@@ -796,24 +796,14 @@ runtime·entersyscall(void)
 	//	mcpu--
 	//	gwait not true
 	//	waitstop && mcpu <= mcpumax not true
-	// If we can do the same with a single atomic read/write,
+	// If we can do the same with a single atomic add,
 	// then we can skip the locks.
-	for(;;) {
-		v = runtime·sched.atomic;
-		if(atomic_gwaiting(v))
-			break;
-		if(atomic_waitstop(v) && atomic_mcpu(v)-1 <= atomic_mcpumax(v))
-			break;
-		w = v;
-		w += (-1<<mcpuShift);
-		if(runtime·cas(&runtime·sched.atomic, v, w))
-			return;
-	}
+	v = runtime·xadd(&runtime·sched.atomic, -1<<mcpuShift);
+	if(!atomic_gwaiting(v) &&
+	   (!atomic_waitstop(v) || atomic_mcpu(v) > atomic_mcpumax(v)))
+		return;
 
 	schedlock();
-
-	// atomic { mcpu--; }
-	v = runtime·xadd(&runtime·sched.atomic, (-1<<mcpuShift));
-
+	v = runtime·sched.atomic;
 	if(atomic_gwaiting(v)) {
 		matchmg();
@@ -836,7 +826,7 @@ runtime·entersyscall(void)
 void
 runtime·exitsyscall(void)
 {
-	uint32 v, w;
+	uint32 v;
 
 	if(runtime·sched.predawn)
 		return;
@@ -846,33 +836,18 @@ runtime·exitsyscall(void)
 	// find that we still have mcpu <= mcpumax, then we can
 	// start executing Go code immediately, without having to
 	// schedlock/schedunlock.
-	for(;;) {
-		// If the profiler frequency needs updating,
-		// take the slow path.
-		if(m->profilehz != runtime·sched.profilehz)
-			break;
-
-		v = runtime·sched.atomic;
-		if(atomic_mcpu(v) >= atomic_mcpumax(v))
-			break;
-
-		w = v;
-		w += (1<<mcpuShift);
-		if(runtime·cas(&runtime·sched.atomic, v, w)) {
-			// There's a cpu for us, so we can run.
-			g->status = Grunning;
-			// Garbage collector isn't running (since we are),
-			// so okay to clear gcstack.
-			g->gcstack = nil;
-			return;
-		}
+	v = runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
+	if(m->profilehz == runtime·sched.profilehz && atomic_mcpu(v) <= atomic_mcpumax(v)) {
+		// There's a cpu for us, so we can run.
+		g->status = Grunning;
+		// Garbage collector isn't running (since we are),
+		// so okay to clear gcstack.
+		g->gcstack = nil;
+		return;
 	}
 
 	schedlock();
 
-	// atomic { mcpu++; }
-	runtime·xadd(&runtime·sched.atomic, (1<<mcpuShift));
-
 	// Tell scheduler to put g back on the run queue:
 	// mostly equivalent to g->status = Grunning,
 	// but keeps the garbage collector from thinking
diff --git a/src/pkg/runtime/proc.p b/src/pkg/runtime/proc.p
index 337b0787730..f0b46de6114 100644
--- a/src/pkg/runtime/proc.p
+++ b/src/pkg/runtime/proc.p
@@ -3,8 +3,9 @@
 // license that can be found in the LICENSE file.
 
 /*
-model for proc.c as of 2011/07/15.
-takes 4300 seconds to explore 1128130 states
+model for proc.c as of 2011/07/22.
+
+takes 4900 seconds to explore 1189070 states
 with G=3, var_gomaxprocs=1
 on a Core i7 L640 2.13 GHz Lenovo X201s.
 */
@@ -329,33 +330,53 @@ inline schedule() {
 	nextgandunlock()
 }
 
+/*
+ * schedpend is > 0 if a goroutine is about to commit to
+ * entering the scheduler but has not yet done so.
+ * Just as we don't test for the undesirable conditions when a
+ * goroutine is in the scheduler, we don't test for them when
+ * a goroutine will be in the scheduler shortly.
+ * Modeling this state lets us replace mcpu cas loops with
+ * simpler mcpu atomic adds.
+ */
+byte schedpend;
+
 /*
  * entersyscall is like the C function.
  */
 inline entersyscall() {
+	bit willsched;
+
 	/*
 	 * Fast path. Check all the conditions tested during schedlock/schedunlock
 	 * below, and if we can get through the whole thing without stopping, run it
 	 * in one atomic cas-based step.
	 */
	atomic {
+		atomic_mcpu--;
 		if
 		:: atomic_gwaiting ->
 			skip
-		:: atomic_waitstop && atomic_mcpu-1 <= atomic_mcpumax ->
+		:: atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
 			skip
 		:: else ->
-			atomic_mcpu--;
 			goto Lreturn_entersyscall;
-		fi
+		fi;
+		willsched = 1;
+		schedpend++;
 	}
 
 	/*
 	 * Normal path.
 	 */
 	schedlock()
-	d_step {
-		atomic_mcpu--;
+	opt_dstep {
+		if
+		:: willsched ->
+			schedpend--;
+			willsched = 0
+		:: else
+		fi
 	}
 	if
 	:: atomic_gwaiting ->
@@ -382,11 +403,11 @@ inline exitsyscall() {
 	 */
 	atomic {
 		// omitted profilehz check
+		atomic_mcpu++;
 		if
 		:: atomic_mcpu >= atomic_mcpumax ->
 			skip
 		:: else ->
-			atomic_mcpu++;
 			goto Lreturn_exitsyscall
 		fi
 	}
@@ -396,7 +417,6 @@ inline exitsyscall() {
 	 */
 	schedlock();
 	d_step {
-		atomic_mcpu++;
 		if
 		:: atomic_mcpu <= atomic_mcpumax ->
 			skip
@@ -497,10 +517,10 @@ active proctype monitor()
 {
 	do
 	// Should never have goroutines waiting with procs available.
-	:: !sched_lock && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
+	:: !sched_lock && schedpend==0 && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
 		assert 0
 	// Should never have gc waiting for stop if things have already stopped.
-	:: !sched_lock && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
+	:: !sched_lock && schedpend==0 && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
		assert 0
	od
 }
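
For reference, here is the xadd-versus-cas pattern the change adopts, as a
standalone C11 sketch. It is illustrative only and not part of the patch:
sched_atomic, mcpuShift, and the two function names are invented stand-ins,
and the real packing of runtime·sched.atomic is defined in proc.c. The old
fast path updated the word with a compare-and-swap loop, which reloads and
retries whenever another thread wins the race; the new fast path issues one
atomic add and, like runtime·xadd, works with the updated value it gets back,
falling into the locked slow path only when the checks on that snapshot fail.

#include <stdatomic.h>

enum { mcpuShift = 0 };		/* hypothetical bit offset of the mcpu field */

atomic_uint sched_atomic;	/* stands in for runtime·sched.atomic */

/*
 * Old style: a cas loop.  Every failed compare-and-swap must
 * reload the word and retry, so contending threads can spin.
 */
unsigned
dec_mcpu_cas(void)
{
	unsigned v, w;

	for(;;) {
		v = atomic_load(&sched_atomic);
		w = v - (1u<<mcpuShift);	/* mcpu-- */
		if(atomic_compare_exchange_weak(&sched_atomic, &v, w))
			return w;
	}
}

/*
 * New style: a single atomic add, which always completes in one
 * step.  Like runtime·xadd it yields the updated value, so the
 * caller can test the slow-path conditions on that snapshot.
 */
unsigned
dec_mcpu_xadd(void)
{
	return atomic_fetch_add(&sched_atomic, -(1u<<mcpuShift)) - (1u<<mcpuShift);
}

Under contention the add still serializes on the cache line, but each thread
finishes its update in one step instead of reloading and retrying, which is
consistent with the improvement in the multi-threaded Syscall numbers above.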