mirror of
https://github.com/golang/go
synced 2024-11-19 16:24:45 -07:00
runtime: replace Semacquire/Semrelease implementation
1. The implementation uses distributed hash table of waitlists instead of a centralized one. It significantly improves scalability for uncontended semaphores. 2. The implementation provides wait-free fast-path for signalers. 3. The implementation uses less locks (1 lock/unlock instead of 5 for Semacquire). 4. runtime·ready() call is moved out of critical section. 5. Semacquire() does not call semwake(). Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz) are as follows: benchmark old ns/op new ns/op delta runtime_test.BenchmarkSemaUncontended 58.20 36.30 -37.63% runtime_test.BenchmarkSemaUncontended-2 199.00 18.30 -90.80% runtime_test.BenchmarkSemaUncontended-4 327.00 9.20 -97.19% runtime_test.BenchmarkSemaUncontended-8 491.00 5.32 -98.92% runtime_test.BenchmarkSemaUncontended-16 946.00 4.18 -99.56% runtime_test.BenchmarkSemaSyntNonblock 59.00 36.80 -37.63% runtime_test.BenchmarkSemaSyntNonblock-2 167.00 138.00 -17.37% runtime_test.BenchmarkSemaSyntNonblock-4 333.00 129.00 -61.26% runtime_test.BenchmarkSemaSyntNonblock-8 464.00 130.00 -71.98% runtime_test.BenchmarkSemaSyntNonblock-16 1015.00 136.00 -86.60% runtime_test.BenchmarkSemaSyntBlock 58.80 36.70 -37.59% runtime_test.BenchmarkSemaSyntBlock-2 294.00 149.00 -49.32% runtime_test.BenchmarkSemaSyntBlock-4 333.00 177.00 -46.85% runtime_test.BenchmarkSemaSyntBlock-8 471.00 221.00 -53.08% runtime_test.BenchmarkSemaSyntBlock-16 990.00 227.00 -77.07% runtime_test.BenchmarkSemaWorkNonblock 829.00 832.00 +0.36% runtime_test.BenchmarkSemaWorkNonblock-2 425.00 419.00 -1.41% runtime_test.BenchmarkSemaWorkNonblock-4 308.00 220.00 -28.57% runtime_test.BenchmarkSemaWorkNonblock-8 394.00 147.00 -62.69% runtime_test.BenchmarkSemaWorkNonblock-16 1510.00 149.00 -90.13% runtime_test.BenchmarkSemaWorkBlock 828.00 813.00 -1.81% runtime_test.BenchmarkSemaWorkBlock-2 428.00 436.00 +1.87% runtime_test.BenchmarkSemaWorkBlock-4 232.00 219.00 -5.60% runtime_test.BenchmarkSemaWorkBlock-8 392.00 251.00 -35.97% runtime_test.BenchmarkSemaWorkBlock-16 1524.00 298.00 -80.45% sync_test.BenchmarkMutexUncontended 24.10 24.00 -0.41% sync_test.BenchmarkMutexUncontended-2 12.00 12.00 +0.00% sync_test.BenchmarkMutexUncontended-4 6.25 6.17 -1.28% sync_test.BenchmarkMutexUncontended-8 3.43 3.34 -2.62% sync_test.BenchmarkMutexUncontended-16 2.34 2.32 -0.85% sync_test.BenchmarkMutex 24.70 24.70 +0.00% sync_test.BenchmarkMutex-2 208.00 99.50 -52.16% sync_test.BenchmarkMutex-4 2744.00 256.00 -90.67% sync_test.BenchmarkMutex-8 5137.00 556.00 -89.18% sync_test.BenchmarkMutex-16 5368.00 1284.00 -76.08% sync_test.BenchmarkMutexSlack 24.70 25.00 +1.21% sync_test.BenchmarkMutexSlack-2 1094.00 186.00 -83.00% sync_test.BenchmarkMutexSlack-4 3430.00 402.00 -88.28% sync_test.BenchmarkMutexSlack-8 5051.00 1066.00 -78.90% sync_test.BenchmarkMutexSlack-16 6806.00 1363.00 -79.97% sync_test.BenchmarkMutexWork 793.00 792.00 -0.13% sync_test.BenchmarkMutexWork-2 398.00 398.00 +0.00% sync_test.BenchmarkMutexWork-4 1441.00 308.00 -78.63% sync_test.BenchmarkMutexWork-8 8532.00 847.00 -90.07% sync_test.BenchmarkMutexWork-16 8225.00 2760.00 -66.44% sync_test.BenchmarkMutexWorkSlack 793.00 793.00 +0.00% sync_test.BenchmarkMutexWorkSlack-2 418.00 414.00 -0.96% sync_test.BenchmarkMutexWorkSlack-4 4481.00 480.00 -89.29% sync_test.BenchmarkMutexWorkSlack-8 6317.00 1598.00 -74.70% sync_test.BenchmarkMutexWorkSlack-16 9111.00 3038.00 -66.66% R=rsc CC=golang-dev https://golang.org/cl/4631059
This commit is contained in:
parent
39acba55ee
commit
997c00f991
12
src/pkg/runtime/386/atomic.c
Normal file
12
src/pkg/runtime/386/atomic.c
Normal file
@ -0,0 +1,12 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "runtime.h"
|
||||
|
||||
#pragma textflag 7
|
||||
uint32
|
||||
runtime·atomicload(uint32 volatile* addr)
|
||||
{
|
||||
return *addr;
|
||||
}
|
@ -47,6 +47,7 @@ OFILES_arm=\
|
||||
|
||||
OFILES=\
|
||||
asm.$O\
|
||||
atomic.$O\
|
||||
cgocall.$O\
|
||||
chan.$O\
|
||||
closure.$O\
|
||||
|
12
src/pkg/runtime/amd64/atomic.c
Normal file
12
src/pkg/runtime/amd64/atomic.c
Normal file
@ -0,0 +1,12 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "runtime.h"
|
||||
|
||||
#pragma textflag 7
|
||||
uint32
|
||||
runtime·atomicload(uint32 volatile* addr)
|
||||
{
|
||||
return *addr;
|
||||
}
|
12
src/pkg/runtime/arm/atomic.c
Normal file
12
src/pkg/runtime/arm/atomic.c
Normal file
@ -0,0 +1,12 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "runtime.h"
|
||||
|
||||
#pragma textflag 7
|
||||
uint32
|
||||
runtime·atomicload(uint32 volatile* addr)
|
||||
{
|
||||
return runtime·xadd(addr, 0);
|
||||
}
|
@ -416,7 +416,10 @@ int32 runtime·write(int32, void*, int32);
|
||||
int32 runtime·mincore(void*, uintptr, byte*);
|
||||
bool runtime·cas(uint32*, uint32, uint32);
|
||||
bool runtime·casp(void**, void*, void*);
|
||||
// Don't confuse with XADD x86 instruction,
|
||||
// this one is actually 'addx', that is, add-and-fetch.
|
||||
uint32 runtime·xadd(uint32 volatile*, int32);
|
||||
uint32 runtime·atomicload(uint32 volatile*);
|
||||
void runtime·jmpdefer(byte*, void*);
|
||||
void runtime·exit1(int32);
|
||||
void runtime·ready(G*);
|
||||
|
@ -23,104 +23,68 @@ package runtime
|
||||
typedef struct Sema Sema;
|
||||
struct Sema
|
||||
{
|
||||
uint32 *addr;
|
||||
uint32 volatile *addr;
|
||||
G *g;
|
||||
Sema *prev;
|
||||
Sema *next;
|
||||
};
|
||||
|
||||
// TODO: For now, a linked list; maybe a hash table of linked lists later.
|
||||
static Sema *semfirst, *semlast;
|
||||
static Lock semlock;
|
||||
|
||||
static void
|
||||
semqueue(uint32 *addr, Sema *s)
|
||||
typedef struct SemaRoot SemaRoot;
|
||||
struct SemaRoot
|
||||
{
|
||||
s->addr = addr;
|
||||
s->g = nil;
|
||||
Lock;
|
||||
Sema *head;
|
||||
Sema *tail;
|
||||
// Number of waiters. Read w/o the lock.
|
||||
uint32 volatile nwait;
|
||||
};
|
||||
|
||||
runtime·lock(&semlock);
|
||||
s->prev = semlast;
|
||||
s->next = nil;
|
||||
if(semlast)
|
||||
semlast->next = s;
|
||||
else
|
||||
semfirst = s;
|
||||
semlast = s;
|
||||
runtime·unlock(&semlock);
|
||||
// Prime to not correlate with any user patterns.
|
||||
#define SEMTABLESZ 251
|
||||
|
||||
static union
|
||||
{
|
||||
SemaRoot;
|
||||
// Modern processors tend to have 64-byte cache lines,
|
||||
// potentially with 128-byte effective cache line size for reading.
|
||||
// While there are hypothetical architectures
|
||||
// with 16-4096 byte cache lines, 128 looks like a good compromise.
|
||||
uint8 pad[128];
|
||||
} semtable[SEMTABLESZ];
|
||||
|
||||
static SemaRoot*
|
||||
semroot(uint32 *addr)
|
||||
{
|
||||
return &semtable[((uintptr)addr >> 3) % SEMTABLESZ];
|
||||
}
|
||||
|
||||
static void
|
||||
semdequeue(Sema *s)
|
||||
semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s)
|
||||
{
|
||||
s->g = g;
|
||||
s->addr = addr;
|
||||
s->next = nil;
|
||||
s->prev = root->tail;
|
||||
if(root->tail)
|
||||
root->tail->next = s;
|
||||
else
|
||||
root->head = s;
|
||||
root->tail = s;
|
||||
}
|
||||
|
||||
static void
|
||||
semdequeue(SemaRoot *root, Sema *s)
|
||||
{
|
||||
runtime·lock(&semlock);
|
||||
if(s->next)
|
||||
s->next->prev = s->prev;
|
||||
else
|
||||
semlast = s->prev;
|
||||
root->tail = s->prev;
|
||||
if(s->prev)
|
||||
s->prev->next = s->next;
|
||||
else
|
||||
semfirst = s->next;
|
||||
root->head = s->next;
|
||||
s->prev = nil;
|
||||
s->next = nil;
|
||||
runtime·unlock(&semlock);
|
||||
}
|
||||
|
||||
static void
|
||||
semwakeup(uint32 *addr)
|
||||
{
|
||||
Sema *s;
|
||||
|
||||
runtime·lock(&semlock);
|
||||
for(s=semfirst; s; s=s->next) {
|
||||
if(s->addr == addr && s->g) {
|
||||
runtime·ready(s->g);
|
||||
s->g = nil;
|
||||
break;
|
||||
}
|
||||
}
|
||||
runtime·unlock(&semlock);
|
||||
}
|
||||
|
||||
// Step 1 of sleep: make ourselves available for wakeup.
|
||||
// TODO(rsc): Maybe we can write a version without
|
||||
// locks by using cas on s->g. Maybe not: I need to
|
||||
// think more about whether it would be correct.
|
||||
static void
|
||||
semsleep1(Sema *s)
|
||||
{
|
||||
runtime·lock(&semlock);
|
||||
s->g = g;
|
||||
runtime·unlock(&semlock);
|
||||
}
|
||||
|
||||
// Decided not to go through with it: undo step 1.
|
||||
static void
|
||||
semsleepundo1(Sema *s)
|
||||
{
|
||||
runtime·lock(&semlock);
|
||||
if(s->g != nil) {
|
||||
s->g = nil; // back ourselves out
|
||||
} else {
|
||||
// If s->g == nil already, semwakeup
|
||||
// already readied us. Since we never stopped
|
||||
// running, readying us just set g->readyonstop.
|
||||
// Clear it.
|
||||
if(g->readyonstop == 0)
|
||||
*(int32*)0x555 = 555;
|
||||
g->readyonstop = 0;
|
||||
}
|
||||
runtime·unlock(&semlock);
|
||||
}
|
||||
|
||||
// Step 2: wait for the wakeup.
|
||||
static void
|
||||
semsleep2(Sema *s)
|
||||
{
|
||||
USED(s);
|
||||
g->status = Gwaiting;
|
||||
runtime·gosched();
|
||||
}
|
||||
|
||||
static int32
|
||||
@ -128,52 +92,83 @@ cansemacquire(uint32 *addr)
|
||||
{
|
||||
uint32 v;
|
||||
|
||||
while((v = *addr) > 0)
|
||||
while((v = runtime·atomicload(addr)) > 0)
|
||||
if(runtime·cas(addr, v, v-1))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// For now has no return value.
|
||||
// Might return an ok (not interrupted) bool in the future?
|
||||
void
|
||||
runtime·semacquire(uint32 *addr)
|
||||
runtime·semacquire(uint32 volatile *addr)
|
||||
{
|
||||
Sema s;
|
||||
SemaRoot *root;
|
||||
|
||||
// Easy case.
|
||||
if(cansemacquire(addr))
|
||||
return;
|
||||
|
||||
// Harder case:
|
||||
// queue
|
||||
// try semacquire one more time, sleep if failed
|
||||
// dequeue
|
||||
// wake up one more guy to avoid races (TODO(rsc): maybe unnecessary?)
|
||||
semqueue(addr, &s);
|
||||
// increment waiter count
|
||||
// try cansemacquire one more time, return if succeeded
|
||||
// enqueue itself as a waiter
|
||||
// sleep
|
||||
// (waiter descriptor is dequeued by signaler)
|
||||
root = semroot(addr);
|
||||
for(;;) {
|
||||
semsleep1(&s);
|
||||
runtime·lock(root);
|
||||
// Add ourselves to nwait to disable "easy case" in semrelease.
|
||||
runtime·xadd(&root->nwait, 1);
|
||||
// Check cansemacquire to avoid missed wakeup.
|
||||
if(cansemacquire(addr)) {
|
||||
semsleepundo1(&s);
|
||||
break;
|
||||
runtime·xadd(&root->nwait, -1);
|
||||
runtime·unlock(root);
|
||||
return;
|
||||
}
|
||||
semsleep2(&s);
|
||||
// Any semrelease after the cansemacquire knows we're waiting
|
||||
// (we set nwait above), so go to sleep.
|
||||
semqueue(root, addr, &s);
|
||||
g->status = Gwaiting;
|
||||
runtime·unlock(root);
|
||||
runtime·gosched();
|
||||
if(cansemacquire(addr))
|
||||
return;
|
||||
}
|
||||
semdequeue(&s);
|
||||
semwakeup(addr);
|
||||
}
|
||||
|
||||
void
|
||||
runtime·semrelease(uint32 *addr)
|
||||
runtime·semrelease(uint32 volatile *addr)
|
||||
{
|
||||
uint32 v;
|
||||
Sema *s;
|
||||
SemaRoot *root;
|
||||
|
||||
for(;;) {
|
||||
v = *addr;
|
||||
if(runtime·cas(addr, v, v+1))
|
||||
break;
|
||||
root = semroot(addr);
|
||||
runtime·xadd(addr, 1);
|
||||
|
||||
// Easy case: no waiters?
|
||||
// This check must happen after the xadd, to avoid a missed wakeup
|
||||
// (see loop in semacquire).
|
||||
if(runtime·atomicload(&root->nwait) == 0)
|
||||
return;
|
||||
|
||||
// Harder case: search for a waiter and wake it.
|
||||
runtime·lock(root);
|
||||
if(runtime·atomicload(&root->nwait) == 0) {
|
||||
// The count is already consumed by another goroutine,
|
||||
// so no need to wake up another goroutine.
|
||||
runtime·unlock(root);
|
||||
return;
|
||||
}
|
||||
semwakeup(addr);
|
||||
for(s = root->head; s; s = s->next) {
|
||||
if(s->addr == addr) {
|
||||
runtime·xadd(&root->nwait, -1);
|
||||
semdequeue(root, s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
runtime·unlock(root);
|
||||
if(s)
|
||||
runtime·ready(s->g);
|
||||
}
|
||||
|
||||
func Semacquire(addr *uint32) {
|
||||
|
Loading…
Reference in New Issue
Block a user