1
0
mirror of https://github.com/golang/go synced 2024-10-04 14:21:21 -06:00
go/src/pkg/runtime/parfor.c
Russ Cox 67c83db60d runtime: use goc2c as much as possible
Package runtime's C functions written to be called from Go
started out written in C using carefully constructed argument
lists and the FLUSH macro to write a result back to memory.

For some functions, the appropriate parameter list ended up
being architecture-dependent due to differences in alignment,
so we added 'goc2c', which takes a .goc file containing Go func
declarations but C bodies, rewrites the Go func declaration to
equivalent C declarations for the target architecture, adds the
needed FLUSH statements, and writes out an equivalent C file.
That C file is compiled as part of package runtime.

Native Client's x86-64 support introduces the most complex
alignment rules yet, breaking many functions that could until
now be portably written in C. Using goc2c for those avoids the
breakage.

Separately, Keith's work on emitting stack information from
the C compiler would require the hand-written functions
to add #pragmas specifying how many arguments are result
parameters. Using goc2c for those avoids maintaining #pragmas.

For both reasons, use goc2c for as many Go-called C functions
as possible.

This CL is a replay of the bulk of CL 15400047 and CL 15790043,
both of which were reviewed as part of the NaCl port and are
checked in to the NaCl branch. This CL is part of bringing the
NaCl code into the main tree.

No new code here, just reformatting and occasional movement
into .h files.

LGTM=r
R=dave, alex.brainman, r
CC=golang-codereviews
https://golang.org/cl/65220044
2014-02-20 15:58:47 -05:00

200 lines
4.8 KiB
C

// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Parallel for algorithm.
#include "runtime.h"
#include "arch_GOARCH.h"
struct ParForThread
{
// the thread's iteration space [32lsb, 32msb)
uint64 pos;
// stats
uint64 nsteal;
uint64 nstealcnt;
uint64 nprocyield;
uint64 nosyield;
uint64 nsleep;
byte pad[CacheLineSize];
};
ParFor*
runtime·parforalloc(uint32 nthrmax)
{
ParFor *desc;
// The ParFor object is followed by CacheLineSize padding
// and then nthrmax ParForThread.
desc = (ParFor*)runtime·malloc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread));
desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize);
desc->nthrmax = nthrmax;
return desc;
}
void
runtime·parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32))
{
uint32 i, begin, end;
uint64 *pos;
if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) {
runtime·printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body);
runtime·throw("parfor: invalid args");
}
desc->body = body;
desc->done = 0;
desc->nthr = nthr;
desc->thrseq = 0;
desc->cnt = n;
desc->ctx = ctx;
desc->wait = wait;
desc->nsteal = 0;
desc->nstealcnt = 0;
desc->nprocyield = 0;
desc->nosyield = 0;
desc->nsleep = 0;
for(i=0; i<nthr; i++) {
begin = (uint64)n*i / nthr;
end = (uint64)n*(i+1) / nthr;
pos = &desc->thr[i].pos;
if(((uintptr)pos & 7) != 0)
runtime·throw("parforsetup: pos is not aligned");
*pos = (uint64)begin | (((uint64)end)<<32);
}
}
void
runtime·parfordo(ParFor *desc)
{
ParForThread *me;
uint32 tid, begin, end, begin2, try, victim, i;
uint64 *mypos, *victimpos, pos, newpos;
void (*body)(ParFor*, uint32);
bool idle;
// Obtain 0-based thread index.
tid = runtime·xadd(&desc->thrseq, 1) - 1;
if(tid >= desc->nthr) {
runtime·printf("tid=%d nthr=%d\n", tid, desc->nthr);
runtime·throw("parfor: invalid tid");
}
// If single-threaded, just execute the for serially.
if(desc->nthr==1) {
for(i=0; i<desc->cnt; i++)
desc->body(desc, i);
return;
}
body = desc->body;
me = &desc->thr[tid];
mypos = &me->pos;
for(;;) {
for(;;) {
// While there is local work,
// bump low index and execute the iteration.
pos = runtime·xadd64(mypos, 1);
begin = (uint32)pos-1;
end = (uint32)(pos>>32);
if(begin < end) {
body(desc, begin);
continue;
}
break;
}
// Out of work, need to steal something.
idle = false;
for(try=0;; try++) {
// If we don't see any work for long enough,
// increment the done counter...
if(try > desc->nthr*4 && !idle) {
idle = true;
runtime·xadd(&desc->done, 1);
}
// ...if all threads have incremented the counter,
// we are done.
if(desc->done + !idle == desc->nthr) {
if(!idle)
runtime·xadd(&desc->done, 1);
goto exit;
}
// Choose a random victim for stealing.
victim = runtime·fastrand1() % (desc->nthr-1);
if(victim >= tid)
victim++;
victimpos = &desc->thr[victim].pos;
for(;;) {
// See if it has any work.
pos = runtime·atomicload64(victimpos);
begin = (uint32)pos;
end = (uint32)(pos>>32);
if(begin+1 >= end) {
begin = end = 0;
break;
}
if(idle) {
runtime·xadd(&desc->done, -1);
idle = false;
}
begin2 = begin + (end-begin)/2;
newpos = (uint64)begin | (uint64)begin2<<32;
if(runtime·cas64(victimpos, pos, newpos)) {
begin = begin2;
break;
}
}
if(begin < end) {
// Has successfully stolen some work.
if(idle)
runtime·throw("parfor: should not be idle");
runtime·atomicstore64(mypos, (uint64)begin | (uint64)end<<32);
me->nsteal++;
me->nstealcnt += end-begin;
break;
}
// Backoff.
if(try < desc->nthr) {
// nothing
} else if (try < 4*desc->nthr) {
me->nprocyield++;
runtime·procyield(20);
// If a caller asked not to wait for the others, exit now
// (assume that most work is already done at this point).
} else if (!desc->wait) {
if(!idle)
runtime·xadd(&desc->done, 1);
goto exit;
} else if (try < 6*desc->nthr) {
me->nosyield++;
runtime·osyield();
} else {
me->nsleep++;
runtime·usleep(1);
}
}
}
exit:
runtime·xadd64(&desc->nsteal, me->nsteal);
runtime·xadd64(&desc->nstealcnt, me->nstealcnt);
runtime·xadd64(&desc->nprocyield, me->nprocyield);
runtime·xadd64(&desc->nosyield, me->nosyield);
runtime·xadd64(&desc->nsleep, me->nsleep);
me->nsteal = 0;
me->nstealcnt = 0;
me->nprocyield = 0;
me->nosyield = 0;
me->nsleep = 0;
}
// For testing from Go.
void
runtime·parforiters(ParFor *desc, uintptr tid, uintptr *start, uintptr *end)
{
*start = (uint32)desc->thr[tid].pos;
*end = (uint32)(desc->thr[tid].pos>>32);
}