
* comment, clean up scheduler

* rewrite lock implementation to be correct
  (tip: never assume that an algorithm you found
  in a linux man page is correct.)
* delete unneeded void* arg from clone fn
* replace Rendez with Note
* comment mal better
* use 6c -w, fix warnings
* mark all assembly functions 7

R=r
DELTA=828  (338 added, 221 deleted, 269 changed)
OCL=13884
CL=13886
Russ Cox 2008-08-05 14:18:47 -07:00
parent 5adbacb8e7
commit 96824000ed
14 changed files with 545 additions and 429 deletions

View File

@ -49,10 +49,10 @@ clean:
rm -f *.$(O) *.a runtime.acid
%.$O: %.c
$(CC) $<
$(CC) -w $<
sys_file.$O: sys_file.c sys_types.h $(OS_H)
$(CC) -D$(GOARCH)_$(GOOS) $<
$(CC) -w -D$(GOARCH)_$(GOOS) $<
%.$O: %.s
$(AS) $<

View File

@ -48,6 +48,6 @@ struct stat {
// Linux-specific system calls
int64 futex(uint32*, int32, uint32, struct timespec*, uint32*, uint32);
int64 clone(int32, void*, M*, G*, void(*)(void*), void*);
int64 clone(int32, void*, M*, G*, void(*)(void));
int64 select(int32, void*, void*, void*, void*);

View File

@ -487,7 +487,7 @@ sys·selectgo(Select *sel)
SudoG *sg;
G *gp;
byte *ae, *as;
byte *as;
if(xxx) {
prints("selectgo: sel=");
@ -630,6 +630,8 @@ sys·selectgo(Select *sel)
asynr:
asyns:
throw("asyn");
return; // compiler doesn't know throw doesn't return
gotr:
// recv path to wakeup the sender (sg)
if(xxx) {

View File

@ -199,7 +199,6 @@ out:
void
sys·mapassign1(Hmap *m, ...)
{
Link **ll;
byte *ak, *av;
ak = (byte*)&m + m->ko;

View File

@ -8,7 +8,6 @@
void
dump(byte *p, int32 n)
{
uint32 v;
int32 i;
for(i=0; i<n; i++) {

View File

@ -9,28 +9,101 @@ typedef struct Sched Sched;
M m0;
G g0; // idle goroutine for m0
// Maximum number of os procs (M's) to kick off.
// Can override with $gomaxprocs environment variable.
// For now set to 1 (single-threaded), because not
// everything is properly locked (e.g., chans) and because
// Darwin's multithreading code isn't implemented.
int32 gomaxprocs = 1;
static int32 debug = 0;
// Go scheduler
//
// The go scheduler's job is to match ready-to-run goroutines (`g's)
// with waiting-for-work schedulers (`m's). If there are ready gs
// and no waiting ms, ready() will start a new m running in a new
// OS thread, so that all ready gs can run simultaneously, up to a limit.
// For now, ms never go away.
//
// The default maximum number of ms is one: go runs single-threaded.
// This is because some locking details have to be worked out
// (select in particular is not locked properly) and because the low-level
// code hasn't been written yet for OS X. Setting the environment
// variable $gomaxprocs changes sched.mmax for now.
//
// Even a program that can run without deadlock in a single process
// might use more ms if given the chance. For example, the prime
// sieve will use as many ms as there are primes (up to sched.mmax),
// allowing different stages of the pipeline to execute in parallel.
// We could revisit this choice, only kicking off new ms for blocking
// system calls, but that would limit the amount of parallel computation
// that go would try to do.
//
// In general, one could imagine all sorts of refinements to the
// scheduler, but the goal now is just to get something working on
// Linux and OS X.
struct Sched {
G *runhead;
G *runtail;
int32 nwait;
int32 nready;
int32 ng;
int32 nm;
M *wait;
Lock;
G *gfree; // available gs (status == Gdead)
G *ghead; // gs waiting to run
G *gtail;
int32 gwait; // number of gs waiting to run
int32 gcount; // number of gs that are alive
M *mhead; // ms waiting for work
int32 mwait; // number of ms waiting for work
int32 mcount; // number of ms that are alive
int32 mmax; // max number of ms allowed
int32 predawn; // running initialization, don't run new gs.
};
Sched sched;
// Scheduling helpers. Sched must be locked.
static void gput(G*); // put/get on ghead/gtail
static G* gget(void);
static void mput(M*); // put/get on mhead
static M* mget(void);
static void gfput(G*); // put/get on gfree
static G* gfget(void);
static void mnew(void); // kick off new m
static void readylocked(G*); // ready, but sched is locked
// Scheduler loop.
static void scheduler(void);
// Called before main·init_function.
void
schedinit(void)
{
int32 n;
byte *p;
sched.mmax = 1;
p = getenv("gomaxprocs");
if(p != nil && (n = atoi(p)) != 0)
sched.mmax = n;
sched.mcount = 1;
sched.predawn = 1;
}
// Called after main·init_function; main·main is on ready queue.
void
m0init(void)
{
int32 i;
// Let's go.
sched.predawn = 0;
// There's already one m (us).
// If main·init_function started other goroutines,
// kick off new ms to handle them, like ready
// would have, had it not been pre-dawn.
for(i=1; i<sched.gcount && i<sched.mmax; i++)
mnew();
scheduler();
}
void
sys·goexit(void)
{
@ -39,23 +112,10 @@ sys·goexit(void)
sys·printint(g->goid);
prints("\n");
}
g->status = Gdead;
g->status = Gmoribund;
sys·gosched();
}
void
schedinit(void)
{
byte *p;
extern int32 getenvc(void);
p = getenv("gomaxprocs");
if(p && '0' <= *p && *p <= '9')
gomaxprocs = atoi(p);
sched.nm = 1;
sched.nwait = 1;
}
void
sys·newproc(int32 siz, byte* fn, byte* arg0)
{
@ -71,22 +131,18 @@ sys·newproc(int32 siz, byte* fn, byte* arg0)
if(siz > 1024)
throw("sys·newproc: too many args");
// try to rip off an old goroutine
for(newg=allg; newg!=nil; newg=newg->alllink)
if(newg->status == Gdead)
break;
lock(&sched);
if(newg == nil) {
if((newg = gfget()) != nil){
newg->status = Gwaiting;
stk = newg->stack0;
}else{
newg = mal(sizeof(G));
stk = mal(4096);
newg->stack0 = stk;
newg->status = Gwaiting;
newg->alllink = allg;
allg = newg;
} else {
stk = newg->stack0;
newg->status = Gwaiting;
}
newg->stackguard = stk+160;
@ -104,14 +160,13 @@ sys·newproc(int32 siz, byte* fn, byte* arg0)
newg->sched.SP = sp;
newg->sched.PC = fn;
lock(&sched);
sched.ng++;
sched.gcount++;
goidgen++;
newg->goid = goidgen;
readylocked(newg);
unlock(&sched);
ready(newg);
//prints(" goid=");
//sys·printint(newg->goid);
//prints("\n");
@ -132,193 +187,248 @@ tracebackothers(G *me)
}
}
void newmach(void);
// Put on `g' queue. Sched must be locked.
static void
readylocked(G *g)
gput(G *g)
{
g->status = Grunnable;
if(sched.runhead == nil)
sched.runhead = g;
g->schedlink = nil;
if(sched.ghead == nil)
sched.ghead = g;
else
sched.runtail->runlink = g;
sched.runtail = g;
g->runlink = nil;
sched.nready++;
// Don't wake up another scheduler.
// This only gets called when we're
// about to reschedule anyway.
sched.gtail->schedlink = g;
sched.gtail = g;
sched.gwait++;
}
static Lock print;
// Get from `g' queue. Sched must be locked.
static G*
gget(void)
{
G *g;
g = sched.ghead;
if(g){
sched.ghead = g->schedlink;
if(sched.ghead == nil)
sched.gtail = nil;
sched.gwait--;
}
return g;
}
// Put on `m' list. Sched must be locked.
static void
mput(M *m)
{
m->schedlink = sched.mhead;
sched.mhead = m;
sched.mwait++;
}
// Get from `m' list. Sched must be locked.
static M*
mget(void)
{
M *m;
m = sched.mhead;
if(m){
sched.mhead = m->schedlink;
sched.mwait--;
}
return m;
}
// Put on gfree list. Sched must be locked.
static void
gfput(G *g)
{
g->schedlink = sched.gfree;
sched.gfree = g;
}
// Get from gfree list. Sched must be locked.
static G*
gfget(void)
{
G *g;
g = sched.gfree;
if(g)
sched.gfree = g->schedlink;
return g;
}
// Mark g ready to run.
void
ready(G *g)
{
M *mm;
// gp might be running on another scheduler.
// (E.g., it queued and then we decided to wake it up
// before it had a chance to sys·gosched().)
// Grabbing the runlock ensures that it is not running elsewhere.
// You can delete the if check, but don't delete the
// lock/unlock sequence (being able to grab the lock
// means the proc has gone to sleep).
lock(&g->runlock);
if(g->status == Grunnable || g->status == Grunning)
*(int32*)0x1023 = 0x1023;
// Wait for g to stop running (for example, it might
// have queued itself on a channel but not yet gotten
// a chance to call sys·gosched and actually go to sleep).
notesleep(&g->stopped);
lock(&sched);
g->status = Grunnable;
if(sched.runhead == nil)
sched.runhead = g;
else
sched.runtail->runlink = g;
sched.runtail = g;
g->runlink = nil;
unlock(&g->runlock);
sched.nready++;
if(sched.nready > sched.nwait)
if(gomaxprocs == 0 || sched.nm < gomaxprocs){
if(debug){
prints("new scheduler: ");
sys·printint(sched.nready);
prints(" > ");
sys·printint(sched.nwait);
prints("\n");
}
sched.nwait++;
newmach();
}
if(sched.wait){
mm = sched.wait;
sched.wait = mm->waitlink;
rwakeupandunlock(&mm->waitr);
}else
unlock(&sched);
readylocked(g);
unlock(&sched);
}
extern void p0(void), p1(void);
// Mark g ready to run. Sched is already locked,
// and g is known not to be running right now
// (i.e., ready has slept on g->stopped or the g was
// just allocated in sys·newproc).
static void
readylocked(G *g)
{
M *m;
G*
nextgoroutine(void)
// Mark runnable.
if(g->status == Grunnable || g->status == Grunning)
throw("bad g->status in ready");
g->status = Grunnable;
// Before we've gotten to main·main,
// only queue new gs, don't run them
// or try to allocate new ms for them.
// That includes main·main itself.
if(sched.predawn){
gput(g);
}
// Else if there's an m waiting, give it g.
else if((m = mget()) != nil){
m->nextg = g;
notewakeup(&m->havenextg);
}
// Else put g on queue, kicking off new m if needed.
else{
gput(g);
if(sched.mcount < sched.mmax)
mnew();
}
}
// Get the next goroutine that m should run.
// Sched must be locked on entry, is unlocked on exit.
static G*
nextgandunlock(void)
{
G *gp;
while((gp = sched.runhead) == nil){
if(debug){
prints("nextgoroutine runhead=nil ng=");
sys·printint(sched.ng);
prints("\n");
}
if(sched.ng == 0)
return nil;
m->waitlink = sched.wait;
m->waitr.l = &sched.Lock;
sched.wait = m;
sched.nwait++;
if(sched.nm == sched.nwait)
prints("all goroutines are asleep - deadlock!\n");
rsleep(&m->waitr);
sched.nwait--;
if((gp = gget()) != nil){
unlock(&sched);
return gp;
}
sched.nready--;
sched.runhead = gp->runlink;
mput(m);
if(sched.mcount == sched.mwait)
prints("warning: all goroutines are asleep - deadlock!\n");
m->nextg = nil;
noteclear(&m->havenextg);
unlock(&sched);
notesleep(&m->havenextg);
if((gp = m->nextg) == nil)
throw("bad m->nextg in nextgoroutine");
m->nextg = nil;
return gp;
}
void
// Scheduler loop: find g to run, run it, repeat.
static void
scheduler(void)
{
G* gp;
m->pid = getprocid();
gosave(&m->sched);
// Initialization.
m->procid = getprocid();
lock(&sched);
if(m->curg == nil){
// Brand new scheduler; nwait counts us.
// Not anymore.
sched.nwait--;
}else{
if(gosave(&m->sched)){
// Jumped here via gosave/gogo, so didn't
// execute lock(&sched) above.
lock(&sched);
// Just finished running m->curg.
gp = m->curg;
gp->m = nil;
gp->m = nil; // for debugger
switch(gp->status){
case Gdead:
sched.ng--;
if(debug){
prints("sched: dead: ");
sys·printint(sched.ng);
prints("\n");
}
break;
case Grunning:
readylocked(gp);
break;
case Grunnable:
// don't want to see this
*(int32*)0x456 = 0x234;
case Gdead:
// Shouldn't have been running!
throw("bad gp->status in sched");
case Grunning:
gp->status = Grunnable;
gput(gp);
break;
case Gmoribund:
gp->status = Gdead;
if(--sched.gcount == 0)
sys·exit(0);
break;
}
unlock(&gp->runlock);
notewakeup(&gp->stopped);
}
gp = nextgoroutine();
if(gp == nil) {
// prints("sched: no more work\n");
sys·exit(0);
}
unlock(&sched);
// Find (or wait for) g to run. Unlocks sched.
gp = nextgandunlock();
lock(&gp->runlock);
noteclear(&gp->stopped);
gp->status = Grunning;
m->curg = gp;
gp->m = m;
gp->m = m; // for debugger
g = gp;
gogo(&gp->sched);
}
void
newmach(void)
{
M *mm;
byte *stk, *stktop;
int64 ret;
sched.nm++;
if(!(sched.nm&(sched.nm-1))){
sys·printint(sched.nm);
prints(" threads\n");
}
mm = mal(sizeof(M)+sizeof(G)+1024+104);
sys·memclr((byte*)mm, sizeof(M));
mm->g0 = (G*)(mm+1);
sys·memclr((byte*)mm->g0, sizeof(G));
stk = (byte*)mm->g0 + 104;
stktop = stk + 1024;
mm->g0->stackguard = stk;
mm->g0->stackbase = stktop;
newosproc(mm, mm->g0, stktop, (void(*)(void*))scheduler, nil);
}
void
gom0init(void)
{
scheduler();
}
// Enter scheduler. If g->status is Grunning,
// re-queues g and runs everyone else who is waiting
// before running g again. If g->status is Gmoribund,
// kills off g.
void
sys·gosched(void)
{
if(gosave(&g->sched) == 0){
// (rsc) signal race here?
// TODO(rsc) signal race here?
// If a signal comes in between
// changing g and changing SP,
// growing the stack will fail.
g = m->g0;
gogo(&m->sched);
}
}
// Fork off a new m. Sched must be locked.
static void
mnew(void)
{
M *m;
G *g;
byte *stk, *stktop;
sched.mcount++;
if(debug){
sys·printint(sched.mcount);
prints(" threads\n");
}
// Allocate m, g, stack in one chunk.
// 1024 and 104 are the magic constants
// used in rt0_amd64.s when setting up g0.
m = mal(sizeof(M)+sizeof(G)+104+1024);
g = (G*)(m+1);
stk = (byte*)g + 104;
stktop = stk + 1024;
m->g0 = g;
g->stackguard = stk;
g->stackbase = stktop;
newosproc(m, g, stktop, scheduler);
}
//
// the calling sequence for a routine that
// needs N bytes stack, A args.
//
// N1 = (N+160 > 4096)? N+160: 0

View File

@ -41,7 +41,7 @@ TEXT _rt0_amd64(SB),7,$-8
PUSHQ $main·main(SB) // entry
PUSHQ $16 // arg size
CALL sys·newproc(SB)
CALL gom0init(SB)
CALL m0init(SB)
POPQ AX
POPQ AX

View File

@ -191,7 +191,7 @@ sys·sleep(int64 ms)
void
lock(Lock *l)
{
if(xadd(&l->key, 1) == 1)
if(cas(&l->key, 0, 1))
return;
unimplemented("lock wait");
}
@ -199,43 +199,33 @@ lock(Lock *l)
void
unlock(Lock *l)
{
if(xadd(&l->key, -1) == 0)
if(cas(&l->key, 1, 0))
return;
unimplemented("unlock wakeup");
}
void
rsleep(Rendez *r)
noteclear(Note *n)
{
unimplemented("rsleep");
// dumb implementation:
r->sleeping = 1;
unlock(r->l);
while(r->sleeping)
;
lock(r->l);
n->lock.key = 0;
lock(&n->lock);
}
void
rwakeup(Rendez *r)
notesleep(Note *n)
{
unimplemented("rwakeup");
// dumb implementation:
r->sleeping = 0;
lock(&n->lock);
unlock(&n->lock);
}
void
rwakeupandunlock(Rendez *r)
notewakeup(Note *n)
{
// dumb implementation:
rwakeup(r);
unlock(r->l);
unlock(&n->lock);
}
void
newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
newosproc(M *mm, G *gg, void *stk, void (*fn)(void))
{
unimplemented("newosproc");
}

View File

@ -138,21 +138,19 @@ typedef struct sigaction {
void
sighandler(int32 sig, siginfo* info, void** context)
{
int32 i;
if(sig < 0 || sig >= NSIG){
prints("Signal ");
sys·printint(sig);
}else{
prints(sigtab[sig].name);
}
struct sigcontext *sc = &(((struct ucontext *)context)->uc_mcontext);
prints("\nFaulting address: 0x"); sys·printpointer(info->si_addr);
prints("\npc: 0x"); sys·printpointer((void *)sc->rip);
prints("\n\n");
traceback((void *)sc->rip, (void *)sc->rsp, (void *)sc->r15);
tracebackothers((void*)sc->r15);
print_sigcontext(sc);
@ -179,16 +177,14 @@ initsig(void)
}
}
// Linux futex. The simple cases really are simple:
// Linux futex.
//
// futex(addr, FUTEX_WAIT, val, duration, _, _)
// Inside the kernel, atomically check that *addr == val
// and go to sleep for at most duration.
// futexsleep(uint32 *addr, uint32 val)
// futexwakeup(uint32 *addr)
//
// futex(addr, FUTEX_WAKE, val, _, _, _)
// Wake up at least val procs sleeping on addr.
//
// (Of course, they have added more complicated things since then.)
// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
// Futexwakeup wakes up one thread sleeping on addr.
// Futexsleep is allowed to wake up spuriously.
enum
{
@ -199,10 +195,10 @@ enum
EAGAIN = 11,
};
// TODO(rsc) I tried using 1<<40 here but it woke up (-ETIMEDOUT).
// TODO(rsc) I tried using 1<<40 here but futex woke up (-ETIMEDOUT).
// I wonder if the timespec that gets to the kernel
// actually has two 32-bit numbers in it, so that
// a 64-bit 1<<40 ends up being 0 seconds,
// 1<<8 nanoseconds.
static struct timespec longtime =
{
@ -210,69 +206,106 @@ static struct timespec longtime =
0
};
// Atomically,
// if(*addr == val) sleep
// Might be woken up spuriously; that's allowed.
static void
efutex(uint32 *addr, int32 op, int32 val, struct timespec *ts)
futexsleep(uint32 *addr, uint32 val)
{
int64 ret;
ret = futex(addr, FUTEX_WAIT, val, &longtime, nil, 0);
if(ret >= 0 || ret == -EAGAIN || ret == -EINTR)
return;
again:
ret = futex(addr, op, val, ts, nil, 0);
// These happen when you use a debugger, among other times.
if(ret == -EAGAIN || ret == -EINTR){
// If we were sleeping, it's okay to wake up early.
if(op == FUTEX_WAIT)
return;
// If we were waking someone up, we don't know
// whether that succeeded, so wake someone else up too.
if(op == FUTEX_WAKE){
prints("futexwake ");
sys·printint(ret);
prints("\n");
goto again;
}
}
if(ret < 0){
prints("futex error addr=");
sys·printpointer(addr);
prints(" op=");
sys·printint(op);
prints(" val=");
sys·printint(val);
prints(" ts=");
sys·printpointer(ts);
prints(" returned ");
sys·printint(-ret);
prints("\n");
*(int32*)101 = 202;
}
prints("futexsleep addr=");
sys·printpointer(addr);
prints(" val=");
sys·printint(val);
prints(" returned ");
sys·printint(ret);
prints("\n");
*(int32*)0x1005 = 0x1005;
}
// Lock and unlock.
// A zeroed Lock is unlocked (no need to initialize each lock).
// The l->key is either 0 (unlocked), 1 (locked), or >=2 (contended).
// If any procs are sleeping on addr, wake up at least one.
static void
futexwakeup(uint32 *addr)
{
int64 ret;
ret = futex(addr, FUTEX_WAKE, 1, nil, nil, 0);
if(ret >= 0)
return;
// I don't know that futex wakeup can return
// EAGAIN or EINTR, but if it does, it would be
// safe to loop and call futex again.
prints("futexwakeup addr=");
sys·printpointer(addr);
prints(" returned ");
sys·printint(ret);
prints("\n");
*(int32*)0x1006 = 0x1006;
}
// Lock and unlock.
//
// The lock state is a single 32-bit word that holds
// a 31-bit count of threads waiting for the lock
// and a single bit (the low bit) saying whether the lock is held.
// The uncontended case runs entirely in user space.
// When contention is detected, we defer to the kernel (futex).
//
// A reminder: compare-and-swap cas(addr, old, new) does
// if(*addr == old) { *addr = new; return 1; }
// else return 0;
// but atomically.
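
For reference, a minimal compilable sketch of a cas with exactly these semantics, written with the GCC/Clang __sync builtin (the builtin is an assumption of this sketch; the runtime's real cas is hand-written assembly):

#include <stdint.h>

// if(*addr == old) { *addr = new; return 1; } else return 0; -- atomically
static int32_t
cas(uint32_t *addr, uint32_t old, uint32_t new)
{
	return __sync_bool_compare_and_swap(addr, old, new);
}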
void
lock(Lock *l)
{
uint32 v;
if(l->key != 0) *(int32*)0x1001 = 0x1001;
l->key = 1;
return;
for(;;){
// Try for lock. If we incremented it from 0 to 1, we win.
if((v=xadd(&l->key, 1)) == 1)
again:
v = l->key;
if((v&1) == 0){
if(cas(&l->key, v, v|1)){
// Lock wasn't held; we grabbed it.
return;
// We lose. It was already >=1 and is now >=2.
// Use futex to atomically check that the value is still
// what we think it is and go to sleep.
efutex(&l->key, FUTEX_WAIT, v, &longtime);
}
goto again;
}
// Lock was held; try to add ourselves to the waiter count.
if(!cas(&l->key, v, v+2))
goto again;
// We're accounted for, now sleep in the kernel.
//
// We avoid the obvious lock/unlock race because
// the kernel won't put us to sleep if l->key has
// changed underfoot and is no longer v+2.
//
// We only really care that (v&1) == 1 (the lock is held),
// and in fact there is a futex variant that could
// accommodate that check, but let's not get carried away.
futexsleep(&l->key, v+2);
// We're awake: remove ourselves from the count.
for(;;){
v = l->key;
if(v < 2)
throw("bad lock key");
if(cas(&l->key, v, v-2))
break;
}
// Try for the lock again.
goto again;
}
void
@ -280,68 +313,54 @@ unlock(Lock *l)
{
uint32 v;
if(l->key != 1) *(int32*)0x1002 = 0x1002;
l->key = 0;
return;
// Atomically get value and clear lock bit.
again:
v = l->key;
if((v&1) == 0)
throw("unlock of unlocked lock");
if(!cas(&l->key, v, v&~1))
goto again;
// Unlock the lock. If we decremented from 1 to 0, wasn't contended.
if((v=xadd(&l->key, -1)) == 0)
return;
// The lock was contended. Mark it as unlocked and wake a waiter.
l->key = 0;
efutex(&l->key, FUTEX_WAKE, 1, nil);
// If there were waiters, wake one.
if(v & ~1)
futexwakeup(&l->key);
}
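
To see the whole algorithm in one place, here is a hedged, self-contained sketch of the same waiter-counting futex lock in C11 (stdatomic plus the raw futex syscall). It mirrors the logic above but is an illustration, not the runtime's code, which uses its own cas and the futexsleep/futexwakeup wrappers:

#include <stdatomic.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void
futexsleep(atomic_uint *addr, unsigned val)
{
	// The kernel atomically rechecks *addr == val before sleeping,
	// which closes the race with unlock.
	syscall(SYS_futex, addr, FUTEX_WAIT, val, 0, 0, 0);
}

static void
futexwakeup(atomic_uint *addr)
{
	syscall(SYS_futex, addr, FUTEX_WAKE, 1, 0, 0, 0);
}

void
lock(atomic_uint *key)
{
	unsigned v;

	for(;;){
		v = atomic_load(key);
		if((v&1) == 0){
			// Lock not held: try to set the low bit.
			if(atomic_compare_exchange_weak(key, &v, v|1))
				return;
			continue;
		}
		// Lock held: add ourselves to the waiter count
		// (the upper 31 bits), then sleep in the kernel.
		if(!atomic_compare_exchange_weak(key, &v, v+2))
			continue;
		futexsleep(key, v+2);
		// Awake: remove ourselves from the count and retry.
		for(;;){
			v = atomic_load(key);
			if(atomic_compare_exchange_weak(key, &v, v-2))
				break;
		}
	}
}

void
unlock(atomic_uint *key)
{
	unsigned v;

	// Clear the held (low) bit.
	do
		v = atomic_load(key);
	while(!atomic_compare_exchange_weak(key, &v, v&~1u));
	// If the waiter count was nonzero, wake one waiter.
	if(v & ~1u)
		futexwakeup(key);
}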
// Sleep and wakeup (see description in runtime.h)
// One-time notifications.
//
// Since the lock/unlock implementation already
// takes care of sleeping in the kernel, we just reuse it.
// (But it's a weird use, so it gets its own interface.)
//
// We use a lock to represent the event:
// unlocked == event has happened.
// Thus the lock starts out locked, and to wait for the
// event you try to lock the lock. To signal the event,
// you unlock the lock.
void
rsleep(Rendez *r)
noteclear(Note *n)
{
// Record that we're about to go to sleep and drop the lock.
r->sleeping = 1;
unlock(r->l);
// Go to sleep if r->sleeping is still 1.
efutex(&r->sleeping, FUTEX_WAIT, 1, &longtime);
// Reacquire the lock.
lock(r->l);
n->lock.key = 0; // memset(n, 0, sizeof *n)
lock(&n->lock);
}
void
rwakeup(Rendez *r)
notewakeup(Note *n)
{
if(!r->sleeping)
return;
// Clear the sleeping flag in case sleeper
// is between unlock and futex.
r->sleeping = 0;
// Wake up if actually made it to sleep.
efutex(&r->sleeping, FUTEX_WAKE, 1, nil);
unlock(&n->lock);
}
// Like rwakeup(r), unlock(r->l), but drops the lock before
// waking the other proc. This reduces bouncing back and forth
// in the scheduler: the first thing the other proc wants to do
// is acquire r->l, so it helps to unlock it before we wake him.
void
rwakeupandunlock(Rendez *r)
notesleep(Note *n)
{
int32 wassleeping;
if(!r->sleeping){
unlock(r->l);
return;
}
r->sleeping = 0;
unlock(r->l);
efutex(&r->sleeping, FUTEX_WAKE, 1, nil);
lock(&n->lock);
unlock(&n->lock); // Let other sleepers find out too.
}
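
A hedged usage sketch of the Note interface (the worker/waiter split is illustrative, not from this CL; the declarations match runtime.h):

Note done;

void
waiter(void)
{
	noteclear(&done);	// initialize: event has not happened
	// ... start work on another m ...
	notesleep(&done);	// blocks until notewakeup fires; any
				// notesleep after that returns at once
}

void
worker(void)
{
	// ... do the work ...
	notewakeup(&done);	// fire the event; called exactly once
}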
// Clone, the Linux rfork.
enum
{
CLONE_VM = 0x100,
@ -365,7 +384,7 @@ enum
};
void
newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
newosproc(M *m, G *g, void *stk, void (*fn)(void))
{
int64 ret;
int32 flags;
@ -382,20 +401,18 @@ newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg)
if(0){
prints("newosproc stk=");
sys·printpointer(stk);
prints(" mm=");
sys·printpointer(mm);
prints(" gg=");
sys·printpointer(gg);
prints(" m=");
sys·printpointer(m);
prints(" g=");
sys·printpointer(g);
prints(" fn=");
sys·printpointer(fn);
prints(" arg=");
sys·printpointer(arg);
prints(" clone=");
sys·printpointer(clone);
prints("\n");
}
ret = clone(flags, stk, mm, gg, fn, arg);
ret = clone(flags, stk, m, g, fn);
if(ret < 0)
*(int32*)123 = 123;
}

View File

@ -71,6 +71,7 @@ rnd(uint32 n, uint32 m)
return n;
}
// Convenient wrapper around mmap.
static void*
brk(uint32 n)
{
@ -81,12 +82,15 @@ brk(uint32 n)
return v;
}
// Allocate n bytes of memory. Note that this gets used
// to allocate new stack segments, so at each call to a function
// you have to ask yourself "would it be okay to call mal recursively
// right here?" The answer is yes unless we're in the middle of
// editing the malloc state in m->mem.
void*
mal(uint32 n)
{
byte* v;
Mem *mem;
// round to keep everything 64-bit aligned
n = rnd(n, 8);
@ -94,17 +98,19 @@ mal(uint32 n)
// be careful. calling any function might invoke
// mal to allocate more stack.
if(n > NHUNK) {
// this call is okay - calling mal recursively
// won't change anything we depend on.
v = brk(n);
} else {
// allocate a new hunk if this one is too small
if(n > m->mem.nhunk) {
// better not to call brk here - it might grow the stack,
// causing a call to mal and the allocation of a
// new hunk behind our backs. then we'd toss away
// almost all of that new hunk and replace it.
// that'd just be a memory leak - the code would still run.
// here we're in the middle of editing m->mem
// (we're about to overwrite m->mem.hunk),
// so we can't call brk - it might call mal to grow the
// stack, and the recursive call would allocate a new
// hunk, and then once brk returned we'd immediately
// overwrite that hunk with our own.
// (the net result would be a memory leak, not a crash.)
// so we have to call sys·mmap directly - it is written
// in assembly and tagged not to grow the stack.
m->mem.hunk =
sys·mmap(nil, NHUNK, PROT_READ|PROT_WRITE,
MAP_ANON|MAP_PRIVATE, 0, 0);
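
A hedged sketch of the hunk pattern this comment is protecting: small allocations are carved out of a per-m chunk, and the refill path goes straight to sys·mmap so nothing in it can recursively call mal. Field names follow Mem in runtime.h; the helper itself is illustrative, not the CL's code:

static byte*
hunkalloc(Mem *mem, uint32 n)
{
	byte *p;

	n = rnd(n, 8);	// keep everything 64-bit aligned
	if(n > mem->nhunk){
		// Refill straight from the kernel; sys·mmap is assembly
		// tagged not to grow the stack, so it cannot re-enter mal.
		mem->hunk = sys·mmap(nil, NHUNK, PROT_READ|PROT_WRITE,
			MAP_ANON|MAP_PRIVATE, 0, 0);
		mem->nhunk = NHUNK;
	}
	p = mem->hunk;
	mem->hunk += n;
	mem->nhunk -= n;
	return p;
}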
@ -136,7 +142,7 @@ hashmap(Sigi *si, Sigs *ss)
byte *sname, *iname;
Map *m;
h = ((uint32)si + (uint32)ss) % nelem(hash);
h = ((uint32)(uint64)si + (uint32)(uint64)ss) % nelem(hash);
for(m=hash[h]; m!=nil; m=m->link) {
if(m->si == si && m->ss == ss) {
if(m->bad) {
@ -301,9 +307,9 @@ enum
NANSIGN = 1<<31,
};
static uint64 uvnan = 0x7FF0000000000001;
static uint64 uvinf = 0x7FF0000000000000;
static uint64 uvneginf = 0xFFF0000000000000;
static uint64 uvnan = 0x7FF0000000000001ULL;
static uint64 uvinf = 0x7FF0000000000000ULL;
static uint64 uvneginf = 0xFFF0000000000000ULL;
static int32
isInf(float64 d, int32 sign)
@ -338,7 +344,7 @@ isNaN(float64 d)
uint64 x;
x = *(uint64*)&d;
return ((uint32)x>>32)==0x7FF00000 && !isInf(d, 0);
return (uint32)(x>>32)==0x7FF00000 && !isInf(d, 0);
}
static float64
@ -424,7 +430,7 @@ modf(float64 d, float64 *ip)
return d - dd;
}
// func frexp(float64) (float64, int32); // break fp into exp,fract
// func frexp(float64) (float64, int32); // break fp into exp,frac
void
sys·frexp(float64 din, float64 dou, int32 iou)
{
@ -432,7 +438,7 @@ sys·frexp(float64 din, float64 dou, int32 iou)
FLUSH(&dou);
}
//func ldexp(int32, float64) float64; // make fp from exp,fract
//func ldexp(int32, float64) float64; // make fp from exp,frac
void
sys·ldexp(float64 din, int32 ein, float64 dou)
{
@ -441,7 +447,7 @@ sys·ldexp(float64 din, int32 ein, float64 dou)
}
//func modf(float64) (float64, float64); // break fp into double+double
float64
void
sys·modf(float64 din, float64 integer, float64 fraction)
{
fraction = modf(din, &integer);
@ -593,6 +599,7 @@ out:
FLUSH(&s);
}
void
check(void)
{
int8 a;
@ -638,18 +645,6 @@ check(void)
initsig();
}
uint32
xadd(uint32 *val, uint32 delta)
{
uint32 v;
for(;;){
v = *val;
if(cas(val, v, v+delta))
return v+delta;
}
}
/*
* map and chan helpers for
* dealing with unknown types
@ -657,6 +652,7 @@ xadd(uint32 *val, uint32 delta)
static uint64
memhash(uint32 s, void *a)
{
USED(s, a);
prints("memhash\n");
return 0x12345;
}
@ -718,6 +714,7 @@ memcopy(uint32 s, void *a, void *b)
static uint64
stringhash(uint32 s, string *a)
{
USED(s, a);
prints("stringhash\n");
return 0x12345;
}
@ -725,18 +722,21 @@ stringhash(uint32 s, string *a)
static uint32
stringequal(uint32 s, string *a, string *b)
{
USED(s);
return cmpstring(*a, *b) == 0;
}
static void
stringprint(uint32 s, string *a)
{
USED(s);
sys·printstring(*a);
}
static void
stringcopy(uint32 s, string *a, string *b)
{
USED(s);
if(b == nil) {
*a = nil;
return;
@ -747,6 +747,7 @@ stringcopy(uint32 s, string *a, string *b)
static uint64
pointerhash(uint32 s, void **a)
{
USED(s, a);
prints("pointerhash\n");
return 0x12345;
}
@ -754,6 +755,7 @@ pointerhash(uint32 s, void **a)
static uint32
pointerequal(uint32 s, void **a, void **b)
{
USED(s, a, b);
prints("pointerequal\n");
return 0;
}
@ -761,12 +763,14 @@ pointerequal(uint32 s, void **a, void **b)
static void
pointerprint(uint32 s, void **a)
{
USED(s, a);
prints("pointerprint\n");
}
static void
pointercopy(uint32 s, void **a, void **b)
{
USED(s);
if(b == nil) {
*a = nil;
return;
@ -777,8 +781,8 @@ pointercopy(uint32 s, void **a, void **b)
Alg
algarray[3] =
{
{ &memhash, &memequal, &memprint, &memcopy }, // 0
{ &stringhash, &stringequal, &stringprint, &stringcopy }, // 1
// { &pointerhash, &pointerequal, &pointerprint, &pointercopy }, // 2
{ &memhash, &memequal, &memprint, &memcopy }, // 2 - treat pointers as ints
{ memhash, memequal, memprint, memcopy }, // 0
{ stringhash, stringequal, stringprint, stringcopy }, // 1
// { pointerhash, pointerequal, pointerprint, pointercopy }, // 2
{ memhash, memequal, memprint, memcopy }, // 2 - treat pointers as ints
};

View File

@ -43,7 +43,7 @@ typedef struct M M;
typedef struct Stktop Stktop;
typedef struct Alg Alg;
typedef struct Lock Lock;
typedef struct Rendez Rendez;
typedef struct Note Note;
typedef struct Mem Mem;
/*
@ -62,6 +62,7 @@ enum
Grunnable,
Grunning,
Gwaiting,
Gmoribund,
Gdead,
};
enum
@ -77,10 +78,9 @@ struct Lock
{
uint32 key;
};
struct Rendez
struct Note
{
Lock* l;
uint32 sleeping; // someone is sleeping (Linux)
Lock lock;
};
struct String
{
@ -124,8 +124,8 @@ struct G
int16 status;
int32 goid;
int32 selgen; // valid sudog pointer
G* runlink;
Lock runlock;
G* schedlink;
Note stopped;
M* m; // for debuggers
};
struct Mem
@ -147,9 +147,10 @@ struct M
byte* moresp;
int32 siz1;
int32 siz2;
Rendez waitr;
M* waitlink;
int32 pid; // for debuggers
Note havenextg;
G* nextg;
M* schedlink;
int32 procid; // for debuggers
Mem mem;
};
struct Stktop
@ -224,36 +225,34 @@ int32 write(int32, void*, int32);
void close(int32);
int32 fstat(int32, void*);
bool cas(uint32*, uint32, uint32);
uint32 xadd(uint32*, uint32);
void exit1(int32);
void ready(G*);
byte* getenv(int8*);
int32 atoi(byte*);
void newosproc(M *mm, G *gg, void *stk, void (*fn)(void*), void *arg);
void newosproc(M *m, G *g, void *stk, void (*fn)(void));
int32 getprocid(void);
/*
* mutual exclusion locks. in the uncontended case,
* as fast as spin locks (just a few user-level instructions),
* but on the contention path they sleep in the kernel.
* a zeroed Lock is unlocked (no need to initialize each lock).
*/
void lock(Lock*);
void unlock(Lock*);
void lockinit(Lock*);
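
A hedged usage sketch (names illustrative): because a zeroed Lock is already unlocked, a shared global needs no setup before being guarded:

static Lock countlock;	// zero value == unlocked
static int32 counter;

void
incr(void)
{
	lock(&countlock);
	counter++;
	unlock(&countlock);
}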
/*
* sleep and wakeup.
* a Rendez is somewhere to sleep. it is protected by the lock r->l.
* the caller must acquire r->l, check the condition, and if the
* condition is false, call rsleep. rsleep will atomically drop the lock
* and go to sleep. a subsequent rwakeup (caller must hold r->l)
* will wake up the guy who is rsleeping. the lock keeps rsleep and
* rwakeup from missing each other.
* n.b. only one proc can rsleep on a given rendez at a time.
* sleep and wakeup on one-time events.
* before any calls to notesleep or notewakeup,
* must call noteclear to initialize the Note.
* then, any number of threads can call notesleep
* and exactly one thread can call notewakeup (once).
* once notewakeup has been called, all the notesleeps
* will return. future notesleeps will return immediately.
*/
void rsleep(Rendez*);
void rwakeup(Rendez*);
void rwakeupandunlock(Rendez*);
void noteclear(Note*);
void notesleep(Note*);
void notewakeup(Note*);
/*
* low level go -called

View File

@ -45,8 +45,6 @@ out:
static void
prbounds(int8* s, int32 a, int32 b, int32 c)
{
int32 i;
prints(s);
prints(" ");
sys·printint(a);
@ -115,7 +113,6 @@ strcmp(byte *s1, byte *s2)
void
sys·slicestring(string si, int32 lindex, int32 hindex, string so)
{
string s, str;
int32 l;
if(si == nil)
@ -154,8 +151,6 @@ sys·indexstring(string s, int32 i, byte b)
void
sys·intstring(int64 v, string s)
{
int32 l;
s = mal(sizeof(s->len)+8);
s->len = runetochar(s->str, v);
FLUSH(&s);

View File

@ -7,21 +7,24 @@
//
// TODO(rsc): Either sys·exit or exit1 is wrong!
TEXT sys·exit(SB),1,$-8
// It looks like sys·exit is correct (exits the entire program)
// and exit1 should be mimicking the OS X library routine
// __bsdthread_terminate.
TEXT sys·exit(SB),7,$-8
MOVL 8(SP), DI // arg 1 exit status
MOVL $(0x2000000+1), AX // syscall entry
SYSCALL
CALL notok(SB)
RET
TEXT exit1(SB),1,$-8
TEXT exit1(SB),7,$-8
MOVL 8(SP), DI // arg 1 exit status
MOVL $(0x2000000+1), AX // syscall entry
SYSCALL
CALL notok(SB)
RET
TEXT sys·write(SB),1,$-8
TEXT sys·write(SB),7,$-8
MOVL 8(SP), DI // arg 1 fid
MOVQ 16(SP), SI // arg 2 buf
MOVL 24(SP), DX // arg 3 count
@ -31,7 +34,7 @@ TEXT sys·write(SB),1,$-8
CALL notok(SB)
RET
TEXT open(SB),1,$-8
TEXT open(SB),7,$-8
MOVQ 8(SP), DI
MOVL 16(SP), SI
MOVL 20(SP), DX
@ -40,20 +43,20 @@ TEXT open(SB),1,$-8
SYSCALL
RET
TEXT close(SB),1,$-8
TEXT close(SB),7,$-8
MOVL 8(SP), DI
MOVL $(0x2000000+6), AX // syscall entry
SYSCALL
RET
TEXT fstat(SB),1,$-8
TEXT fstat(SB),7,$-8
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL $(0x2000000+339), AX // syscall entry; really fstat64
SYSCALL
RET
TEXT read(SB),1,$-8
TEXT read(SB),7,$-8
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
@ -61,7 +64,7 @@ TEXT read(SB),1,$-8
SYSCALL
RET
TEXT write(SB),1,$-8
TEXT write(SB),7,$-8
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
@ -69,7 +72,7 @@ TEXT write(SB),1,$-8
SYSCALL
RET
TEXT sys·sigaction(SB),1,$-8
TEXT sys·sigaction(SB),7,$-8
MOVL 8(SP), DI // arg 1 sig
MOVQ 16(SP), SI // arg 2 act
MOVQ 24(SP), DX // arg 3 oact
@ -81,7 +84,7 @@ TEXT sys·sigaction(SB),1,$-8
CALL notok(SB)
RET
TEXT sigtramp(SB),1,$24
TEXT sigtramp(SB),7,$24
MOVL DX,0(SP)
MOVQ CX,8(SP)
MOVQ R8,16(SP)
@ -101,7 +104,7 @@ TEXT sys·mmap(SB),7,$-8
CALL notok(SB)
RET
TEXT notok(SB),1,$-8
TEXT notok(SB),7,$-8
MOVL $0xf1, BP
MOVQ BP, (BP)
RET
@ -117,12 +120,12 @@ TEXT sys·memclr(SB),7,$-8
STOSQ
RET
TEXT sys·getcallerpc+0(SB),1,$0
TEXT sys·getcallerpc+0(SB),7,$0
MOVQ x+0(FP),AX // addr of first arg
MOVQ -8(AX),AX // get calling pc
RET
TEXT sys·setcallerpc+0(SB),1,$0
TEXT sys·setcallerpc+0(SB),7,$0
MOVQ x+0(FP),AX // addr of first arg
MOVQ x+8(FP), BX
MOVQ BX, -8(AX) // set calling pc

View File

@ -6,19 +6,19 @@
// System calls and other sys.stuff for AMD64, Linux
//
TEXT sys·exit(SB),1,$0-8
TEXT sys·exit(SB),7,$0-8
MOVL 8(SP), DI
MOVL $231, AX // force all os threads to exit
MOVL $231, AX // exitgroup - force all os threads to exit
SYSCALL
RET
TEXT exit1(SB),1,$0-8
TEXT exit1(SB),7,$0-8
MOVL 8(SP), DI
MOVL $60, AX // exit the current os thread
MOVL $60, AX // exit - exit the current os thread
SYSCALL
RET
TEXT open(SB),1,$0-16
TEXT open(SB),7,$0-16
MOVQ 8(SP), DI
MOVL 16(SP), SI
MOVL 20(SP), DX
@ -26,20 +26,20 @@ TEXT open(SB),1,$0-16
SYSCALL
RET
TEXT close(SB),1,$0-8
TEXT close(SB),7,$0-8
MOVL 8(SP), DI
MOVL $3, AX // syscall entry
SYSCALL
RET
TEXT fstat(SB),1,$0-16
TEXT fstat(SB),7,$0-16
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL $5, AX // syscall entry
SYSCALL
RET
TEXT read(SB),1,$0-24
TEXT read(SB),7,$0-24
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
@ -47,7 +47,7 @@ TEXT read(SB),1,$0-24
SYSCALL
RET
TEXT write(SB),1,$0-24
TEXT write(SB),7,$0-24
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
@ -55,7 +55,7 @@ TEXT write(SB),1,$0-24
SYSCALL
RET
TEXT sys·write(SB),1,$0-24
TEXT sys·write(SB),7,$0-24
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
@ -63,7 +63,7 @@ TEXT sys·write(SB),1,$0-24
SYSCALL
RET
TEXT sys·rt_sigaction(SB),1,$0-32
TEXT sys·rt_sigaction(SB),7,$0-32
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVQ 24(SP), DX
@ -72,7 +72,7 @@ TEXT sys·rt_sigaction(SB),1,$0-32
SYSCALL
RET
TEXT sigtramp(SB),1,$24-16
TEXT sigtramp(SB),7,$24-16
MOVQ DI,0(SP)
MOVQ SI,8(SP)
MOVQ DX,16(SP)
@ -118,20 +118,20 @@ TEXT sys·memclr(SB),7,$0-16
STOSQ
RET
TEXT sys·getcallerpc+0(SB),1,$0
TEXT sys·getcallerpc+0(SB),7,$0
MOVQ x+0(FP),AX // addr of first arg
MOVQ -8(AX),AX // get calling pc
RET
TEXT sys·setcallerpc+0(SB),1,$0
TEXT sys·setcallerpc+0(SB),7,$0
MOVQ x+0(FP),AX // addr of first arg
MOVQ x+8(FP), BX
MOVQ BX, -8(AX) // set calling pc
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
// struct timespec *timeout, int32 *uaddr2, int32 val2);
TEXT futex(SB),1,$0
TEXT futex(SB),7,$0
MOVQ 8(SP), DI
MOVL 16(SP), SI
MOVL 20(SP), DX
@ -142,17 +142,16 @@ TEXT futex(SB),1,$0
SYSCALL
RET
// int64 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void*), void *arg);
// int64 clone(int32 flags, void *stack, M *m, G *g, void (*fn)(void));
TEXT clone(SB),7,$0
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL flags+8(SP), DI
MOVQ stack+16(SP), SI
// Copy m, g, fn, arg off parent stack for use by child.
// Copy m, g, fn off parent stack for use by child.
// Careful: Linux system call clobbers CX and R11.
MOVQ 24(SP), R8
MOVQ 32(SP), R9
MOVQ 40(SP), R12
MOVQ 48(SP), R13
MOVQ m+24(SP), R8
MOVQ g+32(SP), R9
MOVQ fn+40(SP), R12
MOVL $56, AX
SYSCALL
@ -162,21 +161,20 @@ TEXT clone(SB),7,$0
JEQ 2(PC)
RET
// In child, call fn(arg) on new stack
// In child, call fn on new stack
MOVQ SI, SP
MOVQ R8, R14 // m
MOVQ R9, R15 // g
PUSHQ R13
CALL R12
// It shouldn't return. If it does, exit
MOVL $111, DI
MOVL $60, AX
SYSCALL
JMP -3(PC) // keep exiting
// int64 select(int32, void*, void*, void*, void*)
TEXT select(SB),1,$0
TEXT select(SB),7,$0
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVQ 24(SP), DX
@ -187,14 +185,14 @@ TEXT select(SB),1,$0
RET
// Linux allocates each thread its own pid, like Plan 9.
// But the getpid() system call returns the pid of the
// original thread (the one that exec started with),
// no matter which thread asks. This system call,
// which Linux calls gettid, returns the actual pid of
// the calling thread, not the fake one.
//
// int32 getprocid(void)
TEXT getprocid(SB),1,$0
TEXT getprocid(SB),7,$0
MOVL $186, AX
SYSCALL
RET
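
For comparison, a hedged C sketch of the same call made through the generic syscall(2) interface instead of hand-written assembly (SYS_gettid is the same syscall number, 186 on x86-64):

#include <sys/syscall.h>
#include <unistd.h>

static int
getprocid(void)
{
	// gettid returns the calling thread's own id; getpid would
	// return the original thread's id no matter which thread asks.
	return (int)syscall(SYS_gettid);
}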