go/src/runtime/race_amd64.s

// Copyright 2013 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build race

#include "go_asm.h"
#include "go_tls.h"
#include "funcdata.h"
#include "textflag.h"

// The following thunks allow calling the gcc-compiled race runtime directly
// from Go code without going all the way through cgo.
// First, it's much faster (up to 50% speedup for real Go programs).
// Second, it eliminates race-related special cases from cgocall and scheduler.
// Third, in the long term it will allow removing the cyclic runtime/race dependency on cmd/go.

// A brief recap of the amd64 calling convention.
// Arguments are passed in DI, SI, DX, CX, R8, R9, the rest is on stack.
// Callee-saved registers are: BX, BP, R12-R15.
// SP must be 16-byte aligned.
// On Windows:
// Arguments are passed in CX, DX, R8, R9, the rest is on stack.
// Callee-saved registers are: BX, BP, DI, SI, R12-R15.
// SP must be 16-byte aligned. Windows also requires "stack-backing" for the 4 register arguments:
// http://msdn.microsoft.com/en-us/library/ms235286.aspx
// We do not do this, because it seems to be intended for vararg/unprototyped functions.
// Gcc-compiled race runtime does not try to use that space.

#ifdef GOOS_windows
#define RARG0 CX
#define RARG1 DX
#define RARG2 R8
#define RARG3 R9
#else
#define RARG0 DI
#define RARG1 SI
#define RARG2 DX
#define RARG3 CX
#endif

// func runtime·raceread(addr uintptr)
// Called from instrumented code.
// Reports a read of addr to the race runtime. Sets up
//   AX    = __tsan_read
//   RARG1 = addr
//   RARG2 = pc, taken from (SP)
// and tail-jumps to racecalladdr<>, which fills in RARG0.
TEXT	runtime·raceread(SB), NOSPLIT, $0-8
	// void __tsan_read(ThreadState *thr, void *addr, void *pc);
	MOVQ	$__tsan_read(SB), AX
	MOVQ	(SP), RARG2	// caller pc (see RaceRead, which relies on this)
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·RaceRead(addr uintptr)
// Forwards to runtime·raceread with the argument frame untouched.
TEXT	runtime·RaceRead(SB), NOSPLIT, $0-8
	// This needs to be a tail call, because raceread reads caller pc.
	// raceread takes that pc from (SP); a JMP leaves our caller's return
	// address there, while a CALL would push an address inside RaceRead.
	JMP	runtime·raceread(SB)

// void runtime·racereadpc(void *addr, void *callpc, void *pc)
// Reports a read of addr where both pcs are passed in as arguments
// rather than read from the stack. RARG0 is filled in by racecalladdr<>.
TEXT	runtime·racereadpc(SB), NOSPLIT, $0-24
	// void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
	MOVQ	$__tsan_read_pc(SB), AX
	MOVQ	pc+16(FP), RARG3
	MOVQ	callpc+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·racewrite(addr uintptr)
// Called from instrumented code.
// Reports a write of addr to the race runtime. Sets up
//   AX    = __tsan_write
//   RARG1 = addr
//   RARG2 = pc, taken from (SP)
// and tail-jumps to racecalladdr<>, which fills in RARG0.
TEXT	runtime·racewrite(SB), NOSPLIT, $0-8
	// void __tsan_write(ThreadState *thr, void *addr, void *pc);
	MOVQ	$__tsan_write(SB), AX
	MOVQ	(SP), RARG2	// caller pc (see RaceWrite, which relies on this)
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·RaceWrite(addr uintptr)
// Forwards to runtime·racewrite with the argument frame untouched.
TEXT	runtime·RaceWrite(SB), NOSPLIT, $0-8
	// This needs to be a tail call, because racewrite reads caller pc.
	// racewrite takes that pc from (SP); a JMP leaves our caller's return
	// address there, while a CALL would push an address inside RaceWrite.
	JMP	runtime·racewrite(SB)

// void runtime·racewritepc(void *addr, void *callpc, void *pc)
// Reports a write of addr where both pcs are passed in as arguments
// rather than read from the stack. RARG0 is filled in by racecalladdr<>.
TEXT	runtime·racewritepc(SB), NOSPLIT, $0-24
	// void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
	MOVQ	$__tsan_write_pc(SB), AX
	MOVQ	pc+16(FP), RARG3
	MOVQ	callpc+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·racereadrange(addr, size uintptr)
// Called from instrumented code.
// Reports a read of size bytes starting at addr. Sets up
//   AX    = __tsan_read_range
//   RARG1 = addr
//   RARG2 = size
//   RARG3 = pc, taken from (SP)
// and tail-jumps to racecalladdr<>, which fills in RARG0.
TEXT	runtime·racereadrange(SB), NOSPLIT, $0-16
	// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
	MOVQ	$__tsan_read_range(SB), AX
	MOVQ	(SP), RARG3	// caller pc (see RaceReadRange, which relies on this)
	MOVQ	size+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·RaceReadRange(addr, size uintptr)
// Forwards to runtime·racereadrange with the argument frame untouched.
TEXT	runtime·RaceReadRange(SB), NOSPLIT, $0-16
	// This needs to be a tail call, because racereadrange reads caller pc.
	// racereadrange takes that pc from (SP); a JMP leaves our caller's return
	// address there, while a CALL would push an address inside RaceReadRange.
	JMP	runtime·racereadrange(SB)

// void runtime·racereadrangepc1(void *addr, uintptr sz, void *pc)
// Range-read report with the pc passed in as the third argument instead of
// being read from the stack. RARG0 is filled in by racecalladdr<>.
TEXT	runtime·racereadrangepc1(SB), NOSPLIT, $0-24
	// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
	MOVQ	$__tsan_read_range(SB), AX
	MOVQ	pc+16(FP), RARG3
	MOVQ	size+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·racewriterange(addr, size uintptr)
// Called from instrumented code.
// Reports a write of size bytes starting at addr. Sets up
//   AX    = __tsan_write_range
//   RARG1 = addr
//   RARG2 = size
//   RARG3 = pc, taken from (SP)
// and tail-jumps to racecalladdr<>, which fills in RARG0.
TEXT	runtime·racewriterange(SB), NOSPLIT, $0-16
	// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
	MOVQ	$__tsan_write_range(SB), AX
	MOVQ	(SP), RARG3	// caller pc (see RaceWriteRange, which relies on this)
	MOVQ	size+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// func runtime·RaceWriteRange(addr, size uintptr)
// Forwards to runtime·racewriterange with the argument frame untouched.
TEXT	runtime·RaceWriteRange(SB), NOSPLIT, $0-16
	// This needs to be a tail call, because racewriterange reads caller pc.
	// racewriterange takes that pc from (SP); a JMP leaves our caller's return
	// address there, while a CALL would push an address inside RaceWriteRange.
	JMP	runtime·racewriterange(SB)

// void runtime·racewriterangepc1(void *addr, uintptr sz, void *pc)
// Range-write report with the pc passed in as the third argument instead of
// being read from the stack. RARG0 is filled in by racecalladdr<>.
TEXT	runtime·racewriterangepc1(SB), NOSPLIT, $0-24
	// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
	MOVQ	$__tsan_write_range(SB), AX
	MOVQ	pc+16(FP), RARG3
	MOVQ	size+8(FP), RARG2
	MOVQ	addr+0(FP), RARG1
	JMP	racecalladdr<>(SB)

// If addr (RARG1) is out of range, do nothing.
// Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
// On entry: AX = race runtime function to call, RARG1 = addr, and any
// remaining arguments already in RARG2/RARG3.
// The thunks in this file reach this function with JMP, so the RET on the
// out-of-range path returns directly to the thunk's caller.
// Writes R12, R14 and RARG0.
TEXT	racecalladdr<>(SB), NOSPLIT, $0-0
	get_tls(R12)
	MOVQ	g(R12), R14
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
	// All comparisons below are unsigned (JB/JAE).
	CMPQ	RARG1, runtime·racearenastart(SB)
	JB	data
	CMPQ	RARG1, runtime·racearenaend(SB)
	JB	call
	// Reached when addr is below arenastart or at/after arenaend:
	// fall back to the data range check.
data:
	CMPQ	RARG1, runtime·racedatastart(SB)
	JB	ret
	CMPQ	RARG1, runtime·racedataend(SB)
	JAE	ret
call:
	MOVQ	AX, AX		// w/o this 6a miscompiles this function
	JMP	racecall<>(SB)
ret:
	RET

// func runtime·racefuncenter(pc uintptr)
// Called from instrumented code.
// Reports function entry to the race runtime and returns with DX holding
// the same value it had on entry.
TEXT	runtime·racefuncenter(SB), NOSPLIT, $0-8
	// Stash the function entry context (for closures) before anything else:
	// RARG1 is DX on Windows, so the argument load below would destroy it.
	MOVQ	DX, R15
	// void __tsan_func_enter(ThreadState *thr, void *pc);
	MOVQ	$__tsan_func_enter(SB), AX
	MOVQ	callpc+0(FP), RARG1
	get_tls(R12)
	MOVQ	g(R12), R14
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	// A real CALL (not a tail jump) so that DX can be restored afterwards.
	// racecall<> preserves R15
	CALL	racecall<>(SB)
	MOVQ	R15, DX	// restore function entry context
	RET

// func runtime·racefuncexit()
// Called from instrumented code.
// Reports function exit to the race runtime; the only argument passed is
// the current goroutine's race context.
TEXT	runtime·racefuncexit(SB), NOSPLIT, $0-0
	// void __tsan_func_exit(ThreadState *thr);
	MOVQ	$__tsan_func_exit(SB), AX
	get_tls(R12)
	MOVQ	g(R12), R14
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	JMP	racecall<>(SB)

// Atomic operations for sync/atomic package.

// Load
// LoadInt32/LoadInt64 select the race runtime's 32- or 64-bit atomic load
// in AX and hand off to racecallatomic<>. The unsigned, uintptr and pointer
// variants tail-jump to the signed implementation of the same width.
// racecallatomic<> reads (SP) as pc, 8(SP) as the caller pc and 16(SP) as
// the start of the arguments, so it is entered with CALL from a function
// that was itself reached directly (or via JMP) from the Go caller.
TEXT	sync∕atomic·LoadInt32(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic32_load(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·LoadInt64(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic64_load(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·LoadUint32(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·LoadInt32(SB)

TEXT	sync∕atomic·LoadUint64(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·LoadInt64(SB)

// uintptr is handled as a 64-bit value in this amd64 file.
TEXT	sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·LoadInt64(SB)

// Pointers are handled as 64-bit values in this amd64 file.
TEXT	sync∕atomic·LoadPointer(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·LoadInt64(SB)

// Store
// StoreInt32/StoreInt64 select the race runtime's 32- or 64-bit atomic store
// in AX and hand off to racecallatomic<>. The unsigned and uintptr variants
// tail-jump to the signed implementation of the same width.
// racecallatomic<> reads (SP) as pc, 8(SP) as the caller pc and 16(SP) as
// the start of the arguments, which is why it is entered with CALL.
TEXT	sync∕atomic·StoreInt32(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic32_store(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·StoreInt64(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic64_store(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·StoreUint32(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·StoreInt32(SB)

TEXT	sync∕atomic·StoreUint64(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·StoreInt64(SB)

// uintptr is handled as a 64-bit value in this amd64 file.
TEXT	sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·StoreInt64(SB)

// Swap
// SwapInt32/SwapInt64 select the race runtime's 32- or 64-bit atomic
// exchange in AX and hand off to racecallatomic<>. The unsigned and uintptr
// variants tail-jump to the signed implementation of the same width.
// racecallatomic<> reads (SP) as pc, 8(SP) as the caller pc and 16(SP) as
// the start of the arguments, which is why it is entered with CALL.
TEXT	sync∕atomic·SwapInt32(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic32_exchange(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·SwapInt64(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic64_exchange(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·SwapUint32(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·SwapInt32(SB)

TEXT	sync∕atomic·SwapUint64(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·SwapInt64(SB)

// uintptr is handled as a 64-bit value in this amd64 file.
TEXT	sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·SwapInt64(SB)

// Add
// AddInt32/AddInt64 select the race runtime's 32- or 64-bit fetch_add in AX
// and hand off to racecallatomic<>. After it returns, the delta at
// add+8(FP) is added to the result slot at ret+16(FP), converting the
// fetch_add result into the add_fetch (new value) result.
// NOTE(review): this presumes the race runtime has stored the old value in
// the result slot through the argument pointer it receives — confirm
// against the race runtime.
// The unsigned and uintptr variants tail-jump to the signed implementation
// of the same width.
TEXT	sync∕atomic·AddInt32(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic32_fetch_add(SB), AX
	CALL	racecallatomic<>(SB)
	MOVL	add+8(FP), AX	// convert fetch_add to add_fetch
	ADDL	AX, ret+16(FP)
	RET

TEXT	sync∕atomic·AddInt64(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic64_fetch_add(SB), AX
	CALL	racecallatomic<>(SB)
	MOVQ	add+8(FP), AX	// convert fetch_add to add_fetch
	ADDQ	AX, ret+16(FP)
	RET

TEXT	sync∕atomic·AddUint32(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·AddInt32(SB)

TEXT	sync∕atomic·AddUint64(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·AddInt64(SB)

// uintptr is handled as a 64-bit value in this amd64 file.
TEXT	sync∕atomic·AddUintptr(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·AddInt64(SB)

// CompareAndSwap
// CompareAndSwapInt32/Int64 select the race runtime's 32- or 64-bit
// compare_exchange in AX and hand off to racecallatomic<>. The unsigned and
// uintptr variants tail-jump to the signed implementation of the same width.
// racecallatomic<> reads (SP) as pc, 8(SP) as the caller pc and 16(SP) as
// the start of the arguments, which is why it is entered with CALL.
TEXT	sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic32_compare_exchange(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-0
	MOVQ	$__tsan_go_atomic64_compare_exchange(SB), AX
	CALL	racecallatomic<>(SB)
	RET

TEXT	sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·CompareAndSwapInt32(SB)

TEXT	sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·CompareAndSwapInt64(SB)

// uintptr is handled as a 64-bit value in this amd64 file.
TEXT	sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-0
	JMP	sync∕atomic·CompareAndSwapInt64(SB)

// Generic atomic operation implementation.
// AX already contains target function.
// Stack layout on entry (this function is CALLed by a sync/atomic thunk):
//   (SP)   = return address into the thunk, passed to the race runtime as pc
//   8(SP)  = return address of the thunk's caller, passed as caller pc
//   16(SP) = start of the Go arguments; the first one is the address operand
// If the address is inside the arena or data range the target function is
// tail-called. Otherwise the same call is bracketed by
// __tsan_go_ignore_sync_begin/__tsan_go_ignore_sync_end.
// Writes R12-R15, RARG0-RARG3 and AX.
TEXT	racecallatomic<>(SB), NOSPLIT, $0-0
	// Trigger SIGSEGV early.
	MOVQ	16(SP), R12
	MOVL	(R12), R13
	// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
	CMPQ	R12, runtime·racearenastart(SB)
	JB	racecallatomic_data
	CMPQ	R12, runtime·racearenaend(SB)
	JB	racecallatomic_ok
racecallatomic_data:
	CMPQ	R12, runtime·racedatastart(SB)
	JB	racecallatomic_ignore
	CMPQ	R12, runtime·racedataend(SB)
	JAE	racecallatomic_ignore
racecallatomic_ok:
	// Addr is within the good range, call the atomic function.
	get_tls(R12)
	MOVQ	g(R12), R14
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	MOVQ	8(SP), RARG1	// caller pc
	MOVQ	(SP), RARG2	// pc
	LEAQ	16(SP), RARG3	// arguments
	JMP	racecall<>(SB)	// does not return
racecallatomic_ignore:
	// Addr is outside the good range.
	// Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op.
	// An attempt to synchronize on the address would cause crash.
	MOVQ	AX, R15	// remember the original function
	MOVQ	$__tsan_go_ignore_sync_begin(SB), AX
	// R12 still holds addr on this path (the get_tls under
	// racecallatomic_ok was skipped), so load the TLS base before using g(R12).
	get_tls(R12)
	MOVQ	g(R12), R14
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	CALL	racecall<>(SB)
	MOVQ	R15, AX	// restore the original function
	// Call the atomic function.
	// R14 is reloaded with g by racecall<> and is callee-saved in the C ABI,
	// so it is still the current g here.
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	MOVQ	8(SP), RARG1	// caller pc
	MOVQ	(SP), RARG2	// pc
	LEAQ	16(SP), RARG3	// arguments
	CALL	racecall<>(SB)
	// Call __tsan_go_ignore_sync_end.
	MOVQ	$__tsan_go_ignore_sync_end(SB), AX
	MOVQ	g_racectx(R14), RARG0	// goroutine context
	JMP	racecall<>(SB)

// void runtime·racecall(void(*f)(...), ...)
// Calls C function f from race runtime and passes up to 4 arguments to it.
// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments.
// Copies fn into AX and the four argument words at 8..32(FP) into
// RARG0..RARG3, then tail-jumps to racecall<>.
TEXT	runtime·racecall(SB), NOSPLIT, $0-0
	MOVQ	arg3+32(FP), RARG3
	MOVQ	arg2+24(FP), RARG2
	MOVQ	arg1+16(FP), RARG1
	MOVQ	arg0+8(FP), RARG0
	MOVQ	fn+0(FP), AX
	JMP	racecall<>(SB)

// Switches SP to g0 stack and calls (AX). Arguments already set.
// In: AX = function to call, RARG0-RARG3 = its arguments.
// Writes R10, R12, R13 and R14. The incoming SP is parked in R12 for the
// duration of the call and restored before returning.
TEXT	racecall<>(SB), NOSPLIT, $0-0
	get_tls(R12)
	MOVQ	g(R12), R14	// R14 = current g
	MOVQ	g_m(R14), R13	// R13 = current g's m
	// Switch to g0 stack.
	MOVQ	SP, R12		// callee-saved, preserved across the CALL
	MOVQ	m_g0(R13), R10
	CMPQ	R10, R14
	JE	call	// already on g0
	MOVQ	(g_sched+gobuf_sp)(R10), SP	// SP = sp saved in g0's sched gobuf
call:
	ANDQ	$~15, SP	// alignment for gcc ABI
	CALL	AX
	MOVQ	R12, SP	// undo both the stack switch and the alignment
	RET

// C->Go callback thunk that allows to call runtime·racesymbolize from C code.
// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g.
// The overall effect of Go->C->Go call chain is similar to that of mcall.
// The single argument arrives in RARG0 (C calling convention) and is passed
// on to runtime·racesymbolize on the stack.
TEXT	runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
	// Save callee-saved registers (Go code won't respect that).
	// This is superset of darwin/linux/windows registers.
	PUSHQ	BX
	PUSHQ	BP
	PUSHQ	DI
	PUSHQ	SI
	PUSHQ	R12
	PUSHQ	R13
	PUSHQ	R14
	PUSHQ	R15
	// Set g = g0.
	get_tls(R12)
	MOVQ	g(R12), R13
	MOVQ	g_m(R13), R13
	MOVQ	m_g0(R13), R14
	MOVQ	R14, g(R12)	// g = m->g0
	PUSHQ	RARG0	// func arg
	CALL	runtime·racesymbolize(SB)
	// Drop the pushed argument; the value popped into R12 is not used
	// (R12 is overwritten by get_tls just below).
	POPQ	R12
	// All registers are smashed after Go code, reload.
	get_tls(R12)
	MOVQ	g(R12), R13
	MOVQ	g_m(R13), R13
	MOVQ	m_curg(R13), R14
	MOVQ	R14, g(R12)	// g = m->curg
	// Restore callee-saved registers.
	// Pops are in the exact reverse order of the pushes above.
	POPQ	R15
	POPQ	R14
	POPQ	R13
	POPQ	R12
	POPQ	SI
	POPQ	DI
	POPQ	BP
	POPQ	BX
	RET
-												runtime: preserve DX during racefuncenter

R=golang-dev, dvyukov
CC=golang-dev
https://golang.org/cl/7382049

											
										
										
											2013-02-22 11:06:43 -07:00
+								// Copyright 2013 The Go Authors.  All rights reserved.
 								// Use of this source code is governed by a BSD-style
 								// license that can be found in the LICENSE file.
 								// +build race
-												[dev.cc] runtime: convert assembly files for C to Go transition

The main change is that #include "zasm_GOOS_GOARCH.h"
is now #include "go_asm.h" and/or #include "go_tls.h".

Also, because C StackGuard is now Go _StackGuard,
the assembly name changes from const_StackGuard to
const__StackGuard.

In asm_$GOARCH.s, add new function getg, formerly
implemented in C.

The renamed atomics now have Go wrappers, to get
escape analysis annotations right. Those wrappers
are in CL 174860043.

LGTM=r, aram
R=r, aram
CC=austin, dvyukov, golang-codereviews, iant, khr
https://golang.org/cl/168510043

											
										
										
											2014-11-11 15:06:22 -07:00
+								#include "go_asm.h"
 								#include "go_tls.h"
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+								#include "funcdata.h"
-												runtime: use new #include "textflag.h"

I did this just to clean things up, but it will be important
when we drop the pkg directory later.

LGTM=bradfitz
R=r, bradfitz
CC=golang-codereviews
https://golang.org/cl/132600043

											
										
										
											2014-09-04 21:05:18 -06:00
+								#include "textflag.h"
-												runtime: convert .s textflags from numbers to symbolic constants.
Remove NOPROF/DUPOK from everything.

Edits done with a script, except pclinetest.asm which depended
on the DUPOK flag on main().

R=golang-dev, bradfitz
CC=golang-dev
https://golang.org/cl/12613044

											
										
										
											2013-08-07 13:20:05 -06:00
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+								// The following thunks allow calling the gcc-compiled race runtime directly
 								// from Go code without going all the way through cgo.
 								// First, it's much faster (up to 50% speedup for real Go programs).
 								// Second, it eliminates race-related special cases from cgocall and scheduler.
 								// Third, in long-term it will allow to remove cyclic runtime/race dependency on cmd/go.
 								// A brief recap of the amd64 calling convention.
 								// Arguments are passed in DI, SI, DX, CX, R8, R9, the rest is on stack.
 								// Callee-saved registers are: BX, BP, R12-R15.
 								// SP must be 16-byte aligned.
 								// On Windows:
 								// Arguments are passed in CX, DX, R8, R9, the rest is on stack.
 								// Callee-saved registers are: BX, BP, DI, SI, R12-R15.
 								// SP must be 16-byte aligned. Windows also requires "stack-backing" for the 4 register arguments:
 								// http://msdn.microsoft.com/en-us/library/ms235286.aspx
 								// We do not do this, because it seems to be intended for vararg/unprototyped functions.
 								// Gcc-compiled race runtime does not try to use that space.
 								#ifdef GOOS_windows
 								#define RARG0 CX
 								#define RARG1 DX
 								#define RARG2 R8
 								#define RARG3 R9
 								#else
 								#define RARG0 DI
 								#define RARG1 SI
 								#define RARG2 DX
 								#define RARG3 CX
 								#endif
 								// func runtime·raceread(addr uintptr)
 								// Called from instrumented code.
 								TEXT	runtime·raceread(SB), NOSPLIT, $0-8
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	(SP), RARG2
 									// void __tsan_read(ThreadState *thr, void *addr, void *pc);
 									MOVQ	$__tsan_read(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·RaceRead(addr uintptr)
 								TEXT	runtime·RaceRead(SB), NOSPLIT, $0-8
 									// This needs to be a tail call, because raceread reads caller pc.
 									JMP	runtime·raceread(SB)
 								// void runtime·racereadpc(void *addr, void *callpc, void *pc)
 								TEXT	runtime·racereadpc(SB), NOSPLIT, $0-24
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	callpc+8(FP), RARG2
 									MOVQ	pc+16(FP), RARG3
 									// void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
 									MOVQ	$__tsan_read_pc(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·racewrite(addr uintptr)
 								// Called from instrumented code.
 								TEXT	runtime·racewrite(SB), NOSPLIT, $0-8
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	(SP), RARG2
 									// void __tsan_write(ThreadState *thr, void *addr, void *pc);
 									MOVQ	$__tsan_write(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·RaceWrite(addr uintptr)
 								TEXT	runtime·RaceWrite(SB), NOSPLIT, $0-8
 									// This needs to be a tail call, because racewrite reads caller pc.
 									JMP	runtime·racewrite(SB)
 								// void runtime·racewritepc(void *addr, void *callpc, void *pc)
 								TEXT	runtime·racewritepc(SB), NOSPLIT, $0-24
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	callpc+8(FP), RARG2
-												runtime/race: better handling of atomic operations
This change fixes the last known false negative of the race detector --
detection of races between mutating atomic operations and non-atomic operations.
Race runtime already has functions for precise modelling of various atomic operations,
so this change just forwards all atomic ops to race runtime
instead of poor man modeling in sync/atomic package.
Performance is also improved -- full sync/atomic tests run in 60s instead of 85s now.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/111310044

											
										
										
											2014-09-01 06:04:33 -06:00
+									MOVQ	pc+16(FP), RARG3
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									// void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
 									MOVQ	$__tsan_write_pc(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·racereadrange(addr, size uintptr)
 								// Called from instrumented code.
 								TEXT	runtime·racereadrange(SB), NOSPLIT, $0-16
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	size+8(FP), RARG2
 									MOVQ	(SP), RARG3
 									// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
 									MOVQ	$__tsan_read_range(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·RaceReadRange(addr, size uintptr)
 								TEXT	runtime·RaceReadRange(SB), NOSPLIT, $0-16
 									// This needs to be a tail call, because racereadrange reads caller pc.
 									JMP	runtime·racereadrange(SB)
 								// void runtime·racereadrangepc1(void *addr, uintptr sz, void *pc)
 								TEXT	runtime·racereadrangepc1(SB), NOSPLIT, $0-24
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	size+8(FP), RARG2
 									MOVQ	pc+16(FP), RARG3
 									// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
 									MOVQ	$__tsan_read_range(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·racewriterange(addr, size uintptr)
 								// Called from instrumented code.
 								TEXT	runtime·racewriterange(SB), NOSPLIT, $0-16
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	size+8(FP), RARG2
 									MOVQ	(SP), RARG3
 									// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
 									MOVQ	$__tsan_write_range(SB), AX
 									JMP	racecalladdr<>(SB)
 								// func runtime·RaceWriteRange(addr, size uintptr)
 								TEXT	runtime·RaceWriteRange(SB), NOSPLIT, $0-16
 									// This needs to be a tail call, because racewriterange reads caller pc.
 									JMP	runtime·racewriterange(SB)
 								// void runtime·racewriterangepc1(void *addr, uintptr sz, void *pc)
 								TEXT	runtime·racewriterangepc1(SB), NOSPLIT, $0-24
 									MOVQ	addr+0(FP), RARG1
 									MOVQ	size+8(FP), RARG2
 									MOVQ	pc+16(FP), RARG3
 									// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
 									MOVQ	$__tsan_write_range(SB), AX
 									JMP	racecalladdr<>(SB)
 								// If addr (RARG1) is out of range, do nothing.
 								// Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
 								TEXT	racecalladdr<>(SB), NOSPLIT, $0-0
 									get_tls(R12)
 									MOVQ	g(R12), R14
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that was not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's tricker than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_start/end around the atomic operation.
This forces race runtime to skip all other processing
except than doing the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									CMPQ	RARG1, runtime·racearenastart(SB)
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+									JB	data
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									CMPQ	RARG1, runtime·racearenaend(SB)
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+									JB	call
 								data:
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that was not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's tricker than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_start/end around the atomic operation.
This forces race runtime to skip all other processing
except than doing the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									CMPQ	RARG1, runtime·racedatastart(SB)
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+									JB	ret
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that was not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's tricker than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_start/end around the atomic operation.
This forces race runtime to skip all other processing
except than doing the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									CMPQ	RARG1, runtime·racedataend(SB)
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+									JAE	ret
 								call:
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									MOVQ	AX, AX		// w/o this 6a miscompiles this function
 									JMP	racecall<>(SB)
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+								ret:
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									RET
-												runtime: fix racefuncenter argument corruption.

Revision 6a88e1893941 corrupts the argument to
racefuncenter by pushing the data block pointer
to the stack.

Fixes #4885.

R=dvyukov, rsc
CC=golang-dev
https://golang.org/cl/7381053

											
										
										
											2013-02-27 23:32:29 -07:00
+								// func runtime·racefuncenter(pc uintptr)
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+								// Called from instrumented code.
 								// Entry thunk for __tsan_func_enter: sets up the C arguments
 								// (goroutine race context, caller pc) and the target function in AX.
 								TEXT	runtime·racefuncenter(SB), NOSPLIT, $0-8
 									MOVQ	DX, R15		// save function entry context (for closures)
 									get_tls(R12)
 									MOVQ	g(R12), R14	// R14 = current g
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
 									MOVQ	callpc+0(FP), RARG1	// second C argument: the pc passed in by the caller
 									// void __tsan_func_enter(ThreadState *thr, void *pc);
 									MOVQ	$__tsan_func_enter(SB), AX	// AX = C function to invoke
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that were not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's trickier than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_begin/end around the atomic operation.
This forces race runtime to skip all other processing
except for the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									// racecall<> preserves R15
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									CALL	racecall<>(SB)
 									MOVQ	R15, DX	// restore function entry context
 									RET
 								// func runtime·racefuncexit()
 								// Called from instrumented code.
 								// Loads the current goroutine's race context as the single C argument
 								// and tail-jumps to racecall<> with __tsan_func_exit as the target.
 								TEXT	runtime·racefuncexit(SB), NOSPLIT, $0-0
 									get_tls(R12)
 									MOVQ	g(R12), R14	// R14 = current g
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
 									// void __tsan_func_exit(ThreadState *thr);
 									MOVQ	$__tsan_func_exit(SB), AX	// AX = C function to invoke
 									JMP	racecall<>(SB)	// tail call: racecall<> returns to our caller
-												runtime/race: better handling of atomic operations
This change fixes the last known false negative of the race detector --
detection of races between mutating atomic operations and non-atomic operations.
Race runtime already has functions for precise modelling of various atomic operations,
so this change just forwards all atomic ops to race runtime
instead of poor man modeling in sync/atomic package.
Performance is also improved -- full sync/atomic tests run in 60s instead of 85s now.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/111310044

											
										
										
											2014-09-01 06:04:33 -06:00
+								// Atomic operations for sync/atomic package.
 								// Load
 								// The 32- and 64-bit entry points place the matching __tsan_go_atomic*_load
 								// function in AX and call racecallatomic<>, which expects the target in AX.
 								// The remaining variants jump to one of those two entry points.
 								TEXT	sync∕atomic·LoadInt32(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic32_load(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								TEXT	sync∕atomic·LoadInt64(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic64_load(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								// Unsigned 32-bit load shares the Int32 implementation.
 								TEXT	sync∕atomic·LoadUint32(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·LoadInt32(SB)
 								// Unsigned 64-bit load shares the Int64 implementation.
 								TEXT	sync∕atomic·LoadUint64(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·LoadInt64(SB)
 								// Uintptr load is routed to the 64-bit implementation.
 								TEXT	sync∕atomic·LoadUintptr(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·LoadInt64(SB)
-												runtime: fix build for race detector

This CL only fixes the build, there are two failing tests:
RaceMapBigValAccess1 and RaceMapBigValAccess2
in runtime/race tests. I haven't investigated why yet.

Updates #9516.

Change-Id: If5bd2f0bee1ee45b1977990ab71e2917aada505f
Reviewed-on: https://go-review.googlesource.com/2401
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

											
										
										
											2015-01-06 18:40:16 -07:00
+								TEXT	sync∕atomic·LoadPointer(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·LoadInt64(SB)
-												runtime/race: better handling of atomic operations
This change fixes the last known false negative of the race detector --
detection of races between mutating atomic operations and non-atomic operations.
Race runtime already has functions for precise modelling of various atomic operations,
so this change just forwards all atomic ops to race runtime
instead of poor man modeling in sync/atomic package.
Performance is also improved -- full sync/atomic tests run in 60s instead of 85s now.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/111310044

											
										
										
											2014-09-01 06:04:33 -06:00
+								// Store
 								// The 32- and 64-bit entry points place the matching __tsan_go_atomic*_store
 								// function in AX and call racecallatomic<>; the other variants jump to them.
 								TEXT	sync∕atomic·StoreInt32(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic32_store(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								TEXT	sync∕atomic·StoreInt64(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic64_store(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								// Unsigned 32-bit store shares the Int32 implementation.
 								TEXT	sync∕atomic·StoreUint32(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·StoreInt32(SB)
 								// Unsigned 64-bit store shares the Int64 implementation.
 								TEXT	sync∕atomic·StoreUint64(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·StoreInt64(SB)
 								// Uintptr store is routed to the 64-bit implementation.
 								TEXT	sync∕atomic·StoreUintptr(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·StoreInt64(SB)
 								// Swap
 								// The 32- and 64-bit entry points place the matching __tsan_go_atomic*_exchange
 								// function in AX and call racecallatomic<>; the other variants jump to them.
 								TEXT	sync∕atomic·SwapInt32(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic32_exchange(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								TEXT	sync∕atomic·SwapInt64(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic64_exchange(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								// Unsigned 32-bit swap shares the Int32 implementation.
 								TEXT	sync∕atomic·SwapUint32(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·SwapInt32(SB)
 								// Unsigned 64-bit swap shares the Int64 implementation.
 								TEXT	sync∕atomic·SwapUint64(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·SwapInt64(SB)
 								// Uintptr swap is routed to the 64-bit implementation.
 								TEXT	sync∕atomic·SwapUintptr(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·SwapInt64(SB)
 								// Add
 								// The 32- and 64-bit entry points call the __tsan_go_atomic*_fetch_add
 								// function through racecallatomic<> and then add the `add` argument to the
 								// `ret` slot, turning the fetch_add result into an add_fetch result.
 								// The other variants jump to those two entry points.
 								TEXT	sync∕atomic·AddInt32(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic32_fetch_add(SB), AX
 									CALL	racecallatomic<>(SB)
 									MOVL	add+8(FP), AX	// convert fetch_add to add_fetch
 									ADDL	AX, ret+16(FP)	// ret += add (32-bit)
 									RET
 								TEXT	sync∕atomic·AddInt64(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic64_fetch_add(SB), AX
 									CALL	racecallatomic<>(SB)
 									MOVQ	add+8(FP), AX	// convert fetch_add to add_fetch
 									ADDQ	AX, ret+16(FP)	// ret += add (64-bit)
 									RET
 								// Unsigned 32-bit add shares the Int32 implementation.
 								TEXT	sync∕atomic·AddUint32(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·AddInt32(SB)
 								// Unsigned 64-bit add shares the Int64 implementation.
 								TEXT	sync∕atomic·AddUint64(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·AddInt64(SB)
 								// Uintptr add is routed to the 64-bit implementation.
 								TEXT	sync∕atomic·AddUintptr(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·AddInt64(SB)
 								// CompareAndSwap
 								// The 32- and 64-bit entry points place the matching
 								// __tsan_go_atomic*_compare_exchange function in AX and call
 								// racecallatomic<>; the other variants jump to them.
 								TEXT	sync∕atomic·CompareAndSwapInt32(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic32_compare_exchange(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								TEXT	sync∕atomic·CompareAndSwapInt64(SB), NOSPLIT, $0-0
 									MOVQ	$__tsan_go_atomic64_compare_exchange(SB), AX
 									CALL	racecallatomic<>(SB)
 									RET
 								// Unsigned 32-bit CAS shares the Int32 implementation.
 								TEXT	sync∕atomic·CompareAndSwapUint32(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·CompareAndSwapInt32(SB)
 								// Unsigned 64-bit CAS shares the Int64 implementation.
 								TEXT	sync∕atomic·CompareAndSwapUint64(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·CompareAndSwapInt64(SB)
 								// Uintptr CAS is routed to the 64-bit implementation.
 								TEXT	sync∕atomic·CompareAndSwapUintptr(SB), NOSPLIT, $0-0
 									JMP	sync∕atomic·CompareAndSwapInt64(SB)
 								// Generic atomic operation implementation.
 								// AX already contains target function.
 								TEXT	racecallatomic<>(SB), NOSPLIT, $0-0
 									// Trigger SIGSEGV early.
 									MOVQ	16(SP), R12
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that were not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's trickier than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_begin/end around the atomic operation.
This forces race runtime to skip all other processing
except for the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									MOVL	(R12), R13
 									// Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend).
 									CMPQ	R12, runtime·racearenastart(SB)
 									JB	racecallatomic_data
 									CMPQ	R12, runtime·racearenaend(SB)
 									JB	racecallatomic_ok
 								racecallatomic_data:
 									CMPQ	R12, runtime·racedatastart(SB)
 									JB	racecallatomic_ignore
 									CMPQ	R12, runtime·racedataend(SB)
 									JAE	racecallatomic_ignore
 								racecallatomic_ok:
 									// Addr is within the good range, call the atomic function.
-												runtime/race: better handling of atomic operations
This change fixes the last known false negative of the race detector --
detection of races between mutating atomic operations and non-atomic operations.
Race runtime already has functions for precise modelling of various atomic operations,
so this change just forwards all atomic ops to race runtime
instead of poor man modeling in sync/atomic package.
Performance is also improved -- full sync/atomic tests run in 60s instead of 85s now.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/111310044

											
										
										
											2014-09-01 06:04:33 -06:00
+									get_tls(R12)
 									MOVQ	g(R12), R14
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
 									MOVQ	8(SP), RARG1	// caller pc
 									MOVQ	(SP), RARG2	// pc
 									LEAQ	16(SP), RARG3	// arguments
-												runtime: fix atomic operations on non-heap addresses
Race detector runtime does not tolerate operations on addresses
that were not previously declared with __tsan_map_shadow
(namely, data, bss and heap). The corresponding address
checks for atomic operations were removed in
https://golang.org/cl/111310044
Restore these checks.
It's trickier than just not calling into race runtime,
because it is the race runtime that makes the atomic
operations themselves (if we do not call into race runtime
we skip the atomic operation itself as well). So instead we call
__tsan_go_ignore_sync_begin/end around the atomic operation.
This forces race runtime to skip all other processing
except for the atomic operation itself.
Fixes #9136.

LGTM=rsc
R=rsc
CC=golang-codereviews
https://golang.org/cl/179030043

											
										
										
											2014-11-20 07:51:02 -07:00
+									JMP	racecall<>(SB)	// does not return
 								racecallatomic_ignore:
 									// Addr is outside the good range.
 									// Call __tsan_go_ignore_sync_begin to ignore synchronization during the atomic op.
 									// An attempt to synchronize on the address would cause crash.
 									MOVQ	AX, R15	// remember the original function
 									MOVQ	$__tsan_go_ignore_sync_begin(SB), AX
 									// On this path R12 still holds the operand address that was loaded
 									// from 16(SP) for the range check, not the TLS base. Reload the TLS
 									// base before reading g through it; otherwise g is fetched from a
 									// bogus location.
 									get_tls(R12)
 									MOVQ	g(R12), R14
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
 									CALL	racecall<>(SB)
 									MOVQ	R15, AX	// restore the original function
 									// Call the atomic function.
 									// R14 (g) and R15 are callee-saved for the C code, so g is still valid here.
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
 									MOVQ	8(SP), RARG1	// caller pc
 									MOVQ	(SP), RARG2	// pc
 									LEAQ	16(SP), RARG3	// arguments
 									CALL	racecall<>(SB)
 									// Call __tsan_go_ignore_sync_end.
 									MOVQ	$__tsan_go_ignore_sync_end(SB), AX
 									MOVQ	g_racectx(R14), RARG0	// goroutine context
-												runtime/race: better handling of atomic operations
This change fixes the last known false negative of the race detector --
detection of races between mutating atomic operations and non-atomic operations.
Race runtime already has functions for precise modelling of various atomic operations,
so this change just forwards all atomic ops to race runtime
instead of poor man modeling in sync/atomic package.
Performance is also improved -- full sync/atomic tests run in 60s instead of 85s now.

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rsc
https://golang.org/cl/111310044

											
										
										
											2014-09-01 06:04:33 -06:00
+									JMP	racecall<>(SB)
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+								// void runtime·racecall(void(*f)(...), ...)
 								// Calls C function f from race runtime and passes up to 4 arguments to it.
 								// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments.
 								// fn is loaded into AX and the four argument words into RARG0-RARG3,
 								// then control is transferred to racecall<> with a tail jump.
 								TEXT	runtime·racecall(SB), NOSPLIT, $0-0
 									MOVQ	fn+0(FP), AX	// AX = C function to invoke
 									MOVQ	arg0+8(FP), RARG0
 									MOVQ	arg1+16(FP), RARG1
 									MOVQ	arg2+24(FP), RARG2
 									MOVQ	arg3+32(FP), RARG3
 									JMP	racecall<>(SB)	// tail call: racecall<> returns to our caller
 								// Switches SP to g0 stack and calls (AX). Arguments already set.
 								TEXT	racecall<>(SB), NOSPLIT, $0-0
 									get_tls(R12)
 									MOVQ	g(R12), R14
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previously reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									MOVQ	g_m(R14), R13
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									// Switch to g0 stack.
 									MOVQ	SP, R12		// callee-saved, preserved across the CALL
 									MOVQ	m_g0(R13), R10
 									CMPQ	R10, R14
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+									JE	call	// already on g0
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									MOVQ	(g_sched+gobuf_sp)(R10), SP
-												[dev.power64] cmd/5a, cmd/6a, cmd/8a, cmd/9a: make labels function-scoped

I removed support for jumping between functions years ago,
as part of doing the instruction layout for each function separately.

Given that, it makes sense to treat labels as function-scoped.
This lets each function have its own 'loop' label, for example.

Makes the assembly much cleaner and removes the last
reason anyone would reach for the 123(PC) form instead.

Note that this is on the dev.power64 branch, but it changes all
the assemblers. The change will ship in Go 1.5 (perhaps after
being ported into the new assembler).

Came up as part of CL 167730043.

LGTM=r
R=r
CC=austin, dave, golang-codereviews, minux
https://golang.org/cl/159670043

											
										
										
											2014-10-28 19:50:16 -06:00
+								call:
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									ANDQ	$~15, SP	// alignment for gcc ABI
 									CALL	AX
 									MOVQ	R12, SP
 									RET
 								// C->Go callback thunk that allows to call runtime·racesymbolize from C code.
 								// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g.
 								// The overall effect of Go->C->Go call chain is similar to that of mcall.
 								TEXT	runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
 									// Save callee-saved registers (Go code won't respect that).
 									// This is superset of darwin/linux/windows registers.
 									PUSHQ	BX
 									PUSHQ	BP
 									PUSHQ	DI
 									PUSHQ	SI
 									PUSHQ	R12
 									PUSHQ	R13
 									PUSHQ	R14
 									PUSHQ	R15
 									// Set g = g0.
 									get_tls(R12)
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previously reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									MOVQ	g(R12), R13
 									MOVQ	g_m(R13), R13
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									MOVQ	m_g0(R13), R14
 									MOVQ	R14, g(R12)	// g = m->g0
-												runtime: fix stack corruption in race mode

MOVQ RARG0, 0(SP) smashes exactly what was saved by PUSHQ R15.
This code managed to work somehow with the current race runtime,
but corrupts caller arguments with new race runtime that I am testing.

Change-Id: I9ffe8b5eee86451db36e99dbf4d11f320192e576
Reviewed-on: https://go-review.googlesource.com/4810
Reviewed-by: Keith Randall <khr@golang.org>

											
										
										
											2015-02-13 07:14:48 -07:00
+									PUSHQ	RARG0	// func arg
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract caller pc, verify that the address is in heap or global and switch to g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In long-term it will allow to remove cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									CALL	runtime·racesymbolize(SB)
-												runtime: fix stack corruption in race mode

MOVQ RARG0, 0(SP) smashes exactly what was saved by PUSHQ R15.
This code managed to work somehow with the current race runtime,
but corrupts caller arguments with the new race runtime that I am testing.

Change-Id: I9ffe8b5eee86451db36e99dbf4d11f320192e576
Reviewed-on: https://go-review.googlesource.com/4810
Reviewed-by: Keith Randall <khr@golang.org>

											
										
										
											2015-02-13 07:14:48 -07:00
+									POPQ	R12
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract the caller pc, verify that the address is in the heap or in globals, and switch to the g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In the long term it will allow removing the cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									// All registers are smashed after Go code, reload.
 									get_tls(R12)
-												all: remove 'extern register M *m' from runtime

The runtime has historically held two dedicated values g (current goroutine)
and m (current thread) in 'extern register' slots (TLS on x86, real registers
backed by TLS on ARM).

This CL removes the extern register m; code now uses g->m.

On ARM, this frees up the register that formerly held m (R9).
This is important for NaCl, because NaCl ARM code cannot use R9 at all.

The Go 1 macrobenchmarks (those with per-op times >= 10 µs) are unaffected:

BenchmarkBinaryTree17              5491374955     5471024381     -0.37%
BenchmarkFannkuch11                4357101311     4275174828     -1.88%
BenchmarkGobDecode                 11029957       11364184       +3.03%
BenchmarkGobEncode                 6852205        6784822        -0.98%
BenchmarkGzip                      650795967      650152275      -0.10%
BenchmarkGunzip                    140962363      141041670      +0.06%
BenchmarkHTTPClientServer          71581          73081          +2.10%
BenchmarkJSONEncode                31928079       31913356       -0.05%
BenchmarkJSONDecode                117470065      113689916      -3.22%
BenchmarkMandelbrot200             6008923        5998712        -0.17%
BenchmarkGoParse                   6310917        6327487        +0.26%
BenchmarkRegexpMatchMedium_1K      114568         114763         +0.17%
BenchmarkRegexpMatchHard_1K        168977         169244         +0.16%
BenchmarkRevcomp                   935294971      914060918      -2.27%
BenchmarkTemplate                  145917123      148186096      +1.55%

Minux previously reported larger variations, but these were caused by
run-to-run noise, not repeatable slowdowns.

Actual code changes by Minux.
I only did the docs and the benchmarking.

LGTM=dvyukov, iant, minux
R=minux, josharian, iant, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/109050043

											
										
										
											2014-06-26 09:54:39 -06:00
+									MOVQ	g(R12), R13
 									MOVQ	g_m(R13), R13
-												runtime: use custom thunks for race calls instead of cgo
Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract the caller pc, verify that the address is in the heap or in globals, and switch to the g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In the long term it will allow removing the cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044

											
										
										
											2014-03-06 12:48:30 -07:00
+									MOVQ	m_curg(R13), R14
 									MOVQ	R14, g(R12)	// g = m->curg
 									// Restore callee-saved registers.
 									POPQ	R15
 									POPQ	R14
 									POPQ	R13
 									POPQ	R12
 									POPQ	SI
 									POPQ	DI
 									POPQ	BP
 									POPQ	BX
-												runtime: preserve DX during racefuncenter

R=golang-dev, dvyukov
CC=golang-dev
https://golang.org/cl/7382049

											
										
										
											2013-02-22 11:06:43 -07:00
+									RET