1
0
mirror of https://github.com/golang/go synced 2024-09-29 11:24:28 -06:00

runtime,cmd/internal/obj/x86: use TEB TLS slots on windows/i386

This CL redesign how we get the TLS pointer on windows/i386.
It applies the same changes as done in CL 431775 for windows/amd64.

We were previously reading it from the [TEB] arbitrary data slot,
located at 0x14(FS), which can only hold 1 TLS pointer.

With this CL, we will read the TLS pointer from the TEB TLS slot array,
located at 0xE10(GS). The TLS slot array can hold multiple
TLS pointers, up to 64, so multiple Go runtimes running on the
same thread can coexists with different TLS.

Each new TLS slot has to be allocated via [TlsAlloc],
which returns the slot index. This index can then be used to get the
slot offset from GS with the following formula: 0xE10 + index*4.

The slot index is fixed per Go runtime, so we can store it
in runtime.tls_g and use it latter on to read/update the TLS pointer.

Loading the TLS pointer requires the following asm instructions:

  MOVQ runtime.tls_g, AX
  MOVQ AX(FS), AX

Notice that this approach will now be implemented in all the supported
windows arches.

[TEB]: https://en.wikipedia.org/wiki/Win32_Thread_Information_Block
[TlsAlloc]: https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-tlsalloc

Change-Id: If4550b0d44694ee6480d4093b851f4991a088b32
Reviewed-on: https://go-review.googlesource.com/c/go/+/454675
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Quim Muntal <quimmuntal@gmail.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
qmuntal 2022-12-02 09:25:26 +01:00 committed by Quim Muntal
parent bb5ff5342d
commit 28f8dbd7b9
5 changed files with 84 additions and 57 deletions

View File

@ -2551,22 +2551,6 @@ func prefixof(ctxt *obj.Link, a *obj.Addr) int {
}
}
if ctxt.Arch.Family == sys.I386 {
if a.Index == REG_TLS && ctxt.Flag_shared {
// When building for inclusion into a shared library, an instruction of the form
// MOVL off(CX)(TLS*1), AX
// becomes
// mov %gs:off(%ecx), %eax
// which assumes that the correct TLS offset has been loaded into %ecx (today
// there is only one TLS variable -- g -- so this is OK). When not building for
// a shared library the instruction it becomes
// mov 0x0(%ecx), %eax
// and a R_TLS_LE relocation, and so does not require a prefix.
return 0x65 // GS
}
return 0
}
switch a.Index {
case REG_CS:
return 0x2e
@ -2582,11 +2566,18 @@ func prefixof(ctxt *obj.Link, a *obj.Addr) int {
// When building for inclusion into a shared library, an instruction of the form
// MOV off(CX)(TLS*1), AX
// becomes
// mov %fs:off(%rcx), %rax
// which assumes that the correct TLS offset has been loaded into %rcx (today
// mov %gs:off(%ecx), %eax // on i386
// mov %fs:off(%rcx), %rax // on amd64
// which assumes that the correct TLS offset has been loaded into CX (today
// there is only one TLS variable -- g -- so this is OK). When not building for
// a shared library the instruction does not require a prefix.
return 0x64
// a shared library the instruction it becomes
// mov 0x0(%ecx), %eax // on i386
// mov 0x0(%rcx), %rax // on amd64
// and a R_TLS_LE relocation, and so does not require a prefix.
if ctxt.Arch.Family == sys.I386 {
return 0x65 // GS
}
return 0x64 // FS
}
case REG_FS:
@ -3725,7 +3716,7 @@ func (ab *AsmBuf) asmandsz(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog, a *obj
if REG_AX <= base && base <= REG_R15 {
if a.Index == REG_TLS && !ctxt.Flag_shared && !isAndroid &&
!(ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64) {
ctxt.Headtype != objabi.Hwindows {
rel = obj.Reloc{}
rel.Type = objabi.R_TLS_LE
rel.Siz = 4
@ -5137,19 +5128,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
pp.From.Index = REG_NONE
ab.Put1(0x8B)
ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
case objabi.Hwindows:
// Windows TLS base is always 0x14(FS).
pp.From = p.From
pp.From.Type = obj.TYPE_MEM
pp.From.Reg = REG_FS
pp.From.Offset = 0x14
pp.From.Index = REG_NONE
pp.From.Scale = 0
ab.Put2(0x64, // FS
0x8B)
ab.asmand(ctxt, cursym, p, &pp.From, &p.To)
}
break
}

View File

@ -158,11 +158,11 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
}
}
// Android and Win64 use a tls offset determined at runtime. Rewrite
// Android and Windows use a tls offset determined at runtime. Rewrite
// MOVQ TLS, BX
// to
// MOVQ runtime.tls_g(SB), BX
if (isAndroid || (ctxt.Headtype == objabi.Hwindows && ctxt.Arch.Family == sys.AMD64)) &&
if (isAndroid || ctxt.Headtype == objabi.Hwindows) &&
(p.As == AMOVQ || p.As == AMOVL) && p.From.Type == obj.TYPE_REG && p.From.Reg == REG_TLS && p.To.Type == obj.TYPE_REG && REG_AX <= p.To.Reg && p.To.Reg <= REG_R15 {
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_EXTERN
@ -170,17 +170,23 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
p.From.Sym = ctxt.Lookup("runtime.tls_g")
p.From.Index = REG_NONE
if ctxt.Headtype == objabi.Hwindows {
// Win64 requires an additional indirection
// Windows requires an additional indirection
// to retrieve the TLS pointer,
// as runtime.tls_g contains the TLS offset from GS.
// add
// as runtime.tls_g contains the TLS offset from GS or FS.
// on AMD64 add
// MOVQ 0(BX)(GS*1), BX
// on 386 add
// MOVQ 0(BX)(FS*1), BX4
q := obj.Appendp(p, newprog)
q.As = p.As
q.From = obj.Addr{}
q.From.Type = obj.TYPE_MEM
q.From.Reg = p.To.Reg
q.From.Index = REG_GS
if ctxt.Arch.Family == sys.AMD64 {
q.From.Index = REG_GS
} else {
q.From.Index = REG_FS
}
q.From.Scale = 1
q.From.Offset = 0
q.To = p.To

View File

@ -171,8 +171,12 @@ nocpuinfo:
MOVL $runtime·tls_g(SB), 8(SP) // arg 3: &tls_g
#else
MOVL $0, BX
MOVL BX, 12(SP) // arg 3,4: not used when using platform's TLS
MOVL BX, 8(SP)
MOVL BX, 12(SP) // arg 4: not used when using platform's TLS
#ifdef GOOS_windows
MOVL $runtime·tls_g(SB), 8(SP) // arg 3: &tls_g
#else
MOVL BX, 8(SP) // arg 3: not used when using platform's TLS
#endif
#endif
MOVL $setg_gcc<>(SB), BX
MOVL BX, 4(SP) // arg 2: setg_gcc
@ -795,14 +799,15 @@ havem:
TEXT runtime·setg(SB), NOSPLIT, $0-4
MOVL gg+0(FP), BX
#ifdef GOOS_windows
MOVL runtime·tls_g(SB), CX
CMPL BX, $0
JNE settls
MOVL $0, 0x14(FS)
MOVL $0, 0(CX)(FS)
RET
settls:
MOVL g_m(BX), AX
LEAL m_tls(AX), AX
MOVL AX, 0x14(FS)
MOVL AX, 0(CX)(FS)
#endif
get_tls(CX)
MOVL BX, g(CX)
@ -867,6 +872,9 @@ rdtsc:
JMP done
TEXT ldt0setup<>(SB),NOSPLIT,$16-0
#ifdef GOOS_windows
CALL runtime·wintls(SB)
#endif
// set up ldt 7 to point at m0.tls
// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
// the entry number is just a hint. setldt will set up GS with what it used.
@ -1577,3 +1585,6 @@ TEXT runtime·panicExtendSlice3CU(SB),NOSPLIT,$0-12
DATA runtime·tls_g+0(SB)/4, $8
GLOBL runtime·tls_g+0(SB), NOPTR, $4
#endif
#ifdef GOOS_windows
GLOBL runtime·tls_g+0(SB), NOPTR, $4
#endif

View File

@ -12,10 +12,12 @@
#include "libcgo_windows.h"
static void threadentry(void*);
static DWORD *tls_g;
void
x_cgo_init(G *g)
x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
{
tls_g = (DWORD *)tlsg;
}
@ -39,10 +41,10 @@ threadentry(void *v)
* Set specific keys in thread local storage.
*/
asm volatile (
"movl %0, %%fs:0x14\n" // MOVL tls0, 0x14(FS)
"movl %%fs:0x14, %%eax\n" // MOVL 0x14(FS), tmp
"movl %1, 0(%%eax)\n" // MOVL g, 0(FS)
:: "r"(ts.tls), "r"(ts.g) : "%eax"
"movl %0, %%fs:0(%1)\n" // MOVL tls0, 0(tls_g)(FS)
"movl %%fs:0(%1), %%eax\n" // MOVL 0(tls_g)(FS), tmp
"movl %2, 0(%%eax)\n" // MOVL g, 0(AX)
:: "r"(ts.tls), "r"(*tls_g), "r"(ts.g) : "%eax"
);
crosscall_386(ts.fn);

View File

@ -7,6 +7,9 @@
#include "textflag.h"
#include "time_windows.h"
// Offsets into Thread Environment Block (pointer in FS)
#define TEB_TlsSlots 0xE10
// void runtime·asmstdcall(void *c);
TEXT runtime·asmstdcall(SB),NOSPLIT,$0
MOVL fn+0(FP), BX
@ -222,7 +225,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
RET
// void tstart(M *newm);
TEXT tstart<>(SB),NOSPLIT,$0
TEXT tstart<>(SB),NOSPLIT,$8-4
MOVL newm+0(FP), CX // m
MOVL m_g0(CX), DX // g
@ -236,10 +239,11 @@ TEXT tstart<>(SB),NOSPLIT,$0
MOVL AX, g_stackguard1(DX)
// Set up tls.
LEAL m_tls(CX), SI
MOVL SI, 0x14(FS)
LEAL m_tls(CX), DI
MOVL CX, g_m(DX)
MOVL DX, g(SI)
MOVL DX, g(DI)
MOVL DI, 4(SP)
CALL runtime·setldt(SB) // clobbers CX and DX
// Someday the convention will be D is always cleared.
CLD
@ -266,10 +270,11 @@ TEXT runtime·tstart_stdcall(SB),NOSPLIT,$0
RET
// setldt(int entry, int address, int limit)
TEXT runtime·setldt(SB),NOSPLIT,$0
MOVL base+4(FP), CX
MOVL CX, 0x14(FS)
// setldt(int slot, int base, int size)
TEXT runtime·setldt(SB),NOSPLIT,$0-12
MOVL base+4(FP), DX
MOVL runtime·tls_g(SB), CX
MOVL DX, 0(CX)(FS)
RET
// Runs on OS stack.
@ -356,3 +361,28 @@ loop:
useQPC:
JMP runtime·nanotimeQPC(SB)
RET
// This is called from rt0_go, which runs on the system stack
// using the initial stack allocated by the OS.
TEXT runtime·wintls(SB),NOSPLIT|NOFRAME,$0
// Allocate a TLS slot to hold g across calls to external code
MOVL SP, BP
MOVL runtime·_TlsAlloc(SB), AX
CALL AX
MOVL BP, SP
MOVL AX, CX // TLS index
// Assert that slot is less than 64 so we can use _TEB->TlsSlots
CMPL CX, $64
JB ok
CALL runtime·abort(SB)
ok:
// Convert the TLS index at CX into
// an offset from TEB_TlsSlots.
SHLL $2, CX
// Save offset from TLS into tls_g.
ADDL $TEB_TlsSlots, CX
MOVL CX, runtime·tls_g(SB)
RET