From 29d5272da8ca0ba8291ba8c83ed8bb5c802a1d9e Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Tue, 10 Jan 2023 15:18:04 -0600 Subject: [PATCH] all: generate NOTOC shared code on power10/PPC64/linux An explicit TOC pointer is not needed when building with GOPPC64=power10 on linux/PPC64 as all addressing is PC relative. Apply changes to the compiler, assembler, and linker to remove R2/TOC maintenance in these build configurations. This results in noticeably smaller PIC binaries. For example the size (in bytes) of the k8s binaries before and with this patch: GOFLAGS="-buildmode=pie" \ FORCE_HOST_GO=y \ GOPPC64=power10 \ CGO_CFLAGS="-mcpu=power10 -O2 -g" \ make all apiextensions-apiserver 66060288 64487424 -1572864 -2.38% e2e_node.test 163520856 159850760 -3670096 -2.24% e2e.test 178167304 174890432 -3276872 -1.83% ginkgo 11010048 10747904 -262144 -2.38% go-runner 2162688 2162688 0 0% k8s-tests 170182216 166970880 -3211336 -1.88% kubeadm 52625408 51314688 -1310720 -2.49% kube-aggregator 62849024 61341696 -1507328 -2.39% kube-apiserver 147783680 144375808 -3407872 -2.30% kube-controller-manager 131137536 127991808 -3145728 -2.39% kubectl 53608448 52363264 -1245184 -2.32% kubectl-convert 52625408 51314688 -1310720 -2.49% kubelet 120913920 118095872 -2818048 -2.33% kube-log-runner 1900544 1835008 -65536 -3.44% kubemark 119078912 116326400 -2752512 -2.31% kube-proxy 58392576 56885248 -1507328 -2.58% kube-scheduler 60751872 59244544 -1507328 -2.48% mounter 1835008 1769472 -65536 -3.57% watch-termination 38076416 37158912 -917504 -2.40% And text size changes: apiextensions-apiserver 30243288 28654116 -1589172 -5.25% e2e_node.test 71132064 67520288 -3611776 -5.07% e2e.test 61843984 58635088 -3208896 -5.18% ginkgo 4975916 4769304 -206612 -4.15% go-runner 896532 858400 -38132 -4.25% k8s-tests 60925792 57752032 -3173760 -5.20% kubeadm 24643240 23404100 -1239140 -5.02% kube-aggregator 28688060 27160976 -1527084 -5.32% kube-apiserver 65627332 62259460 -3367872 -5.13% kube-controller-manager 56773844 53706532 -3067312 -5.40% kubectl 24344276 23080640 -1263636 -5.19% kubectl-convert 23733764 22521768 -1211996 -5.10% kubelet 52494580 49720340 -2774240 -5.28% kube-log-runner 787128 753232 -33896 -4.30% kubemark 51576580 48837380 -2739200 -5.31% kube-proxy 26541092 25124080 -1417012 -5.33% kube-scheduler 27448512 25976172 -1472340 -5.36% mounter 744100 712628 -31472 -4.22% watch-termination 18047276 17139912 -907364 -5.02% Change-Id: Ib4872823b06f93861e46a00679b5d4e5e30b538a Reviewed-on: https://go-review.googlesource.com/c/go/+/495416 Reviewed-by: Lynn Boger Reviewed-by: Dmitri Shuralyov Run-TryBot: Paul Murphy TryBot-Result: Gopher Robot Reviewed-by: Heschi Kreinick --- src/cmd/compile/internal/ppc64/ssa.go | 2 +- src/cmd/internal/obj/ppc64/asm9.go | 8 +++++- src/cmd/internal/obj/ppc64/obj9.go | 10 ++++---- src/cmd/link/internal/ld/symtab.go | 19 ++++++++------ src/cmd/link/internal/ppc64/asm.go | 37 ++++++++++++++++----------- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index 3c2a7713064..ae0a4c1e52b 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -1869,7 +1869,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { pp.To.Reg = ppc64.REG_LR pp.SetFrom3Const(1) - if base.Ctxt.Flag_shared { + if ppc64.NeedTOCpointer(base.Ctxt) { // When compiling Go into PIC, the function we just // called via pointer might have been implemented in // a separate module and so overwritten the TOC diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index c993600a73b..4559eed36a8 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -1071,7 +1071,7 @@ func (c *ctxt9) aclass(a *obj.Addr) int { } case obj.TYPE_BRANCH: - if a.Sym != nil && c.ctxt.Flag_dynlink { + if a.Sym != nil && c.ctxt.Flag_dynlink && !pfxEnabled { return C_LBRAPIC } return C_SBRA @@ -1275,6 +1275,12 @@ func opset(a, b0 obj.As) { oprange[a&obj.AMask] = oprange[b0] } +// Determine if the build configuration requires a TOC pointer. +// It is assumed this always called after buildop. +func NeedTOCpointer(ctxt *obj.Link) bool { + return !pfxEnabled && ctxt.Flag_shared +} + // Build the opcode table func buildop(ctxt *obj.Link) { // Limit PC-relative prefix instruction usage to supported and tested targets. diff --git a/src/cmd/internal/obj/ppc64/obj9.go b/src/cmd/internal/obj/ppc64/obj9.go index e10cd56e044..02831b890a3 100644 --- a/src/cmd/internal/obj/ppc64/obj9.go +++ b/src/cmd/internal/obj/ppc64/obj9.go @@ -643,8 +643,8 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { q = p - if c.ctxt.Flag_shared && c.cursym.Name != "runtime.duffzero" && c.cursym.Name != "runtime.duffcopy" { - // When compiling Go into PIC, all functions must start + if NeedTOCpointer(c.ctxt) && c.cursym.Name != "runtime.duffzero" && c.cursym.Name != "runtime.duffcopy" { + // When compiling Go into PIC, without PCrel support, all functions must start // with instructions to load the TOC pointer into r2: // // addis r2, r12, .TOC.-func@ha @@ -763,7 +763,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { break } - if c.ctxt.Flag_shared { + if NeedTOCpointer(c.ctxt) { q = obj.Appendp(q, c.newprog) q.As = AMOVD q.Pos = p.Pos @@ -1289,7 +1289,7 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog { morestacksym = c.ctxt.Lookup("runtime.morestack") } - if c.ctxt.Flag_shared { + if NeedTOCpointer(c.ctxt) { // In PPC64 PIC code, R2 is used as TOC pointer derived from R12 // which is the address of function entry point when entering // the function. We need to preserve R2 across call to morestack. @@ -1352,7 +1352,7 @@ func (c *ctxt9) stacksplit(p *obj.Prog, framesize int32) *obj.Prog { p.To.Sym = morestacksym } - if c.ctxt.Flag_shared { + if NeedTOCpointer(c.ctxt) { // MOVD 8(SP), R2 p = obj.Appendp(p, c.newprog) p.As = AMOVD diff --git a/src/cmd/link/internal/ld/symtab.go b/src/cmd/link/internal/ld/symtab.go index 6faa8819dd4..b039e7d8749 100644 --- a/src/cmd/link/internal/ld/symtab.go +++ b/src/cmd/link/internal/ld/symtab.go @@ -137,14 +137,17 @@ func putelfsym(ctxt *Link, x loader.Sym, typ elf.SymType, curbind elf.SymBind) { // externally linking, I don't think this makes a lot of sense. other = int(elf.STV_HIDDEN) } - if ctxt.IsPPC64() && typ == elf.STT_FUNC && ldr.AttrShared(x) && ldr.SymName(x) != "runtime.duffzero" && ldr.SymName(x) != "runtime.duffcopy" { - // On ppc64 the top three bits of the st_other field indicate how - // many instructions separate the global and local entry points. In - // our case it is two instructions, indicated by the value 3. - // The conditions here match those in preprocess in - // cmd/internal/obj/ppc64/obj9.go, which is where the - // instructions are inserted. - other |= 3 << 5 + if ctxt.IsPPC64() && typ == elf.STT_FUNC && ldr.AttrShared(x) { + // On ppc64 the top three bits of the st_other field indicate how many + // bytes separate the global and local entry points. For non-PCrel shared + // symbols this is always 8 bytes except for some special functions. + hasPCrel := buildcfg.GOPPC64 >= 10 && buildcfg.GOOS == "linux" + + // This should match the preprocessing behavior in cmd/internal/obj/ppc64/obj9.go + // where the distinct global entry is inserted. + if !hasPCrel && ldr.SymName(x) != "runtime.duffzero" && ldr.SymName(x) != "runtime.duffcopy" { + other |= 3 << 5 + } } // When dynamically linking, we create Symbols by reading the names from diff --git a/src/cmd/link/internal/ppc64/asm.go b/src/cmd/link/internal/ppc64/asm.go index 5f1419c08ea..8bdcf5d3eb9 100644 --- a/src/cmd/link/internal/ppc64/asm.go +++ b/src/cmd/link/internal/ppc64/asm.go @@ -221,10 +221,10 @@ func genpltstub(ctxt *ld.Link, ldr *loader.Loader, r loader.Reloc, ri int, s loa // An R_PPC64_REL24_NOTOC relocation does not use or maintain // a TOC pointer, and almost always implies a Power10 target. // - // For dynamic calls made from a Go object, the shared attribute - // indicates a PIC symbol, which requires a TOC pointer be - // maintained. Otherwise, a simpler non-PIC stub suffices. - if (r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_PPC64_REL24)) || (!ldr.AttrExternal(s) && ldr.AttrShared(s)) { + // For dynamic calls made from a Go caller, a TOC relative stub is + // always needed when a TOC pointer is maintained (specifically, if + // the Go caller is PIC, and cannot use PCrel instructions). + if (r.Type() == objabi.ElfRelocOffset+objabi.RelocType(elf.R_PPC64_REL24)) || (!ldr.AttrExternal(s) && ldr.AttrShared(s) && !hasPCrel) { stubTypeStr = "_tocrel" stubType = 1 } else { @@ -318,7 +318,7 @@ func genstubs(ctxt *ld.Link, ldr *loader.Loader) { case sym.STEXT: targ := r.Sym() - if (ldr.AttrExternal(targ) && ldr.SymLocalentry(targ) <= 1) || (!ldr.AttrExternal(targ) && !ldr.AttrShared(targ)) { + if (ldr.AttrExternal(targ) && ldr.SymLocalentry(targ) <= 1) || (!ldr.AttrExternal(targ) && (!ldr.AttrShared(targ) || hasPCrel)) { // This is NOTOC to NOTOC call (st_other is 0 or 1). No call stub is needed. } else { // This is a NOTOC to TOC function. Generate a calling stub. @@ -387,10 +387,12 @@ func genaddmoduledata(ctxt *ld.Link, ldr *loader.Loader) { // runtime.addmoduledata(local.moduledata) // } - // Regenerate TOC from R12 (the address of this function). - sz := initfunc.AddSymRef(ctxt.Arch, ctxt.DotTOC[0], 0, objabi.R_ADDRPOWER_PCREL, 8) - initfunc.SetUint32(ctxt.Arch, sz-8, 0x3c4c0000) // addis r2, r12, .TOC.-func@ha - initfunc.SetUint32(ctxt.Arch, sz-4, 0x38420000) // addi r2, r2, .TOC.-func@l + if !hasPCrel { + // Regenerate TOC from R12 (the address of this function). + sz := initfunc.AddSymRef(ctxt.Arch, ctxt.DotTOC[0], 0, objabi.R_ADDRPOWER_PCREL, 8) + initfunc.SetUint32(ctxt.Arch, sz-8, 0x3c4c0000) // addis r2, r12, .TOC.-func@ha + initfunc.SetUint32(ctxt.Arch, sz-4, 0x38420000) // addi r2, r2, .TOC.-func@l + } // This is Go ABI. Stack a frame and save LR. o(OP_MFLR_R0) // mflr r0 @@ -407,11 +409,11 @@ func genaddmoduledata(ctxt *ld.Link, ldr *loader.Loader) { } if !hasPCrel { - sz = initfunc.AddSymRef(ctxt.Arch, tgt, 0, objabi.R_ADDRPOWER_GOT, 8) + sz := initfunc.AddSymRef(ctxt.Arch, tgt, 0, objabi.R_ADDRPOWER_GOT, 8) initfunc.SetUint32(ctxt.Arch, sz-8, 0x3c620000) // addis r3, r2, local.moduledata@got@ha initfunc.SetUint32(ctxt.Arch, sz-4, 0xe8630000) // ld r3, local.moduledata@got@l(r3) } else { - sz = initfunc.AddSymRef(ctxt.Arch, tgt, 0, objabi.R_ADDRPOWER_GOT_PCREL34, 8) + sz := initfunc.AddSymRef(ctxt.Arch, tgt, 0, objabi.R_ADDRPOWER_GOT_PCREL34, 8) // Note, this is prefixed instruction. It must not cross a 64B boundary. // It is doubleworld aligned here, so it will never cross (this function is 16B aligned, minimum). initfunc.SetUint32(ctxt.Arch, sz-8, OP_PLD_PFX_PCREL) @@ -419,7 +421,7 @@ func genaddmoduledata(ctxt *ld.Link, ldr *loader.Loader) { } // Call runtime.addmoduledata - sz = initfunc.AddSymRef(ctxt.Arch, addmoduledata, 0, objabi.R_CALLPOWER, 4) + sz := initfunc.AddSymRef(ctxt.Arch, addmoduledata, 0, objabi.R_CALLPOWER, 4) initfunc.SetUint32(ctxt.Arch, sz-4, OP_BL) // bl runtime.addmoduledata o(OP_NOP) // nop (for TOC restore) @@ -995,7 +997,12 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, if r.Size != 4 { return false } - out.Write64(uint64(elf.R_PPC64_REL24) | uint64(elfsym)<<32) + if !hasPCrel { + out.Write64(uint64(elf.R_PPC64_REL24) | uint64(elfsym)<<32) + } else { + // TOC is not used in PCrel compiled Go code. + out.Write64(uint64(elf.R_PPC64_REL24_NOTOC) | uint64(elfsym)<<32) + } } out.Write64(uint64(r.Xadd)) @@ -1441,10 +1448,10 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade tgtName := ldr.SymName(rs) - // If we are linking PIE or shared code, all golang generated object files have an extra 2 instruction prologue + // If we are linking PIE or shared code, non-PCrel golang generated object files have an extra 2 instruction prologue // to regenerate the TOC pointer from R12. The exception are two special case functions tested below. Note, // local call offsets for externally generated objects are accounted for when converting into golang relocs. - if !ldr.AttrExternal(rs) && ldr.AttrShared(rs) && tgtName != "runtime.duffzero" && tgtName != "runtime.duffcopy" { + if !hasPCrel && !ldr.AttrExternal(rs) && ldr.AttrShared(rs) && tgtName != "runtime.duffzero" && tgtName != "runtime.duffcopy" { // Furthermore, only apply the offset if the target looks like the start of a function call. if r.Add() == 0 && ldr.SymType(rs) == sym.STEXT { t += 8