From 0a17b2c174dcbe0f30a19be2ca5517ed0660f706 Mon Sep 17 00:00:00 2001 From: Joel Sing Date: Sat, 7 Jan 2023 16:26:15 +1100 Subject: [PATCH] cmd/internal/obj/arm64: load large constants into vector registers from rodata Load large constants into vector registers from rodata, instead of placing them in the literal pool. This treats VMOVQ/VMOVD/VMOVS the same as FMOVD/FMOVS and makes use of the existing mechanism for storing values in rodata. Two additional instructions are required for a load, however these instructions are used infrequently and already have a high latency. Updates #59615 Change-Id: I54226730267689963d73321e548733ae2d66740e Reviewed-on: https://go-review.googlesource.com/c/go/+/515617 Reviewed-by: Eric Fang Reviewed-by: Carlos Amedee Run-TryBot: Joel Sing TryBot-Result: Gopher Robot Reviewed-by: Cherry Mui --- src/cmd/internal/obj/arm64/asm7.go | 53 +++++------------------------- src/cmd/internal/obj/arm64/obj7.go | 29 ++++++++++++++-- src/cmd/internal/obj/sym.go | 31 +++++++++++++++++ 3 files changed, 66 insertions(+), 47 deletions(-) diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 2b8c2180f50..2e5d84f647e 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -282,7 +282,6 @@ func MOVCONST(d int64, s int, rt int) uint32 { const ( // Optab.flag LFROM = 1 << iota // p.From uses constant pool - LFROM128 // p.From3<<64+p.From forms a 128-bit constant in literal pool LTO // p.To uses constant pool NOTUSETMP // p expands to multiple instructions, but does NOT use REGTMP BRANCH14BITS // branch instruction encodes 14 bits @@ -423,10 +422,10 @@ var optab = []Optab{ /* load long effective stack address (load int32 offset and add) */ {AMOVD, C_LACON, C_NONE, C_NONE, C_RSP, C_NONE, 34, 8, REGSP, LFROM, 0}, - // Move a large constant to a vector register. - {AVMOVQ, C_VCON, C_NONE, C_VCON, C_VREG, C_NONE, 101, 4, 0, LFROM128, 0}, - {AVMOVD, C_VCON, C_NONE, C_NONE, C_VREG, C_NONE, 101, 4, 0, LFROM, 0}, - {AVMOVS, C_LCON, C_NONE, C_NONE, C_VREG, C_NONE, 101, 4, 0, LFROM, 0}, + // Load a large constant into a vector register. + {AVMOVS, C_ADDR, C_NONE, C_NONE, C_VREG, C_NONE, 65, 12, 0, 0, 0}, + {AVMOVD, C_ADDR, C_NONE, C_NONE, C_VREG, C_NONE, 65, 12, 0, 0, 0}, + {AVMOVQ, C_ADDR, C_NONE, C_NONE, C_VREG, C_NONE, 65, 12, 0, 0, 0}, /* jump operations */ {AB, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, @@ -1117,9 +1116,6 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) { if o.flag&LFROM != 0 { c.addpool(p, &p.From) } - if o.flag&LFROM128 != 0 { - c.addpool128(p, &p.From, p.GetFrom3()) - } if o.flag<O != 0 { c.addpool(p, &p.To) } @@ -1321,34 +1317,6 @@ func (c *ctxt7) flushpool(p *obj.Prog) { c.pool.start = 0 } -// addpool128 adds a 128-bit constant to literal pool by two consecutive DWORD -// instructions, the 128-bit constant is formed by ah.Offset<<64+al.Offset. -func (c *ctxt7) addpool128(p *obj.Prog, al, ah *obj.Addr) { - q := c.newprog() - q.As = ADWORD - q.To.Type = obj.TYPE_CONST - q.To.Offset = al.Offset // q.Pc is lower than t.Pc, so al.Offset is stored in q. - - t := c.newprog() - t.As = ADWORD - t.To.Type = obj.TYPE_CONST - t.To.Offset = ah.Offset - - q.Link = t - - if c.blitrl == nil { - c.blitrl = q - c.pool.start = uint32(p.Pc) - } else { - c.elitrl.Link = q - } - - c.elitrl = t - c.pool.size = roundUp(c.pool.size, 16) - c.pool.size += 16 - p.Pool = q -} - /* * MOVD foo(SB), R is actually * MOVD addr, REGTMP @@ -1365,8 +1333,8 @@ func (c *ctxt7) addpool(p *obj.Prog, a *obj.Addr) { sz := 4 if a.Type == obj.TYPE_CONST { - if (lit != int64(int32(lit)) && uint64(lit) != uint64(uint32(lit))) || p.As == AVMOVQ || p.As == AVMOVD { - // out of range -0x80000000 ~ 0xffffffff or VMOVQ or VMOVD operand, must store 64-bit. + if lit != int64(int32(lit)) && uint64(lit) != uint64(uint32(lit)) { + // out of range -0x80000000 ~ 0xffffffff, must store 64-bit. t.As = ADWORD sz = 8 } // else store 32-bit @@ -5660,9 +5628,6 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = q<<30 | 0xe<<24 | len<<13 | op<<12 o1 |= (uint32(rf&31) << 16) | uint32(offset&31)<<5 | uint32(rt&31) - case 101: // VMOVQ $vcon1, $vcon2, Vd or VMOVD|VMOVS $vcon, Vd -> FMOVQ/FMOVD/FMOVS pool(PC), Vd: load from constant pool. - o1 = c.omovlit(p.As, p, &p.From, int(p.To.Reg)) - case 102: /* vushll, vushll2, vuxtl, vuxtl2 */ o1 = c.opirr(p, p.As) rf := p.Reg @@ -7187,13 +7152,13 @@ func (c *ctxt7) opldr(p *obj.Prog, a obj.As) uint32 { case AMOVBU: return LDSTR(0, 0, 1) - case AFMOVS: + case AFMOVS, AVMOVS: return LDSTR(2, 1, 1) - case AFMOVD: + case AFMOVD, AVMOVD: return LDSTR(3, 1, 1) - case AFMOVQ: + case AFMOVQ, AVMOVQ: return LDSTR(0, 1, 3) } diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go index f963f62dcd1..9774e0120bd 100644 --- a/src/cmd/internal/obj/arm64/obj7.go +++ b/src/cmd/internal/obj/arm64/obj7.go @@ -329,8 +329,33 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { break } - // Rewrite float constants to values stored in memory. + // Rewrite float and vector constants to values stored in memory. switch p.As { + case AVMOVS: + if p.From.Type == obj.TYPE_CONST { + p.From.Type = obj.TYPE_MEM + p.From.Sym = c.ctxt.Int32Sym(p.From.Offset) + p.From.Name = obj.NAME_EXTERN + p.From.Offset = 0 + } + + case AVMOVD: + if p.From.Type == obj.TYPE_CONST { + p.From.Type = obj.TYPE_MEM + p.From.Sym = c.ctxt.Int64Sym(p.From.Offset) + p.From.Name = obj.NAME_EXTERN + p.From.Offset = 0 + } + + case AVMOVQ: + if p.From.Type == obj.TYPE_CONST { + p.From.Type = obj.TYPE_MEM + p.From.Sym = c.ctxt.Int128Sym(p.GetFrom3().Offset, p.From.Offset) + p.From.Name = obj.NAME_EXTERN + p.From.Offset = 0 + p.RestArgs = nil + } + case AFMOVS: if p.From.Type == obj.TYPE_FCONST { f64 := p.From.Val.(float64) @@ -365,8 +390,6 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { p.From.Name = obj.NAME_EXTERN p.From.Offset = 0 } - - break } if c.ctxt.Flag_dynlink { diff --git a/src/cmd/internal/obj/sym.go b/src/cmd/internal/obj/sym.go index 2b885f6a100..fd39f896dc1 100644 --- a/src/cmd/internal/obj/sym.go +++ b/src/cmd/internal/obj/sym.go @@ -36,6 +36,7 @@ import ( "cmd/internal/notsha256" "cmd/internal/objabi" "encoding/base64" + "encoding/binary" "fmt" "internal/buildcfg" "log" @@ -162,6 +163,18 @@ func (ctxt *Link) Float64Sym(f float64) *LSym { }) } +func (ctxt *Link) Int32Sym(i int64) *LSym { + name := fmt.Sprintf("$i32.%08x", uint64(i)) + return ctxt.LookupInit(name, func(s *LSym) { + s.Size = 4 + s.WriteInt(ctxt, 0, 4, i) + s.Type = objabi.SRODATA + s.Set(AttrLocal, true) + s.Set(AttrContentAddressable, true) + ctxt.constSyms = append(ctxt.constSyms, s) + }) +} + func (ctxt *Link) Int64Sym(i int64) *LSym { name := fmt.Sprintf("$i64.%016x", uint64(i)) return ctxt.LookupInit(name, func(s *LSym) { @@ -174,6 +187,24 @@ func (ctxt *Link) Int64Sym(i int64) *LSym { }) } +func (ctxt *Link) Int128Sym(hi, lo int64) *LSym { + name := fmt.Sprintf("$i128.%016x%016x", uint64(hi), uint64(lo)) + return ctxt.LookupInit(name, func(s *LSym) { + s.Size = 16 + if ctxt.Arch.ByteOrder == binary.LittleEndian { + s.WriteInt(ctxt, 0, 8, lo) + s.WriteInt(ctxt, 8, 8, hi) + } else { + s.WriteInt(ctxt, 0, 8, hi) + s.WriteInt(ctxt, 8, 8, lo) + } + s.Type = objabi.SRODATA + s.Set(AttrLocal, true) + s.Set(AttrContentAddressable, true) + ctxt.constSyms = append(ctxt.constSyms, s) + }) +} + // GCLocalsSym generates a content-addressable sym containing data. func (ctxt *Link) GCLocalsSym(data []byte) *LSym { sum := notsha256.Sum256(data)