From f5de42001df2e61233c7ec7bbbd014bbaeaee242 Mon Sep 17 00:00:00 2001 From: erifan01 Date: Fri, 26 Jan 2018 10:18:50 +0000 Subject: [PATCH] cmd/asm: add arm64 instructions for math optimization Add arm64 HW instructions FMADDD, FMADDS, FMSUBD, FMSUBS, FNMADDD, FNMADDS, FNMSUBD, FNMSUBS, VFMLA, VFMLS, VMOV (element) for math optimization. Add check on register element index and test cases. Change-Id: Ice07c50b1a02d488ad2cde2a4e8aea93f3e3afff Reviewed-on: https://go-review.googlesource.com/90876 Reviewed-by: Cherry Zhang --- src/cmd/asm/internal/arch/arm64.go | 21 ++ src/cmd/asm/internal/asm/testdata/arm64.s | 39 +++- .../asm/internal/asm/testdata/arm64error.s | 56 +++++- src/cmd/internal/obj/arm64/a.out.go | 2 + src/cmd/internal/obj/arm64/anames.go | 2 + src/cmd/internal/obj/arm64/asm7.go | 188 +++++++++++++++--- src/cmd/internal/obj/arm64/doc.go | 54 +++++ 7 files changed, 314 insertions(+), 48 deletions(-) diff --git a/src/cmd/asm/internal/arch/arm64.go b/src/cmd/asm/internal/arch/arm64.go index 524a503472..10458b01a0 100644 --- a/src/cmd/asm/internal/arch/arm64.go +++ b/src/cmd/asm/internal/arch/arm64.go @@ -178,18 +178,39 @@ func ARM64RegisterExtension(a *obj.Addr, ext string, reg, num int16, isAmount, i a.Reg = arm64.REG_SXTX + (reg & 31) + int16(num<<5) a.Offset = int64(((rm & 31) << 16) | (7 << 13) | (uint32(num) << 10)) case "B8": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_8B & 15) << 5) case "B16": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_16B & 15) << 5) case "H4": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_4H & 15) << 5) case "H8": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_8H & 15) << 5) case "S2": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_2S & 15) << 5) case "S4": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_4S & 15) << 5) case "D2": + if isIndex { + return errors.New("invalid register extension") + } a.Reg = arm64.REG_ARNG + (reg & 31) + ((arm64.ARNG_2D & 15) << 5) case "B": if !isIndex { diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s index 18527037b3..f74dc29f77 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64.s +++ b/src/cmd/asm/internal/asm/testdata/arm64.s @@ -68,6 +68,12 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 VADD V1, V3, V3 // 6384e15e VSUB V12, V30, V30 // de87ec7e VSUB V12, V20, V30 // 9e86ec7e + VFMLA V1.D2, V12.D2, V1.D2 // 81cd614e + VFMLA V1.S2, V12.S2, V1.S2 // 81cd210e + VFMLA V1.S4, V12.S4, V1.S4 // 81cd214e + VFMLS V1.D2, V12.D2, V1.D2 // 81cde14e + VFMLS V1.S2, V12.S2, V1.S2 // 81cda10e + VFMLS V1.S4, V12.S4, V1.S4 // 81cda14e // LTYPE1 imsr ',' spreg ',' // { @@ -204,16 +210,20 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 // outcode($1, &$2, NREG, &$4); // } MOVK $1, R1 - VMOV V8.S[1], R1 // 013d0c0e - VMOV V0.D[0], R11 // 0b3c084e - VMOV V0.D[1], R11 // 0b3c184e - VMOV R20, V1.S[0] // 811e044e - VMOV R1, V9.H4 // 290c020e - VMOV R22, V11.D2 // cb0e084e - VMOV V2.B16, V4.B16 // 441ca24e - VMOV V20.S[0], V20 // 9406045e - VREV32 V5.B16, V5.B16 // a508206e - VDUP V19.S[0], V17.S4 // 7106044e + VMOV V8.S[1], R1 // 013d0c0e + VMOV V0.D[0], R11 // 0b3c084e + VMOV V0.D[1], R11 // 0b3c184e + VMOV R20, V1.S[0] // 811e044e + VMOV R1, V9.H4 // 290c020e + VMOV R22, V11.D2 // cb0e084e + VMOV V2.B16, V4.B16 // 441ca24e + VMOV V20.S[0], V20 // 9406045e + VMOV V12.D[0], V12.D[1] // 8c05186e + VMOV V10.S[0], V12.S[1] // 4c050c6e + VMOV V9.H[0], V12.H[1] // 2c05066e + VMOV V8.B[0], V12.B[1] // 0c05036e + VREV32 V5.B16, V5.B16 // a508206e + VDUP V19.S[0], V17.S4 // 7106044e // // B/BL // @@ -367,6 +377,15 @@ again: // } // MADD R1, R2, R3, R4 + FMADDS F1, F3, F2, F4 // 440c011f + FMADDD F4, F5, F4, F4 // 8414441f + FMSUBS F13, F21, F13, F19 // b3d50d1f + FMSUBD F11, F7, F15, F31 // ff9d4b1f + FNMADDS F1, F3, F2, F4 // 440c211f + FNMADDD F1, F3, F2, F4 // 440c611f + FNMSUBS F1, F3, F2, F4 // 448c211f + FNMSUBD F1, F3, F2, F4 // 448c611f + // DMB, HINT // // LDMB imm diff --git a/src/cmd/asm/internal/asm/testdata/arm64error.s b/src/cmd/asm/internal/asm/testdata/arm64error.s index e4fad9c741..b77dabd4e1 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64error.s +++ b/src/cmd/asm/internal/asm/testdata/arm64error.s @@ -3,13 +3,51 @@ // license that can be found in the LICENSE file. TEXT errors(SB),$0 - MOVD.P 300(R2), R3 // ERROR "offset out of range [-255,254]" - MOVD.P R3, 344(R2) // ERROR "offset out of range [-255,254]" - VLD1 (R8)(R13), [V2.B16] // ERROR "illegal combination" - VLD1 8(R9), [V2.B16] // ERROR "illegal combination" - VST1 [V1.B16], (R8)(R13) // ERROR "illegal combination" - VST1 [V1.B16], 9(R2) // ERROR "illegal combination" - VLD1 8(R8)(R13), [V2.B16] // ERROR "illegal combination" - ADD R1.UXTB<<5, R2, R3 // ERROR "shift amount out of range 0 to 4" - ADDS R1.UXTX<<7, R2, R3 // ERROR "shift amount out of range 0 to 4" + MOVD.P 300(R2), R3 // ERROR "offset out of range [-255,254]" + MOVD.P R3, 344(R2) // ERROR "offset out of range [-255,254]" + VLD1 (R8)(R13), [V2.B16] // ERROR "illegal combination" + VLD1 8(R9), [V2.B16] // ERROR "illegal combination" + VST1 [V1.B16], (R8)(R13) // ERROR "illegal combination" + VST1 [V1.B16], 9(R2) // ERROR "illegal combination" + VLD1 8(R8)(R13), [V2.B16] // ERROR "illegal combination" + ADD R1.UXTB<<5, R2, R3 // ERROR "shift amount out of range 0 to 4" + ADDS R1.UXTX<<7, R2, R3 // ERROR "shift amount out of range 0 to 4" + VMOV V8.D[2], V12.D[1] // ERROR "register element index out of range 0 to 1" + VMOV V8.S[4], V12.S[1] // ERROR "register element index out of range 0 to 3" + VMOV V8.H[8], V12.H[1] // ERROR "register element index out of range 0 to 7" + VMOV V8.B[16], V12.B[1] // ERROR "register element index out of range 0 to 15" + VMOV V8.D[0], V12.S[1] // ERROR "operand mismatch" + VMOV V8.D[0], V12.H[1] // ERROR "operand mismatch" + VMOV V8.D[0], V12.B[1] // ERROR "operand mismatch" + VMOV V8.S[0], V12.H[1] // ERROR "operand mismatch" + VMOV V8.S[0], V12.B[1] // ERROR "operand mismatch" + VMOV V8.H[0], V12.B[1] // ERROR "operand mismatch" + VMOV V8.B[16], R3 // ERROR "register element index out of range 0 to 15" + VMOV V8.H[9], R3 // ERROR "register element index out of range 0 to 7" + VMOV V8.S[4], R3 // ERROR "register element index out of range 0 to 3" + VMOV V8.D[2], R3 // ERROR "register element index out of range 0 to 1" + VDUP V8.B[16], R3.B16 // ERROR "register element index out of range 0 to 15" + VDUP V8.B[17], R3.B8 // ERROR "register element index out of range 0 to 15" + VDUP V8.H[9], R3.H4 // ERROR "register element index out of range 0 to 7" + VDUP V8.H[9], R3.H8 // ERROR "register element index out of range 0 to 7" + VDUP V8.S[4], R3.S2 // ERROR "register element index out of range 0 to 3" + VDUP V8.S[4], R3.S4 // ERROR "register element index out of range 0 to 3" + VDUP V8.D[2], R3.D2 // ERROR "register element index out of range 0 to 1" + VFMLA V1.D2, V12.D2, V3.S2 // ERROR "operand mismatch" + VFMLA V1.S2, V12.S2, V3.D2 // ERROR "operand mismatch" + VFMLA V1.S4, V12.S2, V3.D2 // ERROR "operand mismatch" + VFMLA V1.H4, V12.H4, V3.D2 // ERROR "operand mismatch" + VFMLS V1.S2, V12.S2, V3.S4 // ERROR "operand mismatch" + VFMLS V1.S2, V12.D2, V3.S4 // ERROR "operand mismatch" + VFMLS V1.S2, V12.S4, V3.D2 // ERROR "operand mismatch" + VFMLA V1.B8, V12.B8, V3.B8 // ERROR "invalid arrangement" + VFMLA V1.B16, V12.B16, V3.B16 // ERROR "invalid arrangement" + VFMLA V1.H4, V12.H4, V3.H4 // ERROR "invalid arrangement" + VFMLA V1.H8, V12.H8, V3.H8 // ERROR "invalid arrangement" + VFMLA V1.H4, V12.H4, V3.H4 // ERROR "invalid arrangement" + VFMLS V1.B8, V12.B8, V3.B8 // ERROR "invalid arrangement" + VFMLS V1.B16, V12.B16, V3.B16 // ERROR "invalid arrangement" + VFMLS V1.H4, V12.H4, V3.H4 // ERROR "invalid arrangement" + VFMLS V1.H8, V12.H8, V3.H8 // ERROR "invalid arrangement" + VFMLS V1.H4, V12.H4, V3.H4 // ERROR "invalid arrangement" RET diff --git a/src/cmd/internal/obj/arm64/a.out.go b/src/cmd/internal/obj/arm64/a.out.go index 93322c77e1..3bb897c7a8 100644 --- a/src/cmd/internal/obj/arm64/a.out.go +++ b/src/cmd/internal/obj/arm64/a.out.go @@ -766,6 +766,8 @@ const ( AVMOVI AVUADDLV AVSUB + AVFMLA + AVFMLS ALAST AB = obj.AJMP ABL = obj.ACALL diff --git a/src/cmd/internal/obj/arm64/anames.go b/src/cmd/internal/obj/arm64/anames.go index 13dbaae894..c369b66198 100644 --- a/src/cmd/internal/obj/arm64/anames.go +++ b/src/cmd/internal/obj/arm64/anames.go @@ -383,5 +383,7 @@ var Anames = []string{ "VMOVI", "VUADDLV", "VSUB", + "VFMLA", + "VFMLS", "LAST", } diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 423f55f741..9d064806a1 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -146,6 +146,10 @@ func FPOP2S(m uint32, s uint32, type_ uint32, op uint32) uint32 { return m<<31 | s<<29 | 0x1E<<24 | type_<<22 | 1<<21 | op<<12 | 2<<10 } +func FPOP3S(m uint32, s uint32, type_ uint32, op uint32, op2 uint32) uint32 { + return m<<31 | s<<29 | 0x1F<<24 | type_<<22 | op<<21 | op2<<15 +} + func FPCVTI(sf uint32, s uint32, type_ uint32, rmode uint32, op uint32) uint32 { return sf<<31 | s<<29 | 0x1E<<24 | type_<<22 | 1<<21 | rmode<<19 | op<<16 | 0<<10 } @@ -539,6 +543,7 @@ var optab = []Optab{ {AFADDS, C_FREG, C_FREG, C_FREG, 54, 4, 0, 0, 0}, {AFADDS, C_FCON, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFADDS, C_FCON, C_FREG, C_FREG, 54, 4, 0, 0, 0}, + {AFMSUBD, C_FREG, C_FREG, C_FREG, 15, 4, 0, 0, 0}, {AFMOVS, C_FCON, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFMOVS, C_FREG, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFMOVD, C_FCON, C_NONE, C_FREG, 54, 4, 0, 0, 0}, @@ -589,6 +594,7 @@ var optab = []Optab{ {AVLD1, C_ROFF, C_NONE, C_LIST, 81, 4, 0, 0, C_XPOST}, {AVMOV, C_ELEM, C_NONE, C_REG, 73, 4, 0, 0, 0}, {AVMOV, C_REG, C_NONE, C_ARNG, 82, 4, 0, 0, 0}, + {AVMOV, C_ELEM, C_NONE, C_ELEM, 92, 4, 0, 0, 0}, {AVMOV, C_ARNG, C_NONE, C_ARNG, 83, 4, 0, 0, 0}, {AVMOV, C_REG, C_NONE, C_ELEM, 78, 4, 0, 0, 0}, {AVMOV, C_ELEM, C_NONE, C_VREG, 80, 4, 0, 0, 0}, @@ -600,6 +606,7 @@ var optab = []Optab{ {AVADDV, C_ARNG, C_NONE, C_VREG, 85, 4, 0, 0, 0}, {AVCNT, C_ARNG, C_NONE, C_ARNG, 29, 4, 0, 0, 0}, {AVMOVI, C_ADDCON, C_NONE, C_ARNG, 86, 4, 0, 0, 0}, + {AVFMLA, C_ARNG, C_ARNG, C_ARNG, 72, 4, 0, 0, 0}, {obj.AUNDEF, C_NONE, C_NONE, C_NONE, 90, 4, 0, 0, 0}, {obj.APCDATA, C_VCON, C_NONE, C_VCON, 0, 0, 0, 0, 0}, @@ -1987,6 +1994,15 @@ func buildop(ctxt *obj.Link) { oprangeset(AFMINNMS, t) oprangeset(AFDIVD, t) + case AFMSUBD: + oprangeset(AFMSUBS, t) + oprangeset(AFMADDS, t) + oprangeset(AFMADDD, t) + oprangeset(AFNMSUBS, t) + oprangeset(AFNMSUBD, t) + oprangeset(AFNMADDS, t) + oprangeset(AFNMADDD, t) + case AFCVTSD: oprangeset(AFCVTDS, t) oprangeset(AFABSD, t) @@ -2126,6 +2142,9 @@ func buildop(ctxt *obj.Link) { case AVADDV: oprangeset(AVUADDLV, t) + case AVFMLA: + oprangeset(AVFMLS, t) + case ASHA1H, AVCNT, AVMOV, @@ -2189,6 +2208,13 @@ func SYSARG4(op1 int, Cn int, Cm int, op2 int) int { return SYSARG5(0, op1, Cn, Cm, op2) } +/* checkindex checks if index >= 0 && index <= maxindex */ +func (c *ctxt7) checkindex(p *obj.Prog, index, maxindex int) { + if index < 0 || index > maxindex { + c.ctxt.Diag("register element index out of range 0 to %d: %v", maxindex, p) + } +} + func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 := uint32(0) o2 := uint32(0) @@ -2420,7 +2446,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { o1 = 0 } - case 15: /* mul/mneg/umulh/umull r,[r,]r; madd/msub Rm,Ra,Rn,Rd */ + case 15: /* mul/mneg/umulh/umull r,[r,]r; madd/msub/fmadd/fmsub/fnmadd/fnmsub Rm,Ra,Rn,Rd */ o1 = c.oprrr(p, p.As) rf := int(p.From.Reg) @@ -3283,12 +3309,13 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rel.Add = 0 rel.Type = objabi.R_ARM64_GOTPCREL - case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor Vm., Vn., Vd. */ + case 72: /* vaddp/vand/vcmeq/vorr/vadd/veor/vfmla/vfmls Vm., Vn., Vd. */ af := int((p.From.Reg >> 5) & 15) af3 := int((p.Reg >> 5) & 15) at := int((p.To.Reg >> 5) & 15) if af != af3 || af != at { - c.ctxt.Diag("invalid arrangement: %v\n", p) + c.ctxt.Diag("operand mismatch: %v", p) + break } o1 = c.oprrr(p, p.As) rf := int((p.From.Reg) & 31) @@ -3320,16 +3347,25 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { Q = 1 size = 1 default: - c.ctxt.Diag("invalid arrangement: %v\n", p) + c.ctxt.Diag("invalid arrangement: %v", p) } if (p.As == AVORR || p.As == AVAND || p.As == AVEOR) && (af != ARNG_16B && af != ARNG_8B) { - c.ctxt.Diag("invalid arrangement on op %v", p.As) + c.ctxt.Diag("invalid arrangement: %v", p) + } else if (p.As == AVFMLA || p.As == AVFMLS) && + (af != ARNG_2D && af != ARNG_2S && af != ARNG_4S) { + c.ctxt.Diag("invalid arrangement: %v", p) } else if p.As == AVORR { size = 2 } else if p.As == AVAND || p.As == AVEOR { size = 0 + } else if (p.As == AVFMLA || p.As == AVFMLS) { + if af == ARNG_2D { + size = 1 + } else { + size = 0 + } } o1 |= (uint32(Q&1) << 30) | (uint32(size&3) << 22) | (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31) @@ -3339,22 +3375,27 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rt := int(p.To.Reg) imm5 := 0 o1 = 7<<25 | 0xf<<10 + index := int(p.From.Index) switch (p.From.Reg >> 5) & 15 { case ARNG_B: + c.checkindex(p, index, 15) imm5 |= 1 - imm5 |= int(p.From.Index) << 1 + imm5 |= index << 1 case ARNG_H: + c.checkindex(p, index, 7) imm5 |= 2 - imm5 |= int(p.From.Index) << 2 + imm5 |= index << 2 case ARNG_S: + c.checkindex(p, index, 3) imm5 |= 4 - imm5 |= int(p.From.Index) << 3 + imm5 |= index << 3 case ARNG_D: + c.checkindex(p, index, 1) imm5 |= 8 - imm5 |= int(p.From.Index) << 4 + imm5 |= index << 4 o1 |= 1 << 30 default: - c.ctxt.Diag("invalid arrangement on op V.[index], R: %v\n", p) + c.ctxt.Diag("invalid arrangement: %v", p) } o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31) @@ -3471,21 +3512,26 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rt := int(p.To.Reg) imm5 := 0 o1 = 1<<30 | 7<<25 | 7<<10 + index :=int(p.From.Index) switch (p.To.Reg >> 5) & 15 { case ARNG_B: + c.checkindex(p, index, 15) imm5 |= 1 - imm5 |= int(p.From.Index) << 1 + imm5 |= index << 1 case ARNG_H: + c.checkindex(p, index, 7) imm5 |= 2 - imm5 |= int(p.From.Index) << 2 + imm5 |= index << 2 case ARNG_S: + c.checkindex(p, index, 3) imm5 |= 4 - imm5 |= int(p.From.Index) << 3 + imm5 |= index << 3 case ARNG_D: + c.checkindex(p, index, 1) imm5 |= 8 - imm5 |= int(p.From.Index) << 4 + imm5 |= index << 4 default: - c.ctxt.Diag("invalid arrangement on op R, V.[index]: %v\n", p) + c.ctxt.Diag("invalid arrangement: %v", p) } o1 |= (uint32(imm5&0x1f) << 16) | (uint32(rf&31) << 5) | uint32(rt&31) @@ -3493,38 +3539,46 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rf := int(p.From.Reg) rt := int(p.To.Reg) o1 = 7<<25 | 1<<10 - var imm5, Q uint32 + var imm5, Q int + index := int(p.From.Index) switch (p.To.Reg >> 5) & 15 { case ARNG_16B: + c.checkindex(p, index, 15) Q = 1 imm5 = 1 - imm5 |= uint32(p.From.Index) << 1 + imm5 |= index << 1 case ARNG_2D: + c.checkindex(p, index, 1) Q = 1 imm5 = 8 - imm5 |= uint32(p.From.Index) << 4 + imm5 |= index << 4 case ARNG_2S: + c.checkindex(p, index, 3) Q = 0 imm5 = 4 - imm5 |= uint32(p.From.Index) << 3 + imm5 |= index << 3 case ARNG_4H: + c.checkindex(p, index, 7) Q = 0 imm5 = 2 - imm5 |= uint32(p.From.Index) << 2 + imm5 |= index << 2 case ARNG_4S: + c.checkindex(p, index, 3) Q = 1 imm5 = 4 - imm5 |= uint32(p.From.Index) << 3 + imm5 |= index << 3 case ARNG_8B: + c.checkindex(p, index, 15) Q = 0 imm5 = 1 - imm5 |= uint32(p.From.Index) << 1 + imm5 |= index << 1 case ARNG_8H: + c.checkindex(p, index, 7) Q = 1 imm5 = 2 - imm5 |= uint32(p.From.Index) << 2 + imm5 |= index << 2 default: - c.ctxt.Diag("invalid arrangement on VDUP Vn.[index], Vd.: %v\n", p) + c.ctxt.Diag("invalid arrangement: %v", p) } o1 |= (uint32(Q&1) << 30) | (uint32(imm5&0x1f) << 16) o1 |= (uint32(rf&31) << 5) | uint32(rt&31) @@ -3533,24 +3587,29 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { rf := int(p.From.Reg) rt := int(p.To.Reg) imm5 := 0 + index := int(p.From.Index) switch p.As { case AVMOV: o1 = 1<<30 | 15<<25 | 1<<10 switch (p.From.Reg >> 5) & 15 { case ARNG_B: + c.checkindex(p, index, 15) imm5 |= 1 - imm5 |= int(p.From.Index) << 1 + imm5 |= index << 1 case ARNG_H: + c.checkindex(p, index, 7) imm5 |= 2 - imm5 |= int(p.From.Index) << 2 + imm5 |= index << 2 case ARNG_S: + c.checkindex(p, index, 3) imm5 |= 4 - imm5 |= int(p.From.Index) << 3 + imm5 |= index << 3 case ARNG_D: + c.checkindex(p, index, 1) imm5 |= 8 - imm5 |= int(p.From.Index) << 4 + imm5 |= index << 4 default: - c.ctxt.Diag("invalid arrangement on op V.[index], Vn: %v\n", p) + c.ctxt.Diag("invalid arrangement: %v", p) } default: c.ctxt.Diag("unsupported op %v", p.As) @@ -3759,6 +3818,47 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { case 90: o1 = 0xbea71700 + case 92: /* vmov Vn.[index], Vd.[index] */ + rf := int(p.From.Reg) + rt := int(p.To.Reg) + imm4 := 0 + imm5 := 0 + o1 = 3<<29 | 7<<25 | 1<<10 + index1 := int(p.To.Index) + index2 := int(p.From.Index) + if ((p.To.Reg >> 5) & 15) != ((p.From.Reg >> 5) & 15) { + c.ctxt.Diag("operand mismatch: %v", p) + } + switch (p.To.Reg >> 5) & 15 { + case ARNG_B: + c.checkindex(p, index1, 15) + c.checkindex(p, index2, 15) + imm5 |= 1 + imm5 |= index1 << 1 + imm4 |= index2 + case ARNG_H: + c.checkindex(p, index1, 7) + c.checkindex(p, index2, 7) + imm5 |= 2 + imm5 |= index1 << 2 + imm4 |= index2 << 1 + case ARNG_S: + c.checkindex(p, index1, 3) + c.checkindex(p, index2, 3) + imm5 |= 4 + imm5 |= index1 << 3 + imm4 |= index2 << 2 + case ARNG_D: + c.checkindex(p, index1, 1) + c.checkindex(p, index2, 1) + imm5 |= 8 + imm5 |= index1 << 4 + imm4 |= index2 << 3 + default: + c.ctxt.Diag("invalid arrangement: %v", p) + } + o1 |= (uint32(imm5&0x1f) << 16) | (uint32(imm4&0xf) << 16) | (uint32(rf&31) << 5) | uint32(rt&31) + break case 91: /* prfm imm(Rn), */ @@ -4157,6 +4257,30 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 { case AFSUBD: return FPOP2S(0, 0, 1, 3) + case AFMADDD: + return FPOP3S(0, 0, 1, 0, 0) + + case AFMADDS: + return FPOP3S(0, 0, 0, 0, 0) + + case AFMSUBD: + return FPOP3S(0, 0, 1, 0, 1) + + case AFMSUBS: + return FPOP3S(0, 0, 0, 0, 1) + + case AFNMADDD: + return FPOP3S(0, 0, 1, 1, 0) + + case AFNMADDS: + return FPOP3S(0, 0, 0, 1, 0) + + case AFNMSUBD: + return FPOP3S(0, 0, 1, 1, 1) + + case AFNMSUBS: + return FPOP3S(0, 0, 0, 1, 1) + case AFMULS: return FPOP2S(0, 0, 0, 0) @@ -4345,6 +4469,12 @@ func (c *ctxt7) oprrr(p *obj.Prog, a obj.As) uint32 { case AVUADDLV: return 1<<29 | 7<<25 | 3<<20 | 7<<11 + + case AVFMLA: + return 7<<25 | 0<<23 | 1<<21 | 3<<14 | 3<<10 + + case AVFMLS: + return 7<<25 | 1<<23 | 1<<21 | 3<<14 | 3<<10 } c.ctxt.Diag("%v: bad rrr %d %v", p, a, a) diff --git a/src/cmd/internal/obj/arm64/doc.go b/src/cmd/internal/obj/arm64/doc.go index 3d65541fd2..a808d4c3ad 100644 --- a/src/cmd/internal/obj/arm64/doc.go +++ b/src/cmd/internal/obj/arm64/doc.go @@ -22,6 +22,46 @@ Go Assembly for ARM64 Reference Manual 2. Alphabetical list of float-point instructions // TODO + FMADDD: 64-bit floating-point fused Multiply-Add + FMADDD , , , + Multiplies the values of and , + adds the product to , and writes the result to . + + FMADDS: 32-bit floating-point fused Multiply-Add + FMADDS , , , + Multiplies the values of and , + adds the product to , and writes the result to . + + FMSUBD: 64-bit floating-point fused Multiply-Subtract + FMSUBD , , , + Multiplies the values of and , negates the product, + adds the product to , and writes the result to . + + FMSUBS: 32-bit floating-point fused Multiply-Subtract + FMSUBS , , , + Multiplies the values of and , negates the product, + adds the product to , and writes the result to . + + FNMADDD: 64-bit floating-point negated fused Multiply-Add + FNMADDD , , , + Multiplies the values of and , negates the product, + subtracts the value of , and writes the result to . + + FNMADDS: 32-bit floating-point negated fused Multiply-Add + FNMADDS , , , + Multiplies the values of and , negates the product, + subtracts the value of , and writes the result to . + + FNMSUBD: 64-bit floating-point negated fused Multiply-Subtract + FNMSUBD , , , + Multiplies the values of and , + subtracts the value of , and writes the result to . + + FNMSUBS: 32-bit floating-point negated fused Multiply-Subtract + FNMSUBS , , , + Multiplies the values of and , + subtracts the value of , and writes the result to . + 3. Alphabetical list of SIMD instructions VADD: Add (scalar) VADD , , @@ -65,6 +105,16 @@ Go Assembly for ARM64 Reference Manual Is an arrangement specifier and can have the following values: B8, B16 + VFMLA: Floating-point fused Multiply-Add to accumulator (vector) + VFMLA ., ., . + Is an arrangement specifier and can have the following values: + S2, S4, D2 + + VFMLS: Floating-point fused Multiply-Subtract from accumulator (vector) + VFMLS ., ., . + Is an arrangement specifier and can have the following values: + S2, S4, D2 + VLD1: Load multiple single-element structures VLD1 (Rn), [., . ...] // no offset VLD1.P imm(Rn), [., . ...] // immediate offset variant @@ -96,6 +146,10 @@ Go Assembly for ARM64 Reference Manual Is an element size specifier and can have the following values: B, H, S, D + VMOV .[index], .[index] // Move vector element to another vector element. + Is an element size specifier and can have the following values: + B, H, S, D + VMOVI: Move Immediate (vector). VMOVI $imm8, . is an arrangement specifier and can have the following values: