1
0
mirror of https://github.com/golang/go synced 2024-11-21 21:44:40 -07:00

cmd/compile: implement FMA codegen for loong64

Benchmark results on Loongson 3A5000 and 3A6000:

goos: linux
goarch: loong64
pkg: math
cpu: Loongson-3A6000 @ 2500.00MHz
    |  bench.old   |              bench.new              |
    |    sec/op    |   sec/op     vs base                |
FMA   25.930n ± 0%   2.002n ± 0%  -92.28% (p=0.000 n=10)

goos: linux
goarch: loong64
pkg: math
cpu: Loongson-3A5000 @ 2500.00MHz
    |  bench.old   |              bench.new              |
    |    sec/op    |   sec/op     vs base                |
FMA   32.840n ± 0%   2.002n ± 0%  -93.90% (p=0.000 n=10)

Updates #59120

This patch is a copy of CL 483355.
Co-authored-by: WANG Xuerui <git@xen0n.name>

Change-Id: I88b89d23f00864f9173a182a47ee135afec7ed6e
Reviewed-on: https://go-review.googlesource.com/c/go/+/625335
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
This commit is contained in:
Xiaolin Zhao 2024-11-05 15:30:45 +08:00 committed by abner chenc
parent 2751443e92
commit e6cc9d228a
9 changed files with 503 additions and 5 deletions

View File

@ -123,7 +123,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.From.Reg = x
p.To.Type = obj.TYPE_REG
p.To.Reg = y
case ssa.OpLOONG64MOVVnop:
case ssa.OpLOONG64MOVVnop,
ssa.OpLOONG64LoweredRound32F,
ssa.OpLOONG64LoweredRound64F:
// nothing to do
case ssa.OpLoadReg:
if v.Type.IsFlags() {
@ -320,6 +322,30 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = loong64.REG_FCC0
case ssa.OpLOONG64FMADDF,
ssa.OpLOONG64FMADDD,
ssa.OpLOONG64FMSUBF,
ssa.OpLOONG64FMSUBD,
ssa.OpLOONG64FNMADDF,
ssa.OpLOONG64FNMADDD,
ssa.OpLOONG64FNMSUBF,
ssa.OpLOONG64FNMSUBD:
p := s.Prog(v.Op.Asm())
// r=(FMA x y z) -> FMADDD z, y, x, r
// the SSA operand order is for taking advantage of
// commutativity (that only applies for the first two operands)
r := v.Reg()
x := v.Args[0].Reg()
y := v.Args[1].Reg()
z := v.Args[2].Reg()
p.From.Type = obj.TYPE_REG
p.From.Reg = z
p.Reg = y
p.AddRestSourceReg(x)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpLOONG64MOVVaddr:
p := s.Prog(loong64.AMOVV)
p.From.Type = obj.TYPE_ADDR

View File

@ -211,7 +211,7 @@
(CvtBoolToUint8 ...) => (Copy ...)
(Round(32|64)F ...) => (Copy ...)
(Round(32|64)F ...) => (LoweredRound(32|64)F ...)
// comparisons
(Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y)))
@ -675,6 +675,21 @@
(REMVU _ (MOVVconst [1])) => (MOVVconst [0]) // mod
(REMVU x (MOVVconst [c])) && isPowerOfTwo(c) => (ANDconst [c-1] x) // mod
// FMA
(FMA ...) => (FMADDD ...)
((ADD|SUB)F (MULF x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)F x y z)
((ADD|SUB)D (MULD x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)D x y z)
// z - xy -> -(xy - z)
(SUBF z (MULF x y)) && z.Block.Func.useFMA(v) => (FNMSUBF x y z)
(SUBD z (MULD x y)) && z.Block.Func.useFMA(v) => (FNMSUBD x y z)
// z + (-xy) -> -(xy - z)
// z - (-xy) -> xy + z
((ADD|SUB)F z (NEGF (MULF x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)F x y z)
((ADD|SUB)D z (NEGD (MULD x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)D x y z)
// -xy - z -> -(xy + z)
(SUBF (NEGF (MULF x y)) z) && z.Block.Func.useFMA(v) => (FNMADDF x y z)
(SUBD (NEGD (MULD x y)) z) && z.Block.Func.useFMA(v) => (FNMADDD x y z)
// generic simplifications
(ADDV x (NEGV y)) => (SUBV x y)
(SUBV x x) => (MOVVconst [0])

View File

@ -151,6 +151,7 @@ func init() {
fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
fp2flags = regInfo{inputs: []regMask{fp, fp}}
fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
@ -193,6 +194,15 @@ func init() {
{name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
{name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt)
{name: "FMADDF", argLength: 3, reg: fp31, asm: "FMADDF", commutative: true, typ: "Float32"}, // (arg0 * arg1) + arg2
{name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD", commutative: true, typ: "Float64"}, // (arg0 * arg1) + arg2
{name: "FMSUBF", argLength: 3, reg: fp31, asm: "FMSUBF", commutative: true, typ: "Float32"}, // (arg0 * arg1) - arg2
{name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD", commutative: true, typ: "Float64"}, // (arg0 * arg1) - arg2
{name: "FNMADDF", argLength: 3, reg: fp31, asm: "FNMADDF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) + arg2)
{name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) + arg2)
{name: "FNMSUBF", argLength: 3, reg: fp31, asm: "FNMSUBF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) - arg2)
{name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) - arg2)
{name: "NEGV", argLength: 1, reg: gp11}, // -arg0
{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
@ -330,6 +340,10 @@ func init() {
{name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
{name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
// Round ops to block fused-multiply-add extraction.
{name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true},
{name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true},
// function calls
{name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
{name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem

View File

@ -1783,6 +1783,14 @@ const (
OpLOONG64XORconst
OpLOONG64NOR
OpLOONG64NORconst
OpLOONG64FMADDF
OpLOONG64FMADDD
OpLOONG64FMSUBF
OpLOONG64FMSUBD
OpLOONG64FNMADDF
OpLOONG64FNMADDD
OpLOONG64FNMSUBF
OpLOONG64FNMSUBD
OpLOONG64NEGV
OpLOONG64NEGF
OpLOONG64NEGD
@ -1887,6 +1895,8 @@ const (
OpLOONG64TRUNCDV
OpLOONG64MOVFD
OpLOONG64MOVDF
OpLOONG64LoweredRound32F
OpLOONG64LoweredRound64F
OpLOONG64CALLstatic
OpLOONG64CALLtail
OpLOONG64CALLclosure
@ -23928,6 +23938,134 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "FMADDF",
argLen: 3,
commutative: true,
asm: loong64.AFMADDF,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FMADDD",
argLen: 3,
commutative: true,
asm: loong64.AFMADDD,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FMSUBF",
argLen: 3,
commutative: true,
asm: loong64.AFMSUBF,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FMSUBD",
argLen: 3,
commutative: true,
asm: loong64.AFMSUBD,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FNMADDF",
argLen: 3,
commutative: true,
asm: loong64.AFNMADDF,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FNMADDD",
argLen: 3,
commutative: true,
asm: loong64.AFNMADDD,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FNMSUBF",
argLen: 3,
commutative: true,
asm: loong64.AFNMSUBF,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FNMSUBD",
argLen: 3,
commutative: true,
asm: loong64.AFNMSUBD,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "NEGV",
argLen: 1,
@ -25326,6 +25464,32 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredRound32F",
argLen: 1,
resultInArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "LoweredRound64F",
argLen: 1,
resultInArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "CALLstatic",
auxType: auxCallOff,

View File

@ -216,6 +216,9 @@ func rewriteValueLOONG64(v *Value) bool {
return rewriteValueLOONG64_OpEqB(v)
case OpEqPtr:
return rewriteValueLOONG64_OpEqPtr(v)
case OpFMA:
v.Op = OpLOONG64FMADDD
return true
case OpGetCallerPC:
v.Op = OpLOONG64LoweredGetCallerPC
return true
@ -244,6 +247,10 @@ func rewriteValueLOONG64(v *Value) bool {
return rewriteValueLOONG64_OpIsNonNil(v)
case OpIsSliceInBounds:
return rewriteValueLOONG64_OpIsSliceInBounds(v)
case OpLOONG64ADDD:
return rewriteValueLOONG64_OpLOONG64ADDD(v)
case OpLOONG64ADDF:
return rewriteValueLOONG64_OpLOONG64ADDF(v)
case OpLOONG64ADDV:
return rewriteValueLOONG64_OpLOONG64ADDV(v)
case OpLOONG64ADDVconst:
@ -392,6 +399,10 @@ func rewriteValueLOONG64(v *Value) bool {
return rewriteValueLOONG64_OpLOONG64SRLV(v)
case OpLOONG64SRLVconst:
return rewriteValueLOONG64_OpLOONG64SRLVconst(v)
case OpLOONG64SUBD:
return rewriteValueLOONG64_OpLOONG64SUBD(v)
case OpLOONG64SUBF:
return rewriteValueLOONG64_OpLOONG64SUBF(v)
case OpLOONG64SUBV:
return rewriteValueLOONG64_OpLOONG64SUBV(v)
case OpLOONG64SUBVconst:
@ -596,10 +607,10 @@ func rewriteValueLOONG64(v *Value) bool {
case OpRotateLeft8:
return rewriteValueLOONG64_OpRotateLeft8(v)
case OpRound32F:
v.Op = OpCopy
v.Op = OpLOONG64LoweredRound32F
return true
case OpRound64F:
v.Op = OpCopy
v.Op = OpLOONG64LoweredRound64F
return true
case OpRsh16Ux16:
return rewriteValueLOONG64_OpRsh16Ux16(v)
@ -1410,6 +1421,104 @@ func rewriteValueLOONG64_OpIsSliceInBounds(v *Value) bool {
return true
}
}
// rewriteValueLOONG64_OpLOONG64ADDD tries to fold a float64 add whose operand
// is a multiply (or a negated multiply) into a single fused op. It reports
// whether v was rewritten. Each pattern is attempted with the two add
// operands in both orders, and only fires when useFMA allows fusing at v.
func rewriteValueLOONG64_OpLOONG64ADDD(v *Value) bool {
	second := v.Args[1]
	first := v.Args[0]
	// Operand orders to try: as written first, then swapped.
	orders := [2][2]*Value{{first, second}, {second, first}}

	// (ADDD (MULD x y) z) => (FMADDD x y z), i.e. x*y + z.
	for _, ord := range orders {
		mul, z := ord[0], ord[1]
		if mul.Op != OpLOONG64MULD {
			continue
		}
		x, y := mul.Args[0], mul.Args[1]
		if !z.Block.Func.useFMA(v) {
			continue
		}
		v.reset(OpLOONG64FMADDD)
		v.AddArg3(x, y, z)
		return true
	}

	// (ADDD z (NEGD (MULD x y))) => (FNMSUBD x y z), i.e. z + (-xy) = -(xy - z).
	for _, ord := range orders {
		z, neg := ord[0], ord[1]
		if neg.Op != OpLOONG64NEGD {
			continue
		}
		mul := neg.Args[0]
		if mul.Op != OpLOONG64MULD {
			continue
		}
		x, y := mul.Args[0], mul.Args[1]
		if !z.Block.Func.useFMA(v) {
			continue
		}
		v.reset(OpLOONG64FNMSUBD)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
// rewriteValueLOONG64_OpLOONG64ADDF tries to fold a float32 add whose operand
// is a multiply (or a negated multiply) into a single fused op. It reports
// whether v was rewritten. Each pattern is attempted with the two add
// operands in both orders, and only fires when useFMA allows fusing at v.
func rewriteValueLOONG64_OpLOONG64ADDF(v *Value) bool {
	second := v.Args[1]
	first := v.Args[0]
	// Operand orders to try: as written first, then swapped.
	orders := [2][2]*Value{{first, second}, {second, first}}

	// (ADDF (MULF x y) z) => (FMADDF x y z), i.e. x*y + z.
	for _, ord := range orders {
		mul, z := ord[0], ord[1]
		if mul.Op != OpLOONG64MULF {
			continue
		}
		x, y := mul.Args[0], mul.Args[1]
		if !z.Block.Func.useFMA(v) {
			continue
		}
		v.reset(OpLOONG64FMADDF)
		v.AddArg3(x, y, z)
		return true
	}

	// (ADDF z (NEGF (MULF x y))) => (FNMSUBF x y z), i.e. z + (-xy) = -(xy - z).
	for _, ord := range orders {
		z, neg := ord[0], ord[1]
		if neg.Op != OpLOONG64NEGF {
			continue
		}
		mul := neg.Args[0]
		if mul.Op != OpLOONG64MULF {
			continue
		}
		x, y := mul.Args[0], mul.Args[1]
		if !z.Block.Func.useFMA(v) {
			continue
		}
		v.reset(OpLOONG64FNMSUBF)
		v.AddArg3(x, y, z)
		return true
	}
	return false
}
func rewriteValueLOONG64_OpLOONG64ADDV(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
@ -5944,6 +6053,168 @@ func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool {
}
return false
}
// rewriteValueLOONG64_OpLOONG64SUBD tries to fold a float64 subtract that
// involves a multiply (optionally negated) into a single fused op. It reports
// whether v was rewritten. Patterns are tried in a fixed order and each one
// only fires when useFMA allows fusing at v.
func rewriteValueLOONG64_OpLOONG64SUBD(v *Value) bool {
	rhs := v.Args[1]
	lhs := v.Args[0]

	// (SUBD (MULD x y) z) => (FMSUBD x y z), i.e. x*y - z.
	if lhs.Op == OpLOONG64MULD {
		x, y, z := lhs.Args[0], lhs.Args[1], rhs
		if z.Block.Func.useFMA(v) {
			v.reset(OpLOONG64FMSUBD)
			v.AddArg3(x, y, z)
			return true
		}
	}

	// (SUBD z (MULD x y)) => (FNMSUBD x y z), i.e. z - xy = -(xy - z).
	if rhs.Op == OpLOONG64MULD {
		x, y, z := rhs.Args[0], rhs.Args[1], lhs
		if z.Block.Func.useFMA(v) {
			v.reset(OpLOONG64FNMSUBD)
			v.AddArg3(x, y, z)
			return true
		}
	}

	// (SUBD z (NEGD (MULD x y))) => (FMADDD x y z), i.e. z - (-xy) = xy + z.
	if rhs.Op == OpLOONG64NEGD {
		if mul := rhs.Args[0]; mul.Op == OpLOONG64MULD {
			x, y, z := mul.Args[0], mul.Args[1], lhs
			if z.Block.Func.useFMA(v) {
				v.reset(OpLOONG64FMADDD)
				v.AddArg3(x, y, z)
				return true
			}
		}
	}

	// (SUBD (NEGD (MULD x y)) z) => (FNMADDD x y z), i.e. -xy - z = -(xy + z).
	if lhs.Op == OpLOONG64NEGD {
		if mul := lhs.Args[0]; mul.Op == OpLOONG64MULD {
			x, y, z := mul.Args[0], mul.Args[1], rhs
			if z.Block.Func.useFMA(v) {
				v.reset(OpLOONG64FNMADDD)
				v.AddArg3(x, y, z)
				return true
			}
		}
	}
	return false
}
// rewriteValueLOONG64_OpLOONG64SUBF tries to fold a float32 subtract that
// involves a multiply (optionally negated) into a single fused op. It reports
// whether v was rewritten. Patterns are tried in a fixed order and each one
// only fires when useFMA allows fusing at v.
func rewriteValueLOONG64_OpLOONG64SUBF(v *Value) bool {
	rhs := v.Args[1]
	lhs := v.Args[0]

	// (SUBF (MULF x y) z) => (FMSUBF x y z), i.e. x*y - z.
	if lhs.Op == OpLOONG64MULF {
		x, y, z := lhs.Args[0], lhs.Args[1], rhs
		if z.Block.Func.useFMA(v) {
			v.reset(OpLOONG64FMSUBF)
			v.AddArg3(x, y, z)
			return true
		}
	}

	// (SUBF z (MULF x y)) => (FNMSUBF x y z), i.e. z - xy = -(xy - z).
	if rhs.Op == OpLOONG64MULF {
		x, y, z := rhs.Args[0], rhs.Args[1], lhs
		if z.Block.Func.useFMA(v) {
			v.reset(OpLOONG64FNMSUBF)
			v.AddArg3(x, y, z)
			return true
		}
	}

	// (SUBF z (NEGF (MULF x y))) => (FMADDF x y z), i.e. z - (-xy) = xy + z.
	if rhs.Op == OpLOONG64NEGF {
		if mul := rhs.Args[0]; mul.Op == OpLOONG64MULF {
			x, y, z := mul.Args[0], mul.Args[1], lhs
			if z.Block.Func.useFMA(v) {
				v.reset(OpLOONG64FMADDF)
				v.AddArg3(x, y, z)
				return true
			}
		}
	}

	// (SUBF (NEGF (MULF x y)) z) => (FNMADDF x y z), i.e. -xy - z = -(xy + z).
	if lhs.Op == OpLOONG64NEGF {
		if mul := lhs.Args[0]; mul.Op == OpLOONG64MULF {
			x, y, z := mul.Args[0], mul.Args[1], rhs
			if z.Block.Func.useFMA(v) {
				v.reset(OpLOONG64FNMADDF)
				v.AddArg3(x, y, z)
				return true
			}
		}
	}
	return false
}
func rewriteValueLOONG64_OpLOONG64SUBV(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]

View File

@ -689,7 +689,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
},
sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("math", "FMA",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if !s.config.UseFMA {

View File

@ -399,6 +399,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"loong64", "internal/runtime/sys", "Len8"}: struct{}{},
{"loong64", "math", "Abs"}: struct{}{},
{"loong64", "math", "Copysign"}: struct{}{},
{"loong64", "math", "FMA"}: struct{}{},
{"loong64", "math", "sqrt"}: struct{}{},
{"loong64", "math/big", "mulWW"}: struct{}{},
{"loong64", "math/bits", "Add"}: struct{}{},

View File

@ -72,6 +72,7 @@ func FusedAdd32(x, y, z float32) float32 {
// s390x:"FMADDS\t"
// ppc64x:"FMADDS\t"
// arm64:"FMADDS"
// loong64:"FMADDF\t"
// riscv64:"FMADDS\t"
return x*y + z
}
@ -80,11 +81,13 @@ func FusedSub32_a(x, y, z float32) float32 {
// s390x:"FMSUBS\t"
// ppc64x:"FMSUBS\t"
// riscv64:"FMSUBS\t"
// loong64:"FMSUBF\t"
return x*y - z
}
// FusedSub32_b returns z - x*y computed in float32.
// The arch-tagged comments in the body name, per architecture, an instruction
// pattern for the return expression; they must stay directly above it.
// NOTE(review): presumably these are read by the codegen test harness — confirm.
func FusedSub32_b(x, y, z float32) float32 {
// arm64:"FMSUBS"
// loong64:"FNMSUBF\t"
// riscv64:"FNMSUBS\t"
return z - x*y
}
@ -93,6 +96,7 @@ func FusedAdd64(x, y, z float64) float64 {
// s390x:"FMADD\t"
// ppc64x:"FMADD\t"
// arm64:"FMADDD"
// loong64:"FMADDD\t"
// riscv64:"FMADDD\t"
return x*y + z
}
@ -101,11 +105,13 @@ func FusedSub64_a(x, y, z float64) float64 {
// s390x:"FMSUB\t"
// ppc64x:"FMSUB\t"
// riscv64:"FMSUBD\t"
// loong64:"FMSUBD\t"
return x*y - z
}
// FusedSub64_b returns z - x*y computed in float64.
// The arch-tagged comments in the body name, per architecture, an instruction
// pattern for the return expression; they must stay directly above it.
// NOTE(review): presumably these are read by the codegen test harness — confirm.
func FusedSub64_b(x, y, z float64) float64 {
// arm64:"FMSUBD"
// loong64:"FNMSUBD\t"
// riscv64:"FNMSUBD\t"
return z - x*y
}

View File

@ -132,6 +132,7 @@ func fma(x, y, z float64) float64 {
// amd64:"VFMADD231SD"
// arm/6:"FMULAD"
// arm64:"FMADDD"
// loong64:"FMADDD"
// s390x:"FMADD"
// ppc64x:"FMADD"
// riscv64:"FMADDD"