diff --git a/src/cmd/compile/internal/loong64/ssa.go b/src/cmd/compile/internal/loong64/ssa.go index 02286b8de82..c49fee68084 100644 --- a/src/cmd/compile/internal/loong64/ssa.go +++ b/src/cmd/compile/internal/loong64/ssa.go @@ -123,7 +123,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.From.Reg = x p.To.Type = obj.TYPE_REG p.To.Reg = y - case ssa.OpLOONG64MOVVnop: + case ssa.OpLOONG64MOVVnop, + ssa.OpLOONG64LoweredRound32F, + ssa.OpLOONG64LoweredRound64F: // nothing to do case ssa.OpLoadReg: if v.Type.IsFlags() { @@ -320,6 +322,30 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.Reg = v.Args[1].Reg() p.To.Type = obj.TYPE_REG p.To.Reg = loong64.REG_FCC0 + + case ssa.OpLOONG64FMADDF, + ssa.OpLOONG64FMADDD, + ssa.OpLOONG64FMSUBF, + ssa.OpLOONG64FMSUBD, + ssa.OpLOONG64FNMADDF, + ssa.OpLOONG64FNMADDD, + ssa.OpLOONG64FNMSUBF, + ssa.OpLOONG64FNMSUBD: + p := s.Prog(v.Op.Asm()) + // r=(FMA x y z) -> FMADDD z, y, x, r + // the SSA operand order is for taking advantage of + // commutativity (that only applies for the first two operands) + r := v.Reg() + x := v.Args[0].Reg() + y := v.Args[1].Reg() + z := v.Args[2].Reg() + p.From.Type = obj.TYPE_REG + p.From.Reg = z + p.Reg = y + p.AddRestSourceReg(x) + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpLOONG64MOVVaddr: p := s.Prog(loong64.AMOVV) p.From.Type = obj.TYPE_ADDR diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules index 69119f1d96d..ef7cfdf3964 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64.rules +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64.rules @@ -211,7 +211,7 @@ (CvtBoolToUint8 ...) => (Copy ...) -(Round(32|64)F ...) => (Copy ...) +(Round(32|64)F ...) => (LoweredRound(32|64)F ...) // comparisons (Eq8 x y) => (SGTU (MOVVconst [1]) (XOR (ZeroExt8to64 x) (ZeroExt8to64 y))) @@ -675,6 +675,21 @@ (REMVU _ (MOVVconst [1])) => (MOVVconst [0]) // mod (REMVU x (MOVVconst [c])) && isPowerOfTwo(c) => (ANDconst [c-1] x) // mod +// FMA +(FMA ...) => (FMADDD ...) +((ADD|SUB)F (MULF x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)F x y z) +((ADD|SUB)D (MULD x y) z) && z.Block.Func.useFMA(v) => (FM(ADD|SUB)D x y z) +// z - xy -> -(xy - z) +(SUBF z (MULF x y)) && z.Block.Func.useFMA(v) => (FNMSUBF x y z) +(SUBD z (MULD x y)) && z.Block.Func.useFMA(v) => (FNMSUBD x y z) +// z + (-xy) -> -(xy - z) +// z - (-xy) -> xy + z +((ADD|SUB)F z (NEGF (MULF x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)F x y z) +((ADD|SUB)D z (NEGD (MULD x y))) && z.Block.Func.useFMA(v) => (F(NMSUB|MADD)D x y z) +// -xy - z -> -(xy + z) +(SUBF (NEGF (MULF x y)) z) && z.Block.Func.useFMA(v) => (FNMADDF x y z) +(SUBD (NEGD (MULD x y)) z) && z.Block.Func.useFMA(v) => (FNMADDD x y z) + // generic simplifications (ADDV x (NEGV y)) => (SUBV x y) (SUBV x x) => (MOVVconst [0]) diff --git a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go index 5789760683c..465e724a194 100644 --- a/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go @@ -151,6 +151,7 @@ func init() { fp01 = regInfo{inputs: nil, outputs: []regMask{fp}} fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}} fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}} + fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}} fp2flags = regInfo{inputs: []regMask{fp, fp}} fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}} fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}} @@ -193,6 +194,15 @@ func init() { {name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1) {name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt) + {name: "FMADDF", argLength: 3, reg: fp31, asm: "FMADDF", commutative: true, typ: "Float32"}, // (arg0 * arg1) + arg2 + {name: "FMADDD", argLength: 3, reg: fp31, asm: "FMADDD", commutative: true, typ: "Float64"}, // (arg0 * arg1) + arg2 + {name: "FMSUBF", argLength: 3, reg: fp31, asm: "FMSUBF", commutative: true, typ: "Float32"}, // (arg0 * arg1) - arg2 + {name: "FMSUBD", argLength: 3, reg: fp31, asm: "FMSUBD", commutative: true, typ: "Float64"}, // (arg0 * arg1) - arg2 + {name: "FNMADDF", argLength: 3, reg: fp31, asm: "FNMADDF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) + arg2) + {name: "FNMADDD", argLength: 3, reg: fp31, asm: "FNMADDD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) + arg2) + {name: "FNMSUBF", argLength: 3, reg: fp31, asm: "FNMSUBF", commutative: true, typ: "Float32"}, // -((arg0 * arg1) - arg2) + {name: "FNMSUBD", argLength: 3, reg: fp31, asm: "FNMSUBD", commutative: true, typ: "Float64"}, // -((arg0 * arg1) - arg2) + {name: "NEGV", argLength: 1, reg: gp11}, // -arg0 {name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32 {name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64 @@ -330,6 +340,10 @@ func init() { {name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64 {name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32 + // Round ops to block fused-multiply-add extraction. + {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true}, + {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true}, + // function calls {name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem {name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index b18a4385d2b..bcc358db508 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1783,6 +1783,14 @@ const ( OpLOONG64XORconst OpLOONG64NOR OpLOONG64NORconst + OpLOONG64FMADDF + OpLOONG64FMADDD + OpLOONG64FMSUBF + OpLOONG64FMSUBD + OpLOONG64FNMADDF + OpLOONG64FNMADDD + OpLOONG64FNMSUBF + OpLOONG64FNMSUBD OpLOONG64NEGV OpLOONG64NEGF OpLOONG64NEGD @@ -1887,6 +1895,8 @@ const ( OpLOONG64TRUNCDV OpLOONG64MOVFD OpLOONG64MOVDF + OpLOONG64LoweredRound32F + OpLOONG64LoweredRound64F OpLOONG64CALLstatic OpLOONG64CALLtail OpLOONG64CALLclosure @@ -23928,6 +23938,134 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "FMADDF", + argLen: 3, + commutative: true, + asm: loong64.AFMADDF, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMADDD", + argLen: 3, + commutative: true, + asm: loong64.AFMADDD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMSUBF", + argLen: 3, + commutative: true, + asm: loong64.AFMSUBF, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FMSUBD", + argLen: 3, + commutative: true, + asm: loong64.AFMSUBD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMADDF", + argLen: 3, + commutative: true, + asm: loong64.AFNMADDF, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMADDD", + argLen: 3, + commutative: true, + asm: loong64.AFNMADDD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMSUBF", + argLen: 3, + commutative: true, + asm: loong64.AFNMSUBF, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMSUBD", + argLen: 3, + commutative: true, + asm: loong64.AFNMSUBD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {2, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "NEGV", argLen: 1, @@ -25326,6 +25464,32 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "LoweredRound32F", + argLen: 1, + resultInArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "LoweredRound64F", + argLen: 1, + resultInArg0: true, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 4611686017353646080}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "CALLstatic", auxType: auxCallOff, diff --git a/src/cmd/compile/internal/ssa/rewriteLOONG64.go b/src/cmd/compile/internal/ssa/rewriteLOONG64.go index fedcd196d46..e8c1d26554a 100644 --- a/src/cmd/compile/internal/ssa/rewriteLOONG64.go +++ b/src/cmd/compile/internal/ssa/rewriteLOONG64.go @@ -216,6 +216,9 @@ func rewriteValueLOONG64(v *Value) bool { return rewriteValueLOONG64_OpEqB(v) case OpEqPtr: return rewriteValueLOONG64_OpEqPtr(v) + case OpFMA: + v.Op = OpLOONG64FMADDD + return true case OpGetCallerPC: v.Op = OpLOONG64LoweredGetCallerPC return true @@ -244,6 +247,10 @@ func rewriteValueLOONG64(v *Value) bool { return rewriteValueLOONG64_OpIsNonNil(v) case OpIsSliceInBounds: return rewriteValueLOONG64_OpIsSliceInBounds(v) + case OpLOONG64ADDD: + return rewriteValueLOONG64_OpLOONG64ADDD(v) + case OpLOONG64ADDF: + return rewriteValueLOONG64_OpLOONG64ADDF(v) case OpLOONG64ADDV: return rewriteValueLOONG64_OpLOONG64ADDV(v) case OpLOONG64ADDVconst: @@ -392,6 +399,10 @@ func rewriteValueLOONG64(v *Value) bool { return rewriteValueLOONG64_OpLOONG64SRLV(v) case OpLOONG64SRLVconst: return rewriteValueLOONG64_OpLOONG64SRLVconst(v) + case OpLOONG64SUBD: + return rewriteValueLOONG64_OpLOONG64SUBD(v) + case OpLOONG64SUBF: + return rewriteValueLOONG64_OpLOONG64SUBF(v) case OpLOONG64SUBV: return rewriteValueLOONG64_OpLOONG64SUBV(v) case OpLOONG64SUBVconst: @@ -596,10 +607,10 @@ func rewriteValueLOONG64(v *Value) bool { case OpRotateLeft8: return rewriteValueLOONG64_OpRotateLeft8(v) case OpRound32F: - v.Op = OpCopy + v.Op = OpLOONG64LoweredRound32F return true case OpRound64F: - v.Op = OpCopy + v.Op = OpLOONG64LoweredRound64F return true case OpRsh16Ux16: return rewriteValueLOONG64_OpRsh16Ux16(v) @@ -1410,6 +1421,104 @@ func rewriteValueLOONG64_OpIsSliceInBounds(v *Value) bool { return true } } +func rewriteValueLOONG64_OpLOONG64ADDD(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (ADDD (MULD x y) z) + // cond: z.Block.Func.useFMA(v) + // result: (FMADDD x y z) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpLOONG64MULD { + continue + } + y := v_0.Args[1] + x := v_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + continue + } + v.reset(OpLOONG64FMADDD) + v.AddArg3(x, y, z) + return true + } + break + } + // match: (ADDD z (NEGD (MULD x y))) + // cond: z.Block.Func.useFMA(v) + // result: (FNMSUBD x y z) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + z := v_0 + if v_1.Op != OpLOONG64NEGD { + continue + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpLOONG64MULD { + continue + } + y := v_1_0.Args[1] + x := v_1_0.Args[0] + if !(z.Block.Func.useFMA(v)) { + continue + } + v.reset(OpLOONG64FNMSUBD) + v.AddArg3(x, y, z) + return true + } + break + } + return false +} +func rewriteValueLOONG64_OpLOONG64ADDF(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (ADDF (MULF x y) z) + // cond: z.Block.Func.useFMA(v) + // result: (FMADDF x y z) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + if v_0.Op != OpLOONG64MULF { + continue + } + y := v_0.Args[1] + x := v_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + continue + } + v.reset(OpLOONG64FMADDF) + v.AddArg3(x, y, z) + return true + } + break + } + // match: (ADDF z (NEGF (MULF x y))) + // cond: z.Block.Func.useFMA(v) + // result: (FNMSUBF x y z) + for { + for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 { + z := v_0 + if v_1.Op != OpLOONG64NEGF { + continue + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpLOONG64MULF { + continue + } + y := v_1_0.Args[1] + x := v_1_0.Args[0] + if !(z.Block.Func.useFMA(v)) { + continue + } + v.reset(OpLOONG64FNMSUBF) + v.AddArg3(x, y, z) + return true + } + break + } + return false +} func rewriteValueLOONG64_OpLOONG64ADDV(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] @@ -5944,6 +6053,168 @@ func rewriteValueLOONG64_OpLOONG64SRLVconst(v *Value) bool { } return false } +func rewriteValueLOONG64_OpLOONG64SUBD(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (SUBD (MULD x y) z) + // cond: z.Block.Func.useFMA(v) + // result: (FMSUBD x y z) + for { + if v_0.Op != OpLOONG64MULD { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FMSUBD) + v.AddArg3(x, y, z) + return true + } + // match: (SUBD z (MULD x y)) + // cond: z.Block.Func.useFMA(v) + // result: (FNMSUBD x y z) + for { + z := v_0 + if v_1.Op != OpLOONG64MULD { + break + } + y := v_1.Args[1] + x := v_1.Args[0] + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FNMSUBD) + v.AddArg3(x, y, z) + return true + } + // match: (SUBD z (NEGD (MULD x y))) + // cond: z.Block.Func.useFMA(v) + // result: (FMADDD x y z) + for { + z := v_0 + if v_1.Op != OpLOONG64NEGD { + break + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpLOONG64MULD { + break + } + y := v_1_0.Args[1] + x := v_1_0.Args[0] + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FMADDD) + v.AddArg3(x, y, z) + return true + } + // match: (SUBD (NEGD (MULD x y)) z) + // cond: z.Block.Func.useFMA(v) + // result: (FNMADDD x y z) + for { + if v_0.Op != OpLOONG64NEGD { + break + } + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpLOONG64MULD { + break + } + y := v_0_0.Args[1] + x := v_0_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FNMADDD) + v.AddArg3(x, y, z) + return true + } + return false +} +func rewriteValueLOONG64_OpLOONG64SUBF(v *Value) bool { + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (SUBF (MULF x y) z) + // cond: z.Block.Func.useFMA(v) + // result: (FMSUBF x y z) + for { + if v_0.Op != OpLOONG64MULF { + break + } + y := v_0.Args[1] + x := v_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FMSUBF) + v.AddArg3(x, y, z) + return true + } + // match: (SUBF z (MULF x y)) + // cond: z.Block.Func.useFMA(v) + // result: (FNMSUBF x y z) + for { + z := v_0 + if v_1.Op != OpLOONG64MULF { + break + } + y := v_1.Args[1] + x := v_1.Args[0] + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FNMSUBF) + v.AddArg3(x, y, z) + return true + } + // match: (SUBF z (NEGF (MULF x y))) + // cond: z.Block.Func.useFMA(v) + // result: (FMADDF x y z) + for { + z := v_0 + if v_1.Op != OpLOONG64NEGF { + break + } + v_1_0 := v_1.Args[0] + if v_1_0.Op != OpLOONG64MULF { + break + } + y := v_1_0.Args[1] + x := v_1_0.Args[0] + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FMADDF) + v.AddArg3(x, y, z) + return true + } + // match: (SUBF (NEGF (MULF x y)) z) + // cond: z.Block.Func.useFMA(v) + // result: (FNMADDF x y z) + for { + if v_0.Op != OpLOONG64NEGF { + break + } + v_0_0 := v_0.Args[0] + if v_0_0.Op != OpLOONG64MULF { + break + } + y := v_0_0.Args[1] + x := v_0_0.Args[0] + z := v_1 + if !(z.Block.Func.useFMA(v)) { + break + } + v.reset(OpLOONG64FNMADDF) + v.AddArg3(x, y, z) + return true + } + return false +} func rewriteValueLOONG64_OpLOONG64SUBV(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index fda273b3e56..a1d962ee3ad 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -689,7 +689,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2]) }, - sys.ARM64, sys.PPC64, sys.RISCV64, sys.S390X) + sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X) addF("math", "FMA", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if !s.config.UseFMA { diff --git a/src/cmd/compile/internal/ssagen/intrinsics_test.go b/src/cmd/compile/internal/ssagen/intrinsics_test.go index 4e59714ce7f..9cf8cbc8772 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics_test.go +++ b/src/cmd/compile/internal/ssagen/intrinsics_test.go @@ -399,6 +399,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{ {"loong64", "internal/runtime/sys", "Len8"}: struct{}{}, {"loong64", "math", "Abs"}: struct{}{}, {"loong64", "math", "Copysign"}: struct{}{}, + {"loong64", "math", "FMA"}: struct{}{}, {"loong64", "math", "sqrt"}: struct{}{}, {"loong64", "math/big", "mulWW"}: struct{}{}, {"loong64", "math/bits", "Add"}: struct{}{}, diff --git a/test/codegen/floats.go b/test/codegen/floats.go index a77843d0e73..1b85eba3524 100644 --- a/test/codegen/floats.go +++ b/test/codegen/floats.go @@ -72,6 +72,7 @@ func FusedAdd32(x, y, z float32) float32 { // s390x:"FMADDS\t" // ppc64x:"FMADDS\t" // arm64:"FMADDS" + // loong64:"FMADDF\t" // riscv64:"FMADDS\t" return x*y + z } @@ -80,11 +81,13 @@ func FusedSub32_a(x, y, z float32) float32 { // s390x:"FMSUBS\t" // ppc64x:"FMSUBS\t" // riscv64:"FMSUBS\t" + // loong64:"FMSUBF\t" return x*y - z } func FusedSub32_b(x, y, z float32) float32 { // arm64:"FMSUBS" + // loong64:"FNMSUBF\t" // riscv64:"FNMSUBS\t" return z - x*y } @@ -93,6 +96,7 @@ func FusedAdd64(x, y, z float64) float64 { // s390x:"FMADD\t" // ppc64x:"FMADD\t" // arm64:"FMADDD" + // loong64:"FMADDD\t" // riscv64:"FMADDD\t" return x*y + z } @@ -101,11 +105,13 @@ func FusedSub64_a(x, y, z float64) float64 { // s390x:"FMSUB\t" // ppc64x:"FMSUB\t" // riscv64:"FMSUBD\t" + // loong64:"FMSUBD\t" return x*y - z } func FusedSub64_b(x, y, z float64) float64 { // arm64:"FMSUBD" + // loong64:"FNMSUBD\t" // riscv64:"FNMSUBD\t" return z - x*y } diff --git a/test/codegen/math.go b/test/codegen/math.go index 806f9096484..4ce5fa419d2 100644 --- a/test/codegen/math.go +++ b/test/codegen/math.go @@ -132,6 +132,7 @@ func fma(x, y, z float64) float64 { // amd64:"VFMADD231SD" // arm/6:"FMULAD" // arm64:"FMADDD" + // loong64:"FMADDD" // s390x:"FMADD" // ppc64x:"FMADD" // riscv64:"FMADDD"