From ebb77aa867b4dddd63fc397a907ac93020480723 Mon Sep 17 00:00:00 2001 From: Ben Shi Date: Wed, 7 Feb 2018 12:24:41 +0000 Subject: [PATCH] cmd/compile/internal/ssa: optimize arm64 with FNMULS/FNMULD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FNMULS&FNMULD are efficient arm64 instructions, which can be used to improve FP performance. This CL use them to optimize pairs of neg-mul operations. Here are benchmark test results on Raspberry Pi 3 with ArchLinux. 1. A special test case gets about 15% improvement. (https://github.com/benshi001/ugo1/blob/master/fpmul_test.go) FPMul-4 485µs ± 0% 410µs ± 0% -15.49% (p=0.000 n=26+23) 2. There is little regression in the go1 benchmark (excluding noise). name old time/op new time/op delta BinaryTree17-4 42.0s ± 3% 42.1s ± 2% ~ (p=0.542 n=39+40) Fannkuch11-4 33.3s ± 3% 32.9s ± 1% ~ (p=0.200 n=40+32) FmtFprintfEmpty-4 534ns ± 0% 534ns ± 0% ~ (all equal) FmtFprintfString-4 1.09µs ± 1% 1.09µs ± 0% ~ (p=0.950 n=32+32) FmtFprintfInt-4 1.14µs ± 0% 1.14µs ± 1% ~ (p=0.571 n=32+31) FmtFprintfIntInt-4 1.79µs ± 3% 1.76µs ± 0% -1.42% (p=0.004 n=40+34) FmtFprintfPrefixedInt-4 2.17µs ± 0% 2.17µs ± 0% ~ (p=0.073 n=31+34) FmtFprintfFloat-4 3.33µs ± 3% 3.28µs ± 0% -1.46% (p=0.001 n=40+34) FmtManyArgs-4 7.28µs ± 6% 7.19µs ± 0% ~ (p=0.641 n=40+33) GobDecode-4 96.5ms ± 4% 96.5ms ± 9% ~ (p=0.214 n=40+40) GobEncode-4 79.5ms ± 0% 80.7ms ± 4% +1.51% (p=0.000 n=34+40) Gzip-4 4.53s ± 4% 4.56s ± 4% +0.60% (p=0.000 n=40+40) Gunzip-4 451ms ± 3% 442ms ± 0% -1.93% (p=0.000 n=40+32) HTTPClientServer-4 530µs ± 1% 535µs ± 1% +0.88% (p=0.000 n=39+39) JSONEncode-4 214ms ± 4% 211ms ± 0% ~ (p=0.059 n=40+31) JSONDecode-4 865ms ± 5% 864ms ± 4% -0.06% (p=0.003 n=40+40) Mandelbrot200-4 52.0ms ± 3% 52.1ms ± 3% ~ (p=0.556 n=40+40) GoParse-4 43.1ms ± 8% 42.1ms ± 0% ~ (p=0.083 n=40+33) RegexpMatchEasy0_32-4 1.02µs ± 3% 1.02µs ± 4% +0.06% (p=0.020 n=40+40) RegexpMatchEasy0_1K-4 3.90µs ± 0% 3.96µs ± 3% +1.58% (p=0.000 n=31+40) RegexpMatchEasy1_32-4 967ns ± 4% 981ns ± 3% +1.40% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 6.41µs ± 4% 6.43µs ± 3% ~ (p=0.386 n=40+40) RegexpMatchMedium_32-4 1.76µs ± 3% 1.78µs ± 3% +1.08% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 561µs ± 0% 562µs ± 0% +0.09% (p=0.003 n=34+31) RegexpMatchHard_32-4 31.5µs ± 2% 31.1µs ± 4% -1.17% (p=0.000 n=30+40) RegexpMatchHard_1K-4 960µs ± 3% 950µs ± 4% -1.02% (p=0.016 n=40+40) Revcomp-4 7.79s ± 7% 7.79s ± 4% ~ (p=0.859 n=40+40) Template-4 889ms ± 6% 872ms ± 3% -1.86% (p=0.025 n=40+31) TimeParse-4 4.80µs ± 0% 4.89µs ± 3% +1.71% (p=0.001 n=31+40) TimeFormat-4 4.70µs ± 1% 4.78µs ± 3% +1.57% (p=0.000 n=33+40) [Geo mean] 710µs 709µs -0.13% name old speed new speed delta GobDecode-4 7.96MB/s ± 4% 7.96MB/s ± 9% ~ (p=0.174 n=40+40) GobEncode-4 9.65MB/s ± 0% 9.51MB/s ± 4% -1.45% (p=0.000 n=34+40) Gzip-4 4.29MB/s ± 4% 4.26MB/s ± 4% -0.59% (p=0.000 n=40+40) Gunzip-4 43.0MB/s ± 3% 43.9MB/s ± 0% +1.90% (p=0.000 n=40+32) JSONEncode-4 9.09MB/s ± 4% 9.22MB/s ± 0% ~ (p=0.429 n=40+31) JSONDecode-4 2.25MB/s ± 5% 2.25MB/s ± 4% ~ (p=0.278 n=40+40) GoParse-4 1.35MB/s ± 7% 1.37MB/s ± 0% ~ (p=0.071 n=40+25) RegexpMatchEasy0_32-4 31.5MB/s ± 3% 31.5MB/s ± 4% -0.08% (p=0.018 n=40+40) RegexpMatchEasy0_1K-4 263MB/s ± 0% 259MB/s ± 3% -1.51% (p=0.000 n=31+40) RegexpMatchEasy1_32-4 33.1MB/s ± 4% 32.6MB/s ± 3% -1.38% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 160MB/s ± 4% 159MB/s ± 3% ~ (p=0.364 n=40+40) RegexpMatchMedium_32-4 565kB/s ± 3% 562kB/s ± 2% ~ (p=0.208 n=40+40) RegexpMatchMedium_1K-4 1.82MB/s ± 0% 1.82MB/s ± 0% -0.27% (p=0.000 n=34+31) RegexpMatchHard_32-4 1.02MB/s ± 3% 1.03MB/s ± 4% +1.04% (p=0.000 n=32+40) RegexpMatchHard_1K-4 1.07MB/s ± 4% 1.08MB/s ± 4% +0.94% (p=0.003 n=40+40) Revcomp-4 32.6MB/s ± 7% 32.6MB/s ± 4% ~ (p=0.965 n=40+40) Template-4 2.18MB/s ± 6% 2.22MB/s ± 3% +1.83% (p=0.020 n=40+31) [Geo mean] 7.77MB/s 7.78MB/s +0.16% 3. There is little change in the compilecmp benchmark (excluding noise). name old time/op new time/op delta Template 2.37s ± 3% 2.35s ± 4% ~ (p=0.529 n=10+10) Unicode 1.38s ± 8% 1.36s ± 5% ~ (p=0.247 n=10+10) GoTypes 8.10s ± 2% 8.10s ± 2% ~ (p=0.971 n=10+10) Compiler 40.5s ± 4% 40.8s ± 1% ~ (p=0.529 n=10+10) SSA 115s ± 2% 115s ± 3% ~ (p=0.684 n=10+10) Flate 1.45s ± 5% 1.46s ± 3% ~ (p=0.796 n=10+10) GoParser 1.86s ± 4% 1.84s ± 2% ~ (p=0.095 n=9+10) Reflect 5.11s ± 2% 5.13s ± 2% ~ (p=0.315 n=10+10) Tar 2.22s ± 3% 2.23s ± 1% ~ (p=0.299 n=9+7) XML 2.72s ± 3% 2.72s ± 3% ~ (p=0.912 n=10+10) [Geo mean] 5.03s 5.02s -0.21% name old user-time/op new user-time/op delta Template 2.92s ± 2% 2.89s ± 1% ~ (p=0.247 n=10+10) Unicode 1.71s ± 5% 1.69s ± 4% ~ (p=0.393 n=10+10) GoTypes 9.78s ± 2% 9.76s ± 2% ~ (p=0.631 n=10+10) Compiler 49.1s ± 2% 49.1s ± 1% ~ (p=0.796 n=10+10) SSA 144s ± 1% 144s ± 2% ~ (p=0.796 n=10+10) Flate 1.74s ± 2% 1.73s ± 3% ~ (p=0.842 n=10+9) GoParser 2.23s ± 3% 2.25s ± 2% ~ (p=0.143 n=10+10) Reflect 5.93s ± 3% 5.98s ± 2% ~ (p=0.211 n=10+9) Tar 2.65s ± 2% 2.69s ± 3% +1.51% (p=0.010 n=9+10) XML 3.25s ± 2% 3.21s ± 1% -1.24% (p=0.035 n=10+9) [Geo mean] 6.07s 6.07s -0.08% name old text-bytes new text-bytes delta HelloSize 641kB ± 0% 641kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 9.46kB ± 0% 9.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 125kB ± 0% 125kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.24MB ± 0% 1.24MB ± 0% ~ (all equal) Change-Id: Id095d998c380eef929755124084df02446a6b7c1 Reviewed-on: https://go-review.googlesource.com/92555 Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot Reviewed-by: Cherry Zhang --- src/cmd/compile/internal/arm64/ssa.go | 2 + src/cmd/compile/internal/ssa/gen/ARM64.rules | 10 + src/cmd/compile/internal/ssa/gen/ARM64Ops.go | 18 +- src/cmd/compile/internal/ssa/opGen.go | 32 +++ src/cmd/compile/internal/ssa/rewriteARM64.go | 222 +++++++++++++++++++ 5 files changed, 276 insertions(+), 8 deletions(-) diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index ffb37ba705c..80dbfe113b7 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -171,6 +171,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ssa.OpARM64FSUBD, ssa.OpARM64FMULS, ssa.OpARM64FMULD, + ssa.OpARM64FNMULS, + ssa.OpARM64FNMULD, ssa.OpARM64FDIVS, ssa.OpARM64FDIVD: r := v.Reg() diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules index ba994479c73..888f5f85564 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64.rules +++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules @@ -1373,3 +1373,13 @@ && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5) && clobber(s0) -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV (MOVDload {s} (OffPtr [i0] p) mem)) + +// FP simplification +(FNEGS (FMULS x y)) -> (FNMULS x y) +(FNEGD (FMULD x y)) -> (FNMULD x y) +(FMULS (FNEGS x) y) -> (FNMULS x y) +(FMULD (FNEGD x) y) -> (FNMULD x y) +(FNEGS (FNMULS x y)) -> (FMULS x y) +(FNEGD (FNMULD x y)) -> (FMULD x y) +(FNMULS (FNEGS x) y) -> (FMULS x y) +(FNMULD (FNEGD x) y) -> (FMULD x y) diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go index 5764d6bb37d..583599186c3 100644 --- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go @@ -178,14 +178,16 @@ func init() { {name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit {name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit - {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1 - {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1 - {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1 - {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1 - {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1 - {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1 - {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1 - {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1 + {name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1 + {name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1 + {name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1 + {name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1 + {name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1 + {name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1 + {name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1) + {name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1) + {name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1 + {name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1 {name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1 {name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 5075c1cc239..0c2b8f61c6d 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -975,6 +975,8 @@ const ( OpARM64FSUBD OpARM64FMULS OpARM64FMULD + OpARM64FNMULS + OpARM64FNMULD OpARM64FDIVS OpARM64FDIVD OpARM64AND @@ -12359,6 +12361,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "FNMULS", + argLen: 2, + commutative: true, + asm: arm64.AFNMULS, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "FNMULD", + argLen: 2, + commutative: true, + asm: arm64.AFNMULD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "FDIVS", argLen: 2, diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go index 67b6d2fd203..05974dab4c8 100644 --- a/src/cmd/compile/internal/ssa/rewriteARM64.go +++ b/src/cmd/compile/internal/ssa/rewriteARM64.go @@ -77,6 +77,18 @@ func rewriteValueARM64(v *Value) bool { return rewriteValueARM64_OpARM64FMOVSload_0(v) case OpARM64FMOVSstore: return rewriteValueARM64_OpARM64FMOVSstore_0(v) + case OpARM64FMULD: + return rewriteValueARM64_OpARM64FMULD_0(v) + case OpARM64FMULS: + return rewriteValueARM64_OpARM64FMULS_0(v) + case OpARM64FNEGD: + return rewriteValueARM64_OpARM64FNEGD_0(v) + case OpARM64FNEGS: + return rewriteValueARM64_OpARM64FNEGS_0(v) + case OpARM64FNMULD: + return rewriteValueARM64_OpARM64FNMULD_0(v) + case OpARM64FNMULS: + return rewriteValueARM64_OpARM64FNMULS_0(v) case OpARM64GreaterEqual: return rewriteValueARM64_OpARM64GreaterEqual_0(v) case OpARM64GreaterEqualU: @@ -3028,6 +3040,216 @@ func rewriteValueARM64_OpARM64FMOVSstore_0(v *Value) bool { } return false } +func rewriteValueARM64_OpARM64FMULD_0(v *Value) bool { + // match: (FMULD (FNEGD x) y) + // cond: + // result: (FNMULD x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNEGD { + break + } + x := v_0.Args[0] + y := v.Args[1] + v.reset(OpARM64FNMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FMULD y (FNEGD x)) + // cond: + // result: (FNMULD x y) + for { + _ = v.Args[1] + y := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNEGD { + break + } + x := v_1.Args[0] + v.reset(OpARM64FNMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FMULS_0(v *Value) bool { + // match: (FMULS (FNEGS x) y) + // cond: + // result: (FNMULS x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNEGS { + break + } + x := v_0.Args[0] + y := v.Args[1] + v.reset(OpARM64FNMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FMULS y (FNEGS x)) + // cond: + // result: (FNMULS x y) + for { + _ = v.Args[1] + y := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNEGS { + break + } + x := v_1.Args[0] + v.reset(OpARM64FNMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FNEGD_0(v *Value) bool { + // match: (FNEGD (FMULD x y)) + // cond: + // result: (FNMULD x y) + for { + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + v.reset(OpARM64FNMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FNEGD (FNMULD x y)) + // cond: + // result: (FMULD x y) + for { + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULD { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + v.reset(OpARM64FMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FNEGS_0(v *Value) bool { + // match: (FNEGS (FMULS x y)) + // cond: + // result: (FNMULS x y) + for { + v_0 := v.Args[0] + if v_0.Op != OpARM64FMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + v.reset(OpARM64FNMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FNEGS (FNMULS x y)) + // cond: + // result: (FMULS x y) + for { + v_0 := v.Args[0] + if v_0.Op != OpARM64FNMULS { + break + } + _ = v_0.Args[1] + x := v_0.Args[0] + y := v_0.Args[1] + v.reset(OpARM64FMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FNMULD_0(v *Value) bool { + // match: (FNMULD (FNEGD x) y) + // cond: + // result: (FMULD x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNEGD { + break + } + x := v_0.Args[0] + y := v.Args[1] + v.reset(OpARM64FMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FNMULD y (FNEGD x)) + // cond: + // result: (FMULD x y) + for { + _ = v.Args[1] + y := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNEGD { + break + } + x := v_1.Args[0] + v.reset(OpARM64FMULD) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} +func rewriteValueARM64_OpARM64FNMULS_0(v *Value) bool { + // match: (FNMULS (FNEGS x) y) + // cond: + // result: (FMULS x y) + for { + _ = v.Args[1] + v_0 := v.Args[0] + if v_0.Op != OpARM64FNEGS { + break + } + x := v_0.Args[0] + y := v.Args[1] + v.reset(OpARM64FMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + // match: (FNMULS y (FNEGS x)) + // cond: + // result: (FMULS x y) + for { + _ = v.Args[1] + y := v.Args[0] + v_1 := v.Args[1] + if v_1.Op != OpARM64FNEGS { + break + } + x := v_1.Args[0] + v.reset(OpARM64FMULS) + v.AddArg(x) + v.AddArg(y) + return true + } + return false +} func rewriteValueARM64_OpARM64GreaterEqual_0(v *Value) bool { // match: (GreaterEqual (FlagEQ)) // cond: