1
0
mirror of https://github.com/golang/go synced 2024-11-23 15:20:03 -07:00

cmd/compile/internal/ssa: optimize arm64 with FNMULS/FNMULD

FNMULS&FNMULD are efficient arm64 instructions, which can be used
to improve FP performance. This CL use them to optimize pairs of neg-mul
operations.

Here are benchmark test results on Raspberry Pi 3 with ArchLinux.

1. A special test case gets about 15% improvement.
(https://github.com/benshi001/ugo1/blob/master/fpmul_test.go)
FPMul-4                     485µs ± 0%     410µs ± 0%  -15.49%  (p=0.000 n=26+23)

2. There is little regression in the go1 benchmark (excluding noise).
name                     old time/op    new time/op    delta
BinaryTree17-4              42.0s ± 3%     42.1s ± 2%    ~     (p=0.542 n=39+40)
Fannkuch11-4                33.3s ± 3%     32.9s ± 1%    ~     (p=0.200 n=40+32)
FmtFprintfEmpty-4           534ns ± 0%     534ns ± 0%    ~     (all equal)
FmtFprintfString-4         1.09µs ± 1%    1.09µs ± 0%    ~     (p=0.950 n=32+32)
FmtFprintfInt-4            1.14µs ± 0%    1.14µs ± 1%    ~     (p=0.571 n=32+31)
FmtFprintfIntInt-4         1.79µs ± 3%    1.76µs ± 0%  -1.42%  (p=0.004 n=40+34)
FmtFprintfPrefixedInt-4    2.17µs ± 0%    2.17µs ± 0%    ~     (p=0.073 n=31+34)
FmtFprintfFloat-4          3.33µs ± 3%    3.28µs ± 0%  -1.46%  (p=0.001 n=40+34)
FmtManyArgs-4              7.28µs ± 6%    7.19µs ± 0%    ~     (p=0.641 n=40+33)
GobDecode-4                96.5ms ± 4%    96.5ms ± 9%    ~     (p=0.214 n=40+40)
GobEncode-4                79.5ms ± 0%    80.7ms ± 4%  +1.51%  (p=0.000 n=34+40)
Gzip-4                      4.53s ± 4%     4.56s ± 4%  +0.60%  (p=0.000 n=40+40)
Gunzip-4                    451ms ± 3%     442ms ± 0%  -1.93%  (p=0.000 n=40+32)
HTTPClientServer-4          530µs ± 1%     535µs ± 1%  +0.88%  (p=0.000 n=39+39)
JSONEncode-4                214ms ± 4%     211ms ± 0%    ~     (p=0.059 n=40+31)
JSONDecode-4                865ms ± 5%     864ms ± 4%  -0.06%  (p=0.003 n=40+40)
Mandelbrot200-4            52.0ms ± 3%    52.1ms ± 3%    ~     (p=0.556 n=40+40)
GoParse-4                  43.1ms ± 8%    42.1ms ± 0%    ~     (p=0.083 n=40+33)
RegexpMatchEasy0_32-4      1.02µs ± 3%    1.02µs ± 4%  +0.06%  (p=0.020 n=40+40)
RegexpMatchEasy0_1K-4      3.90µs ± 0%    3.96µs ± 3%  +1.58%  (p=0.000 n=31+40)
RegexpMatchEasy1_32-4       967ns ± 4%     981ns ± 3%  +1.40%  (p=0.000 n=40+40)
RegexpMatchEasy1_1K-4      6.41µs ± 4%    6.43µs ± 3%    ~     (p=0.386 n=40+40)
RegexpMatchMedium_32-4     1.76µs ± 3%    1.78µs ± 3%  +1.08%  (p=0.000 n=40+40)
RegexpMatchMedium_1K-4      561µs ± 0%     562µs ± 0%  +0.09%  (p=0.003 n=34+31)
RegexpMatchHard_32-4       31.5µs ± 2%    31.1µs ± 4%  -1.17%  (p=0.000 n=30+40)
RegexpMatchHard_1K-4        960µs ± 3%     950µs ± 4%  -1.02%  (p=0.016 n=40+40)
Revcomp-4                   7.79s ± 7%     7.79s ± 4%    ~     (p=0.859 n=40+40)
Template-4                  889ms ± 6%     872ms ± 3%  -1.86%  (p=0.025 n=40+31)
TimeParse-4                4.80µs ± 0%    4.89µs ± 3%  +1.71%  (p=0.001 n=31+40)
TimeFormat-4               4.70µs ± 1%    4.78µs ± 3%  +1.57%  (p=0.000 n=33+40)
[Geo mean]                  710µs          709µs       -0.13%

name                     old speed      new speed      delta
GobDecode-4              7.96MB/s ± 4%  7.96MB/s ± 9%    ~     (p=0.174 n=40+40)
GobEncode-4              9.65MB/s ± 0%  9.51MB/s ± 4%  -1.45%  (p=0.000 n=34+40)
Gzip-4                   4.29MB/s ± 4%  4.26MB/s ± 4%  -0.59%  (p=0.000 n=40+40)
Gunzip-4                 43.0MB/s ± 3%  43.9MB/s ± 0%  +1.90%  (p=0.000 n=40+32)
JSONEncode-4             9.09MB/s ± 4%  9.22MB/s ± 0%    ~     (p=0.429 n=40+31)
JSONDecode-4             2.25MB/s ± 5%  2.25MB/s ± 4%    ~     (p=0.278 n=40+40)
GoParse-4                1.35MB/s ± 7%  1.37MB/s ± 0%    ~     (p=0.071 n=40+25)
RegexpMatchEasy0_32-4    31.5MB/s ± 3%  31.5MB/s ± 4%  -0.08%  (p=0.018 n=40+40)
RegexpMatchEasy0_1K-4     263MB/s ± 0%   259MB/s ± 3%  -1.51%  (p=0.000 n=31+40)
RegexpMatchEasy1_32-4    33.1MB/s ± 4%  32.6MB/s ± 3%  -1.38%  (p=0.000 n=40+40)
RegexpMatchEasy1_1K-4     160MB/s ± 4%   159MB/s ± 3%    ~     (p=0.364 n=40+40)
RegexpMatchMedium_32-4    565kB/s ± 3%   562kB/s ± 2%    ~     (p=0.208 n=40+40)
RegexpMatchMedium_1K-4   1.82MB/s ± 0%  1.82MB/s ± 0%  -0.27%  (p=0.000 n=34+31)
RegexpMatchHard_32-4     1.02MB/s ± 3%  1.03MB/s ± 4%  +1.04%  (p=0.000 n=32+40)
RegexpMatchHard_1K-4     1.07MB/s ± 4%  1.08MB/s ± 4%  +0.94%  (p=0.003 n=40+40)
Revcomp-4                32.6MB/s ± 7%  32.6MB/s ± 4%    ~     (p=0.965 n=40+40)
Template-4               2.18MB/s ± 6%  2.22MB/s ± 3%  +1.83%  (p=0.020 n=40+31)
[Geo mean]               7.77MB/s       7.78MB/s       +0.16%

3. There is little change in the compilecmp benchmark (excluding noise).
name        old time/op       new time/op       delta
Template          2.37s ± 3%        2.35s ± 4%    ~     (p=0.529 n=10+10)
Unicode           1.38s ± 8%        1.36s ± 5%    ~     (p=0.247 n=10+10)
GoTypes           8.10s ± 2%        8.10s ± 2%    ~     (p=0.971 n=10+10)
Compiler          40.5s ± 4%        40.8s ± 1%    ~     (p=0.529 n=10+10)
SSA                115s ± 2%         115s ± 3%    ~     (p=0.684 n=10+10)
Flate             1.45s ± 5%        1.46s ± 3%    ~     (p=0.796 n=10+10)
GoParser          1.86s ± 4%        1.84s ± 2%    ~     (p=0.095 n=9+10)
Reflect           5.11s ± 2%        5.13s ± 2%    ~     (p=0.315 n=10+10)
Tar               2.22s ± 3%        2.23s ± 1%    ~     (p=0.299 n=9+7)
XML               2.72s ± 3%        2.72s ± 3%    ~     (p=0.912 n=10+10)
[Geo mean]        5.03s             5.02s       -0.21%

name        old user-time/op  new user-time/op  delta
Template          2.92s ± 2%        2.89s ± 1%    ~     (p=0.247 n=10+10)
Unicode           1.71s ± 5%        1.69s ± 4%    ~     (p=0.393 n=10+10)
GoTypes           9.78s ± 2%        9.76s ± 2%    ~     (p=0.631 n=10+10)
Compiler          49.1s ± 2%        49.1s ± 1%    ~     (p=0.796 n=10+10)
SSA                144s ± 1%         144s ± 2%    ~     (p=0.796 n=10+10)
Flate             1.74s ± 2%        1.73s ± 3%    ~     (p=0.842 n=10+9)
GoParser          2.23s ± 3%        2.25s ± 2%    ~     (p=0.143 n=10+10)
Reflect           5.93s ± 3%        5.98s ± 2%    ~     (p=0.211 n=10+9)
Tar               2.65s ± 2%        2.69s ± 3%  +1.51%  (p=0.010 n=9+10)
XML               3.25s ± 2%        3.21s ± 1%  -1.24%  (p=0.035 n=10+9)
[Geo mean]        6.07s             6.07s       -0.08%

name        old text-bytes    new text-bytes    delta
HelloSize         641kB ± 0%        641kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        9.46kB ± 0%       9.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize         125kB ± 0%        125kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.24MB ± 0%       1.24MB ± 0%    ~     (all equal)

Change-Id: Id095d998c380eef929755124084df02446a6b7c1
Reviewed-on: https://go-review.googlesource.com/92555
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
Ben Shi 2018-02-07 12:24:41 +00:00 committed by Brad Fitzpatrick
parent 3773cbba72
commit ebb77aa867
5 changed files with 276 additions and 8 deletions

View File

@ -171,6 +171,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssa.OpARM64FSUBD,
ssa.OpARM64FMULS,
ssa.OpARM64FMULD,
ssa.OpARM64FNMULS,
ssa.OpARM64FNMULD,
ssa.OpARM64FDIVS,
ssa.OpARM64FDIVD:
r := v.Reg()

View File

@ -1373,3 +1373,13 @@
&& clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3)
&& clobber(o4) && clobber(o5) && clobber(s0)
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem))
// FP simplification
(FNEGS (FMULS x y)) -> (FNMULS x y)
(FNEGD (FMULD x y)) -> (FNMULD x y)
(FMULS (FNEGS x) y) -> (FNMULS x y)
(FMULD (FNEGD x) y) -> (FNMULD x y)
(FNEGS (FNMULS x y)) -> (FMULS x y)
(FNEGD (FNMULD x y)) -> (FMULD x y)
(FNMULS (FNEGS x) y) -> (FMULS x y)
(FNMULD (FNEGD x) y) -> (FMULD x y)

View File

@ -178,14 +178,16 @@ func init() {
{name: "MODW", argLength: 2, reg: gp21, asm: "REMW"}, // arg0 % arg1, signed, 32 bit
{name: "UMODW", argLength: 2, reg: gp21, asm: "UREMW"}, // arg0 % arg1, unsigned, 32 bit
{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
{name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1
{name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1
{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
{name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1
{name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1
{name: "FADDS", argLength: 2, reg: fp21, asm: "FADDS", commutative: true}, // arg0 + arg1
{name: "FADDD", argLength: 2, reg: fp21, asm: "FADDD", commutative: true}, // arg0 + arg1
{name: "FSUBS", argLength: 2, reg: fp21, asm: "FSUBS"}, // arg0 - arg1
{name: "FSUBD", argLength: 2, reg: fp21, asm: "FSUBD"}, // arg0 - arg1
{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0 * arg1
{name: "FMULD", argLength: 2, reg: fp21, asm: "FMULD", commutative: true}, // arg0 * arg1
{name: "FNMULS", argLength: 2, reg: fp21, asm: "FNMULS", commutative: true}, // -(arg0 * arg1)
{name: "FNMULD", argLength: 2, reg: fp21, asm: "FNMULD", commutative: true}, // -(arg0 * arg1)
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0 / arg1
{name: "FDIVD", argLength: 2, reg: fp21, asm: "FDIVD"}, // arg0 / arg1
{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt

View File

@ -975,6 +975,8 @@ const (
OpARM64FSUBD
OpARM64FMULS
OpARM64FMULD
OpARM64FNMULS
OpARM64FNMULD
OpARM64FDIVS
OpARM64FDIVD
OpARM64AND
@ -12359,6 +12361,36 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "FNMULS",
argLen: 2,
commutative: true,
asm: arm64.AFNMULS,
reg: regInfo{
inputs: []inputInfo{
{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FNMULD",
argLen: 2,
commutative: true,
asm: arm64.AFNMULD,
reg: regInfo{
inputs: []inputInfo{
{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
{1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
outputs: []outputInfo{
{0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
},
},
},
{
name: "FDIVS",
argLen: 2,

View File

@ -77,6 +77,18 @@ func rewriteValueARM64(v *Value) bool {
return rewriteValueARM64_OpARM64FMOVSload_0(v)
case OpARM64FMOVSstore:
return rewriteValueARM64_OpARM64FMOVSstore_0(v)
case OpARM64FMULD:
return rewriteValueARM64_OpARM64FMULD_0(v)
case OpARM64FMULS:
return rewriteValueARM64_OpARM64FMULS_0(v)
case OpARM64FNEGD:
return rewriteValueARM64_OpARM64FNEGD_0(v)
case OpARM64FNEGS:
return rewriteValueARM64_OpARM64FNEGS_0(v)
case OpARM64FNMULD:
return rewriteValueARM64_OpARM64FNMULD_0(v)
case OpARM64FNMULS:
return rewriteValueARM64_OpARM64FNMULS_0(v)
case OpARM64GreaterEqual:
return rewriteValueARM64_OpARM64GreaterEqual_0(v)
case OpARM64GreaterEqualU:
@ -3028,6 +3040,216 @@ func rewriteValueARM64_OpARM64FMOVSstore_0(v *Value) bool {
}
return false
}
func rewriteValueARM64_OpARM64FMULD_0(v *Value) bool {
// match: (FMULD (FNEGD x) y)
// cond:
// result: (FNMULD x y)
for {
_ = v.Args[1]
v_0 := v.Args[0]
if v_0.Op != OpARM64FNEGD {
break
}
x := v_0.Args[0]
y := v.Args[1]
v.reset(OpARM64FNMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FMULD y (FNEGD x))
// cond:
// result: (FNMULD x y)
for {
_ = v.Args[1]
y := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64FNEGD {
break
}
x := v_1.Args[0]
v.reset(OpARM64FNMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64FMULS_0(v *Value) bool {
// match: (FMULS (FNEGS x) y)
// cond:
// result: (FNMULS x y)
for {
_ = v.Args[1]
v_0 := v.Args[0]
if v_0.Op != OpARM64FNEGS {
break
}
x := v_0.Args[0]
y := v.Args[1]
v.reset(OpARM64FNMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FMULS y (FNEGS x))
// cond:
// result: (FNMULS x y)
for {
_ = v.Args[1]
y := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64FNEGS {
break
}
x := v_1.Args[0]
v.reset(OpARM64FNMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64FNEGD_0(v *Value) bool {
// match: (FNEGD (FMULD x y))
// cond:
// result: (FNMULD x y)
for {
v_0 := v.Args[0]
if v_0.Op != OpARM64FMULD {
break
}
_ = v_0.Args[1]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpARM64FNMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FNEGD (FNMULD x y))
// cond:
// result: (FMULD x y)
for {
v_0 := v.Args[0]
if v_0.Op != OpARM64FNMULD {
break
}
_ = v_0.Args[1]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpARM64FMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64FNEGS_0(v *Value) bool {
// match: (FNEGS (FMULS x y))
// cond:
// result: (FNMULS x y)
for {
v_0 := v.Args[0]
if v_0.Op != OpARM64FMULS {
break
}
_ = v_0.Args[1]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpARM64FNMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FNEGS (FNMULS x y))
// cond:
// result: (FMULS x y)
for {
v_0 := v.Args[0]
if v_0.Op != OpARM64FNMULS {
break
}
_ = v_0.Args[1]
x := v_0.Args[0]
y := v_0.Args[1]
v.reset(OpARM64FMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64FNMULD_0(v *Value) bool {
// match: (FNMULD (FNEGD x) y)
// cond:
// result: (FMULD x y)
for {
_ = v.Args[1]
v_0 := v.Args[0]
if v_0.Op != OpARM64FNEGD {
break
}
x := v_0.Args[0]
y := v.Args[1]
v.reset(OpARM64FMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FNMULD y (FNEGD x))
// cond:
// result: (FMULD x y)
for {
_ = v.Args[1]
y := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64FNEGD {
break
}
x := v_1.Args[0]
v.reset(OpARM64FMULD)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64FNMULS_0(v *Value) bool {
// match: (FNMULS (FNEGS x) y)
// cond:
// result: (FMULS x y)
for {
_ = v.Args[1]
v_0 := v.Args[0]
if v_0.Op != OpARM64FNEGS {
break
}
x := v_0.Args[0]
y := v.Args[1]
v.reset(OpARM64FMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
// match: (FNMULS y (FNEGS x))
// cond:
// result: (FMULS x y)
for {
_ = v.Args[1]
y := v.Args[0]
v_1 := v.Args[1]
if v_1.Op != OpARM64FNEGS {
break
}
x := v_1.Args[0]
v.reset(OpARM64FMULS)
v.AddArg(x)
v.AddArg(y)
return true
}
return false
}
func rewriteValueARM64_OpARM64GreaterEqual_0(v *Value) bool {
// match: (GreaterEqual (FlagEQ))
// cond: