1
0
mirror of https://github.com/golang/go synced 2024-10-02 04:18:33 -06:00

cmd/compile: improve absorb shifts optimization for arm64

Current absorb shifts optimization can generate dead Value nodes which increase
use count of other live nodes. It will impact other optimizations (such as
combined loads) which are enabled based on specific use count. This patch fixes
the issue by decreasing the use count of nodes referenced by dead Value nodes
generated by absorb shifts optimization.

Performance impacts on go1 benchmarks (data collected on A57@2GHzx8):

name                     old time/op    new time/op    delta
BinaryTree17-8              6.28s ± 2%     6.24s ± 1%     ~     (p=0.065 n=10+9)
Fannkuch11-8                6.32s ± 0%     6.33s ± 0%   +0.17%  (p=0.000 n=10+10)
FmtFprintfEmpty-8          98.9ns ± 0%    99.2ns ± 0%   +0.34%  (p=0.000 n=9+7)
FmtFprintfString-8          183ns ± 1%     182ns ± 1%   -1.01%  (p=0.005 n=9+10)
FmtFprintfInt-8             199ns ± 1%     202ns ± 1%   +1.41%  (p=0.000 n=10+9)
FmtFprintfIntInt-8          272ns ± 1%     276ns ± 3%   +1.36%  (p=0.015 n=10+10)
FmtFprintfPrefixedInt-8     367ns ± 1%     369ns ± 1%   +0.68%  (p=0.042 n=10+10)
FmtFprintfFloat-8           491ns ± 1%     493ns ± 1%     ~     (p=0.064 n=10+10)
FmtManyArgs-8              1.31µs ± 1%    1.32µs ± 1%   +0.39%  (p=0.042 n=8+9)
GobDecode-8                17.0ms ± 2%    16.2ms ± 2%   -4.74%  (p=0.000 n=10+10)
GobEncode-8                13.7ms ± 2%    13.4ms ± 1%   -2.40%  (p=0.000 n=10+9)
Gzip-8                      844ms ± 0%     737ms ± 0%  -12.70%  (p=0.000 n=10+10)
Gunzip-8                   84.4ms ± 1%    83.9ms ± 0%   -0.55%  (p=0.000 n=10+8)
HTTPClientServer-8          122µs ± 1%     124µs ± 1%   +1.75%  (p=0.000 n=10+9)
JSONEncode-8               34.9ms ± 1%    32.4ms ± 0%   -7.11%  (p=0.000 n=10+9)
JSONDecode-8                150ms ± 0%     146ms ± 1%   -2.84%  (p=0.000 n=7+10)
Mandelbrot200-8            10.0ms ± 0%    10.0ms ± 0%     ~     (p=0.529 n=10+10)
GoParse-8                  8.18ms ± 1%    8.03ms ± 0%   -1.93%  (p=0.000 n=10+10)
RegexpMatchEasy0_32-8       209ns ± 0%     209ns ± 0%     ~     (p=0.248 n=10+9)
RegexpMatchEasy0_1K-8       789ns ± 1%     790ns ± 0%     ~     (p=0.361 n=10+10)
RegexpMatchEasy1_32-8       202ns ± 0%     202ns ± 1%     ~     (p=0.137 n=8+10)
RegexpMatchEasy1_1K-8      1.12µs ± 2%    1.12µs ± 1%     ~     (p=0.810 n=10+10)
RegexpMatchMedium_32-8      298ns ± 0%     298ns ± 0%     ~     (p=0.443 n=10+9)
RegexpMatchMedium_1K-8     83.0µs ± 5%    78.6µs ± 0%   -5.37%  (p=0.000 n=10+10)
RegexpMatchHard_32-8       4.32µs ± 0%    4.26µs ± 0%   -1.47%  (p=0.000 n=10+10)
RegexpMatchHard_1K-8        132µs ± 4%     126µs ± 0%   -4.41%  (p=0.000 n=10+9)
Revcomp-8                   1.11s ± 0%     1.11s ± 0%   +0.14%  (p=0.017 n=10+9)
Template-8                  155ms ± 1%     155ms ± 1%     ~     (p=0.796 n=10+10)
TimeParse-8                 774ns ± 1%     785ns ± 1%   +1.41%  (p=0.001 n=10+10)
TimeFormat-8                788ns ± 1%     806ns ± 1%   +2.24%  (p=0.000 n=10+9)

name                     old speed      new speed      delta
GobDecode-8              45.2MB/s ± 2%  47.5MB/s ± 2%   +4.96%  (p=0.000 n=10+10)
GobEncode-8              56.0MB/s ± 2%  57.4MB/s ± 1%   +2.44%  (p=0.000 n=10+9)
Gzip-8                   23.0MB/s ± 0%  26.3MB/s ± 0%  +14.55%  (p=0.000 n=10+10)
Gunzip-8                  230MB/s ± 1%   231MB/s ± 0%   +0.55%  (p=0.000 n=10+8)
JSONEncode-8             55.6MB/s ± 1%  59.9MB/s ± 0%   +7.65%  (p=0.000 n=10+9)
JSONDecode-8             12.9MB/s ± 0%  13.3MB/s ± 1%   +2.94%  (p=0.000 n=7+10)
GoParse-8                7.08MB/s ± 1%  7.22MB/s ± 0%   +1.95%  (p=0.000 n=10+10)
RegexpMatchEasy0_32-8     153MB/s ± 0%   153MB/s ± 0%   -0.16%  (p=0.023 n=10+10)
RegexpMatchEasy0_1K-8    1.30GB/s ± 1%  1.30GB/s ± 0%     ~     (p=0.393 n=10+10)
RegexpMatchEasy1_32-8     158MB/s ± 0%   158MB/s ± 0%     ~     (p=0.684 n=10+10)
RegexpMatchEasy1_1K-8     915MB/s ± 2%   918MB/s ± 1%     ~     (p=0.796 n=10+10)
RegexpMatchMedium_32-8   3.35MB/s ± 0%  3.35MB/s ± 0%     ~     (p=1.000 n=10+9)
RegexpMatchMedium_1K-8   12.3MB/s ± 5%  13.0MB/s ± 0%   +5.56%  (p=0.000 n=10+10)
RegexpMatchHard_32-8     7.40MB/s ± 0%  7.51MB/s ± 0%   +1.50%  (p=0.000 n=10+10)
RegexpMatchHard_1K-8     7.75MB/s ± 4%  8.10MB/s ± 0%   +4.52%  (p=0.000 n=10+8)
Revcomp-8                 229MB/s ± 0%   228MB/s ± 0%   -0.14%  (p=0.017 n=10+9)
Template-8               12.5MB/s ± 1%  12.5MB/s ± 1%     ~     (p=0.780 n=10+10)

Change-Id: I103389f168eac79f6af44e8fef93acc2a7a4ac96
Reviewed-on: https://go-review.googlesource.com/88415
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
Chad Rosier 2018-02-15 14:49:03 -05:00 committed by Cherry Zhang
parent b3cb740be5
commit 51932c326f
4 changed files with 542 additions and 349 deletions

View File

@ -248,7 +248,7 @@ var allAsmTests = []*asmTests{
{
arch: "arm64",
os: "linux",
imports: []string{"math/bits"},
imports: []string{"encoding/binary", "math/bits"},
tests: linuxARM64Tests,
},
{
@ -2751,6 +2751,80 @@ var linuxARM64Tests = []*asmTest{
`,
pos: []string{"TBZ"},
},
// Load-combining tests.
{
fn: `
func $(b []byte) uint64 {
return binary.LittleEndian.Uint64(b)
}
`,
pos: []string{"\tMOVD\t\\(R[0-9]+\\)"},
},
{
fn: `
func $(b []byte, i int) uint64 {
return binary.LittleEndian.Uint64(b[i:])
}
`,
pos: []string{"\tMOVD\t\\(R[0-9]+\\)"},
},
{
fn: `
func $(b []byte) uint32 {
return binary.LittleEndian.Uint32(b)
}
`,
pos: []string{"\tMOVWU\t\\(R[0-9]+\\)"},
},
{
fn: `
func $(b []byte, i int) uint32 {
return binary.LittleEndian.Uint32(b[i:])
}
`,
pos: []string{"\tMOVWU\t\\(R[0-9]+\\)"},
},
{
fn: `
func $(b []byte) uint64 {
return binary.BigEndian.Uint64(b)
}
`,
pos: []string{"\tREV\t"},
},
{
fn: `
func $(b []byte, i int) uint64 {
return binary.BigEndian.Uint64(b[i:])
}
`,
pos: []string{"\tREV\t"},
},
{
fn: `
func $(b []byte) uint32 {
return binary.BigEndian.Uint32(b)
}
`,
pos: []string{"\tREVW\t"},
},
{
fn: `
func $(b []byte, i int) uint32 {
return binary.BigEndian.Uint32(b[i:])
}
`,
pos: []string{"\tREVW\t"},
},
{
fn: `
func $(s []byte) uint16 {
return uint16(s[0]) | uint16(s[1]) << 8
}
`,
pos: []string{"\tMOVHU\t\\(R[0-9]+\\)"},
neg: []string{"ORR\tR[0-9]+<<8\t"},
},
}
var linuxMIPSTests = []*asmTest{

View File

@ -1078,30 +1078,30 @@
(CSELULT0 _ (FlagGT_UGT)) -> (MOVDconst [0])
// absorb shifts into ops
(ADD x (SLLconst [c] y)) -> (ADDshiftLL x y [c])
(ADD x (SRLconst [c] y)) -> (ADDshiftRL x y [c])
(ADD x (SRAconst [c] y)) -> (ADDshiftRA x y [c])
(SUB x (SLLconst [c] y)) -> (SUBshiftLL x y [c])
(SUB x (SRLconst [c] y)) -> (SUBshiftRL x y [c])
(SUB x (SRAconst [c] y)) -> (SUBshiftRA x y [c])
(AND x (SLLconst [c] y)) -> (ANDshiftLL x y [c])
(AND x (SRLconst [c] y)) -> (ANDshiftRL x y [c])
(AND x (SRAconst [c] y)) -> (ANDshiftRA x y [c])
(OR x (SLLconst [c] y)) -> (ORshiftLL x y [c]) // useful for combined load
(OR x (SRLconst [c] y)) -> (ORshiftRL x y [c])
(OR x (SRAconst [c] y)) -> (ORshiftRA x y [c])
(XOR x (SLLconst [c] y)) -> (XORshiftLL x y [c])
(XOR x (SRLconst [c] y)) -> (XORshiftRL x y [c])
(XOR x (SRAconst [c] y)) -> (XORshiftRA x y [c])
(BIC x (SLLconst [c] y)) -> (BICshiftLL x y [c])
(BIC x (SRLconst [c] y)) -> (BICshiftRL x y [c])
(BIC x (SRAconst [c] y)) -> (BICshiftRA x y [c])
(CMP x (SLLconst [c] y)) -> (CMPshiftLL x y [c])
(CMP (SLLconst [c] y) x) -> (InvertFlags (CMPshiftLL x y [c]))
(CMP x (SRLconst [c] y)) -> (CMPshiftRL x y [c])
(CMP (SRLconst [c] y) x) -> (InvertFlags (CMPshiftRL x y [c]))
(CMP x (SRAconst [c] y)) -> (CMPshiftRA x y [c])
(CMP (SRAconst [c] y) x) -> (InvertFlags (CMPshiftRA x y [c]))
(ADD x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (ADDshiftLL x0 y [c])
(ADD x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (ADDshiftRL x0 y [c])
(ADD x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (ADDshiftRA x0 y [c])
(SUB x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (SUBshiftLL x0 y [c])
(SUB x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (SUBshiftRL x0 y [c])
(SUB x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (SUBshiftRA x0 y [c])
(AND x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (ANDshiftLL x0 y [c])
(AND x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (ANDshiftRL x0 y [c])
(AND x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (ANDshiftRA x0 y [c])
(OR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (ORshiftLL x0 y [c]) // useful for combined load
(OR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (ORshiftRL x0 y [c])
(OR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (ORshiftRA x0 y [c])
(XOR x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (XORshiftLL x0 y [c])
(XOR x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (XORshiftRL x0 y [c])
(XOR x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (XORshiftRA x0 y [c])
(BIC x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (BICshiftLL x0 y [c])
(BIC x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (BICshiftRL x0 y [c])
(BIC x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (BICshiftRA x0 y [c])
(CMP x0 x1:(SLLconst [c] y)) && clobberIfDead(x1) -> (CMPshiftLL x0 y [c])
(CMP x0:(SLLconst [c] y) x1) && clobberIfDead(x0) -> (InvertFlags (CMPshiftLL x1 y [c]))
(CMP x0 x1:(SRLconst [c] y)) && clobberIfDead(x1) -> (CMPshiftRL x0 y [c])
(CMP x0:(SRLconst [c] y) x1) && clobberIfDead(x0) -> (InvertFlags (CMPshiftRL x1 y [c]))
(CMP x0 x1:(SRAconst [c] y)) && clobberIfDead(x1) -> (CMPshiftRA x0 y [c])
(CMP x0:(SRAconst [c] y) x1) && clobberIfDead(x0) -> (InvertFlags (CMPshiftRA x1 y [c]))
// prefer *const ops to *shift ops
(ADDshiftLL (MOVDconst [c]) x [d]) -> (ADDconst [c] (SLLconst <x.Type> x [d]))

View File

@ -505,6 +505,17 @@ func clobber(v *Value) bool {
return true
}
// clobberIfDead resets v when use count is 1. Returns true.
// clobberIfDead is used by rewrite rules to decrement
// use counts of v's args when v is dead and never used.
func clobberIfDead(v *Value) bool {
if v.Uses == 1 {
v.reset(OpInvalid)
}
// Note: leave v.Block intact. The Block field is used after clobberIfDead.
return true
}
// noteRule is an easy way to track if a rule is matched when writing
// new ones. Make the rule of interest also conditional on
// noteRule("note to self: rule of interest matched")

File diff suppressed because it is too large Load Diff