From ecd9e8a2fe7a53a602cee6058b987c52a4bbb0e7 Mon Sep 17 00:00:00 2001
From: Chad Rosier
Date: Fri, 23 Feb 2018 15:17:54 -0500
Subject: [PATCH] cmd/compile/internal/ssa: combine zero stores into larger
 stores on arm64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reduces the go tool binary on arm64 by 12k.

go1 results on Amberwing:
name                   old time/op    new time/op    delta
RegexpMatchEasy0_32      249ns ± 0%     249ns ± 0%     ~     (p=0.087 n=10+10)
RegexpMatchEasy0_1K      584ns ± 0%     584ns ± 0%     ~     (all equal)
RegexpMatchEasy1_32      246ns ± 0%     246ns ± 0%     ~     (p=1.000 n=10+10)
RegexpMatchEasy1_1K      806ns ± 0%     806ns ± 0%     ~     (p=0.706 n=10+9)
RegexpMatchMedium_32     314ns ± 0%     314ns ± 0%     ~     (all equal)
RegexpMatchMedium_1K    52.1µs ± 0%    52.1µs ± 0%     ~     (p=0.245 n=10+8)
RegexpMatchHard_32      2.75µs ± 1%    2.75µs ± 1%     ~     (p=0.690 n=10+10)
RegexpMatchHard_1K      78.9µs ± 0%    78.9µs ± 1%     ~     (p=0.295 n=9+9)
FmtFprintfEmpty         58.5ns ± 0%    58.5ns ± 0%     ~     (all equal)
FmtFprintfString         112ns ± 0%     112ns ± 0%     ~     (all equal)
FmtFprintfInt            117ns ± 0%     116ns ± 0%   -0.85%  (p=0.000 n=10+10)
FmtFprintfIntInt         181ns ± 0%     181ns ± 0%     ~     (all equal)
FmtFprintfPrefixedInt    222ns ± 0%     224ns ± 0%   +0.90%  (p=0.000 n=9+10)
FmtFprintfFloat          318ns ± 1%     322ns ± 0%     ~     (p=0.059 n=10+8)
FmtManyArgs              736ns ± 1%     735ns ± 0%     ~     (p=0.206 n=9+9)
Gzip                     437ms ± 0%     436ms ± 0%   -0.25%  (p=0.000 n=10+10)
HTTPClientServer        89.8µs ± 1%    90.2µs ± 2%     ~     (p=0.393 n=10+10)
JSONEncode              20.1ms ± 1%    20.2ms ± 1%     ~     (p=0.065 n=9+10)
JSONDecode              94.2ms ± 1%    93.9ms ± 1%   -0.42%  (p=0.043 n=10+10)
GobDecode               12.7ms ± 1%    12.8ms ± 2%   +0.94%  (p=0.019 n=10+10)
GobEncode               12.1ms ± 0%    12.1ms ± 0%     ~     (p=0.052 n=10+10)
Mandelbrot200           5.06ms ± 0%    5.05ms ± 0%   -0.04%  (p=0.000 n=9+10)
TimeParse                450ns ± 3%     446ns ± 0%     ~     (p=0.238 n=10+9)
TimeFormat               485ns ± 1%     483ns ± 1%     ~     (p=0.073 n=10+10)
Template                90.4ms ± 0%    90.7ms ± 0%   +0.29%  (p=0.000 n=8+10)
GoParse                 6.01ms ± 0%    6.03ms ± 0%   +0.35%  (p=0.000 n=10+10)
BinaryTree17             11.7s ± 0%     11.7s ± 0%     ~     (p=0.481 n=10+10)
Revcomp                  669ms ± 0%     669ms ± 0%     ~     (p=0.315 n=10+10)
Fannkuch11               3.40s ± 0%     3.37s ± 0%   -0.92%  (p=0.000 n=10+10)
[Geo mean]              67.9µs         67.9µs        +0.02%

name                   old speed      new speed      delta
RegexpMatchEasy0_32    128MB/s ± 0%   128MB/s ± 0%   -0.08%  (p=0.003 n=8+10)
RegexpMatchEasy0_1K   1.75GB/s ± 0%  1.75GB/s ± 0%     ~     (p=0.642 n=8+10)
RegexpMatchEasy1_32    130MB/s ± 0%   130MB/s ± 0%     ~     (p=0.690 n=10+9)
RegexpMatchEasy1_1K   1.27GB/s ± 0%  1.27GB/s ± 0%     ~     (p=0.661 n=10+9)
RegexpMatchMedium_32  3.18MB/s ± 0%  3.18MB/s ± 0%     ~     (all equal)
RegexpMatchMedium_1K  19.7MB/s ± 0%  19.6MB/s ± 0%     ~     (p=0.190 n=10+9)
RegexpMatchHard_32    11.6MB/s ± 0%  11.6MB/s ± 1%     ~     (p=0.669 n=10+10)
RegexpMatchHard_1K    13.0MB/s ± 0%  13.0MB/s ± 0%     ~     (p=0.718 n=9+9)
Gzip                  44.4MB/s ± 0%  44.5MB/s ± 0%   +0.24%  (p=0.000 n=10+10)
JSONEncode            96.5MB/s ± 1%  96.1MB/s ± 1%     ~     (p=0.065 n=9+10)
JSONDecode            20.6MB/s ± 1%  20.7MB/s ± 1%   +0.42%  (p=0.041 n=10+10)
GobDecode             60.6MB/s ± 1%  60.0MB/s ± 2%   -0.92%  (p=0.016 n=10+10)
GobEncode             63.4MB/s ± 0%  63.6MB/s ± 0%     ~     (p=0.055 n=10+10)
Template              21.5MB/s ± 0%  21.4MB/s ± 0%   -0.30%  (p=0.000 n=9+10)
GoParse               9.64MB/s ± 0%  9.61MB/s ± 0%   -0.36%  (p=0.000 n=10+10)
Revcomp                380MB/s ± 0%   380MB/s ± 0%     ~     (p=0.323 n=10+10)
[Geo mean]            56.0MB/s       55.9MB/s        -0.07%

Change-Id: Ia732fa57fbcf4767d72382516d9f16705d177736
Reviewed-on: https://go-review.googlesource.com/96435
Run-TryBot: Cherry Zhang
Reviewed-by: Cherry Zhang
---
 src/cmd/compile/internal/gc/asm_test.go      | 226 +++++++++++++++++++
 src/cmd/compile/internal/ssa/gen/ARM64.rules |  30 +++
 src/cmd/compile/internal/ssa/rewrite.go      |   4 +
 src/cmd/compile/internal/ssa/rewriteARM64.go | 116 ++++++++++
 4 files changed, 376 insertions(+)
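Note (commentary below the fold; not part of the applied diff): this is
the shape of code the change targets, mirroring the new asm tests in the
diff. The function name zero4 is illustrative only. Before this change
the four byte stores compile to four MOVB stores of ZR; with the new
rules they should collapse to a single MOVW store of ZR.

	package p

	func zero4(b []byte) {
		_ = b[3] // early bounds check to guarantee safety of writes below
		b[0] = 0
		b[1] = 0
		b[2] = 0
		b[3] = 0
	}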
diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index 6f347402398..750ac751920 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -2971,6 +2971,232 @@ var linuxARM64Tests = []*asmTest{
 		`,
 		pos: []string{"\tCSEL\t"},
 	},
+	// Check that zero stores are combined into larger stores
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[1] // early bounds check to guarantee safety of writes below
+			b[0] = 0
+			b[1] = 0
+		}
+		`,
+		pos: []string{"MOVH\tZR"},
+		neg: []string{"MOVB"},
+	},
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[1] // early bounds check to guarantee safety of writes below
+			b[1] = 0
+			b[0] = 0
+		}
+		`,
+		pos: []string{"MOVH\tZR"},
+		neg: []string{"MOVB"},
+	},
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[3] // early bounds check to guarantee safety of writes below
+			b[0] = 0
+			b[1] = 0
+			b[2] = 0
+			b[3] = 0
+		}
+		`,
+		pos: []string{"MOVW\tZR"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[3] // early bounds check to guarantee safety of writes below
+			b[2] = 0
+			b[3] = 0
+			b[1] = 0
+			b[0] = 0
+		}
+		`,
+		pos: []string{"MOVW\tZR"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(h []uint16) {
+			_ = h[1] // early bounds check to guarantee safety of writes below
+			h[0] = 0
+			h[1] = 0
+		}
+		`,
+		pos: []string{"MOVW\tZR"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(h []uint16) {
+			_ = h[1] // early bounds check to guarantee safety of writes below
+			h[1] = 0
+			h[0] = 0
+		}
+		`,
+		pos: []string{"MOVW\tZR"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[7] // early bounds check to guarantee safety of writes below
+			b[0] = 0
+			b[1] = 0
+			b[2] = 0
+			b[3] = 0
+			b[4] = 0
+			b[5] = 0
+			b[6] = 0
+			b[7] = 0
+		}
+		`,
+		pos: []string{"MOVD\tZR"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(h []uint16) {
+			_ = h[3] // early bounds check to guarantee safety of writes below
+			h[0] = 0
+			h[1] = 0
+			h[2] = 0
+			h[3] = 0
+		}
+		`,
+		pos: []string{"MOVD\tZR"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(h []uint16) {
+			_ = h[3] // early bounds check to guarantee safety of writes below
+			h[2] = 0
+			h[3] = 0
+			h[1] = 0
+			h[0] = 0
+		}
+		`,
+		pos: []string{"MOVD\tZR"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(w []uint32) {
+			_ = w[1] // early bounds check to guarantee safety of writes below
+			w[0] = 0
+			w[1] = 0
+		}
+		`,
+		pos: []string{"MOVD\tZR"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(w []uint32) {
+			_ = w[1] // early bounds check to guarantee safety of writes below
+			w[1] = 0
+			w[0] = 0
+		}
+		`,
+		pos: []string{"MOVD\tZR"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(b []byte) {
+			_ = b[15] // early bounds check to guarantee safety of writes below
+			b[0] = 0
+			b[1] = 0
+			b[2] = 0
+			b[3] = 0
+			b[4] = 0
+			b[5] = 0
+			b[6] = 0
+			b[7] = 0
+			b[8] = 0
+			b[9] = 0
+			b[10] = 0
+			b[11] = 0
+			b[12] = 0
+			b[13] = 0
+			b[15] = 0
+			b[14] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH", "MOVW"},
+	},
+	{
+		fn: `
+		func $(h []uint16) {
+			_ = h[7] // early bounds check to guarantee safety of writes below
+			h[0] = 0
+			h[1] = 0
+			h[2] = 0
+			h[3] = 0
+			h[4] = 0
+			h[5] = 0
+			h[6] = 0
+			h[7] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(w []uint32) {
+			_ = w[3] // early bounds check to guarantee safety of writes below
+			w[0] = 0
+			w[1] = 0
+			w[2] = 0
+			w[3] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(w []uint32) {
+			_ = w[3] // early bounds check to guarantee safety of writes below
+			w[1] = 0
+			w[0] = 0
+			w[3] = 0
+			w[2] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(d []uint64) {
+			_ = d[1] // early bounds check to guarantee safety of writes below
+			d[0] = 0
+			d[1] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH"},
+	},
+	{
+		fn: `
+		func $(d []uint64) {
+			_ = d[1] // early bounds check to guarantee safety of writes below
+			d[1] = 0
+			d[0] = 0
+		}
+		`,
+		pos: []string{"STP"},
+		neg: []string{"MOVB", "MOVH"},
+	},
 }
 
 var linuxMIPSTests = []*asmTest{
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index c5774edbd34..9f6ef57d434 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -1439,6 +1439,36 @@ && clobber(o4)
 	&& clobber(o5)
 	&& clobber(s0)
 	-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV (MOVDload {s} (OffPtr [i0] p) mem))
+// Combine zero stores into larger (unaligned) stores.
+(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+	&& x.Uses == 1
+	&& areAdjacentOffsets(i,j,1)
+	&& is32Bit(min(i,j))
+	&& isSamePtr(ptr0, ptr1)
+	&& clobber(x)
+	-> (MOVHstorezero [min(i,j)] {s} ptr0 mem)
+(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+	&& x.Uses == 1
+	&& areAdjacentOffsets(i,j,2)
+	&& is32Bit(min(i,j))
+	&& isSamePtr(ptr0, ptr1)
+	&& clobber(x)
+	-> (MOVWstorezero [min(i,j)] {s} ptr0 mem)
+(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+	&& x.Uses == 1
+	&& areAdjacentOffsets(i,j,4)
+	&& is32Bit(min(i,j))
+	&& isSamePtr(ptr0, ptr1)
+	&& clobber(x)
+	-> (MOVDstorezero [min(i,j)] {s} ptr0 mem)
+(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+	&& x.Uses == 1
+	&& areAdjacentOffsets(i,j,8)
+	&& is32Bit(min(i,j))
+	&& isSamePtr(ptr0, ptr1)
+	&& clobber(x)
+	-> (MOVQstorezero [min(i,j)] {s} ptr0 mem)
+
 // FP simplification
 (FNEGS (FMULS x y)) -> (FNMULS x y)
 (FNEGD (FMULD x y)) -> (FNMULD x y)
diff --git a/src/cmd/compile/internal/ssa/rewrite.go b/src/cmd/compile/internal/ssa/rewrite.go
index 587a2a6d1a0..2a20519f038 100644
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@@ -769,6 +769,10 @@ func overlap(offset1, size1, offset2, size2 int64) bool {
 	return false
 }
 
+func areAdjacentOffsets(off1, off2, size int64) bool {
+	return off1+size == off2 || off1 == off2+size
+}
+
 // check if value zeroes out upper 32-bit of 64-bit register.
 // depth limits recursion depth. In AMD64.rules 3 is used as limit,
 // because it catches same amount of cases as 4.
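Note (review commentary, not part of the applied diff): the predicate
added above is deliberately symmetric, so the rules fire whichever of
the two adjacent stores ends up outermost in the SSA store chain, and
min(i,j) supplies the offset of the merged store. A standalone, runnable
sketch of its semantics:

	package main

	import "fmt"

	// Copy of the new ssa.areAdjacentOffsets, reproduced here only to
	// illustrate its behavior outside the compiler.
	func areAdjacentOffsets(off1, off2, size int64) bool {
		return off1+size == off2 || off1 == off2+size
	}

	func main() {
		fmt.Println(areAdjacentOffsets(0, 1, 1)) // true: byte stores at offsets 0 and 1 merge
		fmt.Println(areAdjacentOffsets(1, 0, 1)) // true: order of the pair does not matter
		fmt.Println(areAdjacentOffsets(0, 2, 1)) // false: one-byte gap at offset 1
	}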
diff --git a/src/cmd/compile/internal/ssa/rewriteARM64.go b/src/cmd/compile/internal/ssa/rewriteARM64.go
index 81be85c63a2..1bb21d8a2c3 100644
--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -5941,6 +5941,35 @@ func rewriteValueARM64_OpARM64MOVBstorezero_0(v *Value) bool {
 		v.AddArg(mem)
 		return true
 	}
+	// match: (MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+	// cond: x.Uses == 1 && areAdjacentOffsets(i,j,1) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+	// result: (MOVHstorezero [min(i,j)] {s} ptr0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		_ = v.Args[1]
+		ptr0 := v.Args[0]
+		x := v.Args[1]
+		if x.Op != OpARM64MOVBstorezero {
+			break
+		}
+		j := x.AuxInt
+		if x.Aux != s {
+			break
+		}
+		_ = x.Args[1]
+		ptr1 := x.Args[0]
+		mem := x.Args[1]
+		if !(x.Uses == 1 && areAdjacentOffsets(i, j, 1) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+			break
+		}
+		v.reset(OpARM64MOVHstorezero)
+		v.AuxInt = min(i, j)
+		v.Aux = s
+		v.AddArg(ptr0)
+		v.AddArg(mem)
+		return true
+	}
 	return false
 }
 func rewriteValueARM64_OpARM64MOVDload_0(v *Value) bool {
@@ -6205,6 +6234,35 @@ func rewriteValueARM64_OpARM64MOVDstorezero_0(v *Value) bool {
 		v.AddArg(mem)
 		return true
 	}
+	// match: (MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+	// cond: x.Uses == 1 && areAdjacentOffsets(i,j,8) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+	// result: (MOVQstorezero [min(i,j)] {s} ptr0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		_ = v.Args[1]
+		ptr0 := v.Args[0]
+		x := v.Args[1]
+		if x.Op != OpARM64MOVDstorezero {
+			break
+		}
+		j := x.AuxInt
+		if x.Aux != s {
+			break
+		}
+		_ = x.Args[1]
+		ptr1 := x.Args[0]
+		mem := x.Args[1]
+		if !(x.Uses == 1 && areAdjacentOffsets(i, j, 8) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+			break
+		}
+		v.reset(OpARM64MOVQstorezero)
+		v.AuxInt = min(i, j)
+		v.Aux = s
+		v.AddArg(ptr0)
+		v.AddArg(mem)
+		return true
+	}
 	return false
 }
 func rewriteValueARM64_OpARM64MOVHUload_0(v *Value) bool {
@@ -6747,6 +6805,35 @@ func rewriteValueARM64_OpARM64MOVHstorezero_0(v *Value) bool {
 		v.AddArg(mem)
 		return true
 	}
+	// match: (MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+	// cond: x.Uses == 1 && areAdjacentOffsets(i,j,2) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+	// result: (MOVWstorezero [min(i,j)] {s} ptr0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		_ = v.Args[1]
+		ptr0 := v.Args[0]
+		x := v.Args[1]
+		if x.Op != OpARM64MOVHstorezero {
+			break
+		}
+		j := x.AuxInt
+		if x.Aux != s {
+			break
+		}
+		_ = x.Args[1]
+		ptr1 := x.Args[0]
+		mem := x.Args[1]
+		if !(x.Uses == 1 && areAdjacentOffsets(i, j, 2) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+			break
+		}
+		v.reset(OpARM64MOVWstorezero)
+		v.AuxInt = min(i, j)
+		v.Aux = s
+		v.AddArg(ptr0)
+		v.AddArg(mem)
+		return true
+	}
 	return false
 }
 func rewriteValueARM64_OpARM64MOVQstorezero_0(v *Value) bool {
@@ -7379,6 +7466,35 @@ func rewriteValueARM64_OpARM64MOVWstorezero_0(v *Value) bool {
 		v.AddArg(mem)
 		return true
 	}
+	// match: (MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+	// cond: x.Uses == 1 && areAdjacentOffsets(i,j,4) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+	// result: (MOVDstorezero [min(i,j)] {s} ptr0 mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		_ = v.Args[1]
+		ptr0 := v.Args[0]
+		x := v.Args[1]
+		if x.Op != OpARM64MOVWstorezero {
+			break
+		}
+		j := x.AuxInt
+		if x.Aux != s {
+			break
+		}
+		_ = x.Args[1]
+		ptr1 := x.Args[0]
+		mem := x.Args[1]
+		if !(x.Uses == 1 && areAdjacentOffsets(i, j, 4) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+			break
+		}
+		v.reset(OpARM64MOVDstorezero)
+		v.AuxInt = min(i, j)
+		v.Aux = s
+		v.AddArg(ptr0)
+		v.AddArg(mem)
+		return true
+	}
 	return false
 }
 func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {
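Note (review commentary, not part of the applied diff): the four rules
compose pairwise, so long runs of adjacent zero stores collapse over
several rewrite rounds. One possible trace for the eight-byte-store test
case above, in the rules notation (operands abbreviated, offsets in
brackets; the exact pairing order depends on rewrite scheduling, but the
end state is what the asm tests assert):

	(MOVBstorezero [0] p (MOVBstorezero [1] p (... (MOVBstorezero [7] p mem))))
	=> MOVHstorezero at [0] [2] [4] [6]   // byte pairs merge, width 1 -> 2
	=> MOVWstorezero at [0] [4]           // halfword pairs merge, width 2 -> 4
	=> MOVDstorezero at [0]               // word pair merges, width 4 -> 8

The 16-byte case continues one step further to MOVQstorezero, which the
new asm test checks for via the STP pattern (a store-pair of ZR, ZR).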