From 10f757486e94f60a5e0af180dcd61c9eef7534c6 Mon Sep 17 00:00:00 2001
From: Ilya Tocar
Date: Thu, 27 Oct 2016 16:58:45 +0300
Subject: [PATCH] cmd/compile/internal/ssa: generate bswap on AMD64
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generate bswap+load/store for reading/writing big-endian data.
Helps encoding/binary.

name                    old time/op    new time/op    delta
ReadSlice1000Int32s-8     5.06µs ± 8%    4.58µs ± 8%   -9.50%  (p=0.000 n=10+10)
ReadStruct-8              1.07µs ± 0%    1.05µs ± 0%   -1.51%  (p=0.000 n=9+10)
ReadInts-8                 367ns ± 0%     363ns ± 0%   -1.15%  (p=0.000 n=8+9)
WriteInts-8                475ns ± 1%     469ns ± 0%   -1.45%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8    5.03µs ± 3%    4.50µs ± 3%  -10.45%  (p=0.000 n=9+9)
PutUvarint32-8            17.2ns ± 0%    17.2ns ± 0%     ~     (all samples are equal)
PutUvarint64-8            46.7ns ± 0%    46.7ns ± 0%     ~     (p=0.509 n=10+10)

name                    old speed      new speed      delta
ReadSlice1000Int32s-8    791MB/s ± 8%   875MB/s ± 8%  +10.53%  (p=0.000 n=10+10)
ReadStruct-8            70.0MB/s ± 0%  71.1MB/s ± 0%   +1.54%  (p=0.000 n=9+10)
ReadInts-8              81.6MB/s ± 0%  82.6MB/s ± 0%   +1.21%  (p=0.000 n=9+9)
WriteInts-8             63.0MB/s ± 1%  63.9MB/s ± 0%   +1.45%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8   796MB/s ± 4%   888MB/s ± 3%  +11.65%  (p=0.000 n=9+9)
PutUvarint32-8           233MB/s ± 0%   233MB/s ± 0%     ~     (p=0.089 n=10+10)
PutUvarint64-8           171MB/s ± 0%   171MB/s ± 0%     ~     (p=0.137 n=10+9)

Change-Id: Ia2dbdef92198eaa7e2af5443a8ed586d4b401ffb
Reviewed-on: https://go-review.googlesource.com/32222
Run-TryBot: Ilya Tocar
TryBot-Result: Gobot Gobot
Reviewed-by: Keith Randall
---
 src/cmd/compile/internal/gc/asm_test.go      |  32 +
 src/cmd/compile/internal/ssa/gen/AMD64.rules | 198 ++++
 src/cmd/compile/internal/ssa/rewriteAMD64.go | 940 +++++++++++++++++++
 3 files changed, 1170 insertions(+)
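
(Reviewer note, illustration only and not part of the patch: the load-side
rules below match the byte-load-and-shift shape that binary.BigEndian.Uint64
reduces to after inlining and bounds-check elimination. A hand-written
equivalent of that shape — the name load64 is chosen only for this sketch:

	// load64 decodes b[0:8] as a big-endian uint64. With this CL, the
	// eight MOVBloads, SHLQconsts and ORQs below are rewritten into a
	// single MOVQload followed by BSWAPQ on amd64.
	func load64(b []byte) uint64 {
		_ = b[7] // hint to the compiler that one bounds check suffices
		return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 |
			uint64(b[4])<<24 | uint64(b[3])<<32 | uint64(b[2])<<40 |
			uint64(b[1])<<48 | uint64(b[0])<<56
	}

The store-side rules match the mirrored shape: consecutive MOVBstores of
successively right-shifted copies of one value, rewritten to a bswap plus a
single wide store.)
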
diff --git a/src/cmd/compile/internal/gc/asm_test.go b/src/cmd/compile/internal/gc/asm_test.go
index 58cdb9da7d7..2e5d7e74882 100644
--- a/src/cmd/compile/internal/gc/asm_test.go
+++ b/src/cmd/compile/internal/gc/asm_test.go
@@ -157,6 +157,38 @@ func f(b []byte, i int) uint32 {
 `,
 		[]string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
 	},
+	{"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte) uint64 {
+	return binary.BigEndian.Uint64(b)
+}
+`,
+		[]string{"\tBSWAPQ\t"},
+	},
+	{"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte, i int) uint64 {
+	return binary.BigEndian.Uint64(b[i:])
+}
+`,
+		[]string{"\tBSWAPQ\t"},
+	},
+	{"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte) uint32 {
+	return binary.BigEndian.Uint32(b)
+}
+`,
+		[]string{"\tBSWAPL\t"},
+	},
+	{"amd64", "linux", `
+import "encoding/binary"
+func f(b []byte, i int) uint32 {
+	return binary.BigEndian.Uint32(b[i:])
+}
+`,
+		[]string{"\tBSWAPL\t"},
+	},
 	{"386", "linux", `
 import "encoding/binary"
 func f(b []byte) uint32 {
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index 4c49d109246..5b4649cb143 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -1507,6 +1507,204 @@
   && clobber(o5)
   -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVQloadidx1 [i] {s} p idx mem)
 
+// Combine byte loads + shifts into larger (unaligned) loads + bswap
+(ORL o1:(ORL o0:(ORL
+    x0:(MOVBload [i] {s} p mem)
+    s0:(SHLLconst [8] x1:(MOVBload [i-1] {s} p mem)))
+    s1:(SHLLconst [16] x2:(MOVBload [i-2] {s} p mem)))
+    s2:(SHLLconst [24] x3:(MOVBload [i-3] {s} p mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(o0)
+  && clobber(o1)
+  -> @mergePoint(b,x0,x1,x2,x3) (BSWAPL (MOVLload [i-3] {s} p mem))
+
+(ORL o1:(ORL o0:(ORL
+    x0:(MOVBloadidx1 [i] {s} p idx mem)
+    s0:(SHLLconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
+    s1:(SHLLconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
+    s2:(SHLLconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(o0)
+  && clobber(o1)
+  -> @mergePoint(b,x0,x1,x2,x3) (BSWAPL (MOVLloadidx1 [i-3] {s} p idx mem))
+
+(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
+    x0:(MOVBload [i] {s} p mem)
+    s0:(SHLQconst [8] x1:(MOVBload [i-1] {s} p mem)))
+    s1:(SHLQconst [16] x2:(MOVBload [i-2] {s} p mem)))
+    s2:(SHLQconst [24] x3:(MOVBload [i-3] {s} p mem)))
+    s3:(SHLQconst [32] x4:(MOVBload [i-4] {s} p mem)))
+    s4:(SHLQconst [40] x5:(MOVBload [i-5] {s} p mem)))
+    s5:(SHLQconst [48] x6:(MOVBload [i-6] {s} p mem)))
+    s6:(SHLQconst [56] x7:(MOVBload [i-7] {s} p mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && x7.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && s3.Uses == 1
+  && s4.Uses == 1
+  && s5.Uses == 1
+  && s6.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && o2.Uses == 1
+  && o3.Uses == 1
+  && o4.Uses == 1
+  && o5.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  && clobber(x7)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(s3)
+  && clobber(s4)
+  && clobber(s5)
+  && clobber(s6)
+  && clobber(o0)
+  && clobber(o1)
+  && clobber(o2)
+  && clobber(o3)
+  && clobber(o4)
+  && clobber(o5)
+  -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ (MOVQload [i-7] {s} p mem))
+
+(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
+    x0:(MOVBloadidx1 [i] {s} p idx mem)
+    s0:(SHLQconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
+    s1:(SHLQconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
+    s2:(SHLQconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+    s3:(SHLQconst [32] x4:(MOVBloadidx1 [i-4] {s} p idx mem)))
+    s4:(SHLQconst [40] x5:(MOVBloadidx1 [i-5] {s} p idx mem)))
+    s5:(SHLQconst [48] x6:(MOVBloadidx1 [i-6] {s} p idx mem)))
+    s6:(SHLQconst [56] x7:(MOVBloadidx1 [i-7] {s} p idx mem)))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && x7.Uses == 1
+  && s0.Uses == 1
+  && s1.Uses == 1
+  && s2.Uses == 1
+  && s3.Uses == 1
+  && s4.Uses == 1
+  && s5.Uses == 1
+  && s6.Uses == 1
+  && o0.Uses == 1
+  && o1.Uses == 1
+  && o2.Uses == 1
+  && o3.Uses == 1
+  && o4.Uses == 1
+  && o5.Uses == 1
+  && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  && clobber(x7)
+  && clobber(s0)
+  && clobber(s1)
+  && clobber(s2)
+  && clobber(s3)
+  && clobber(s4)
+  && clobber(s5)
+  && clobber(s6)
+  && clobber(o0)
+  && clobber(o1)
+  && clobber(o2)
+  && clobber(o3)
+  && clobber(o4)
+  && clobber(o5)
+  -> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ (MOVQloadidx1 [i-7] {s} p idx mem))
+
+// Combine stores + shifts into bswap and larger (unaligned) stores
+(MOVBstore [i] {s} p w
+  x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
+  x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
+  x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  -> (MOVLstore [i-3] {s} p (BSWAPL w) mem)
+
+(MOVBstore [i] {s} p w
+  x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
+  x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
+  x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
+  x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
+  x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
+  x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
+  x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+  && x0.Uses == 1
+  && x1.Uses == 1
+  && x2.Uses == 1
+  && x3.Uses == 1
+  && x4.Uses == 1
+  && x5.Uses == 1
+  && x6.Uses == 1
+  && clobber(x0)
+  && clobber(x1)
+  && clobber(x2)
+  && clobber(x3)
+  && clobber(x4)
+  && clobber(x5)
+  && clobber(x6)
+  -> (MOVQstore [i-7] {s} p (BSWAPQ w) mem)
+
 // Combine constant stores into larger (unaligned) stores.
 (MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
   && x.Uses == 1
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 5c685ef25fe..1257ec6e7c3 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -3928,6 +3928,280 @@ func rewriteValueAMD64_OpAMD64MOVBstore(v *Value, config *Config) bool {
 		v.AddArg(mem)
 		return true
 	}
+	// match: (MOVBstore [i] {s} p w x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w) x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w) x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2)
+	// result: (MOVLstore [i-3] {s} p (BSWAPL w) mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p := v.Args[0]
+		w := v.Args[1]
+		x2 := v.Args[2]
+		if x2.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x2.AuxInt != i-1 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		x2_1 := x2.Args[1]
+		if x2_1.Op != OpAMD64SHRLconst {
+			break
+		}
+		if x2_1.AuxInt != 8 {
+			break
+		}
+		if w != x2_1.Args[0] {
+			break
+		}
+		x1 := x2.Args[2]
+		if x1.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x1.AuxInt != i-2 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		x1_1 := x1.Args[1]
+		if x1_1.Op != OpAMD64SHRLconst {
+			break
+		}
+		if x1_1.AuxInt != 16 {
+			break
+		}
+		if w != x1_1.Args[0] {
+			break
+		}
+		x0 := x1.Args[2]
+		if x0.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x0.AuxInt != i-3 {
+			break
+		}
+		if x0.Aux != s {
+			break
+		}
+		if p != x0.Args[0] {
+			break
+		}
+		x0_1 := x0.Args[1]
+		if x0_1.Op != OpAMD64SHRLconst {
+			break
+		}
+		if x0_1.AuxInt != 24 {
+			break
+		}
+		if w != x0_1.Args[0] {
+			break
+		}
+		mem := x0.Args[2]
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2)) {
+			break
+		}
+		v.reset(OpAMD64MOVLstore)
+		v.AuxInt = i - 3
+		v.Aux = s
+		v.AddArg(p)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, w.Type)
+		v0.AddArg(w)
+		v.AddArg(v0)
+		v.AddArg(mem)
+		return true
+	}
+	// match: (MOVBstore [i] {s} p w x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w) x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w) x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w) x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w) x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w) x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w) x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6)
+	// result: (MOVQstore [i-7] {s} p (BSWAPQ w) mem)
+	for {
+		i := v.AuxInt
+		s := v.Aux
+		p := v.Args[0]
+		w := v.Args[1]
+		x6 := v.Args[2]
+		if x6.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x6.AuxInt != i-1 {
+			break
+		}
+		if x6.Aux != s {
+			break
+		}
+		if p != x6.Args[0] {
+			break
+		}
+		x6_1 := x6.Args[1]
+		if x6_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x6_1.AuxInt != 8 {
+			break
+		}
+		if w != x6_1.Args[0] {
+			break
+		}
+		x5 := x6.Args[2]
+		if x5.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x5.AuxInt != i-2 {
+			break
+		}
+		if x5.Aux != s {
+			break
+		}
+		if p != x5.Args[0] {
+			break
+		}
+		x5_1 := x5.Args[1]
+		if x5_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x5_1.AuxInt != 16 {
+			break
+		}
+		if w != x5_1.Args[0] {
+			break
+		}
+		x4 := x5.Args[2]
+		if x4.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x4.AuxInt != i-3 {
+			break
+		}
+		if x4.Aux != s {
+			break
+		}
+		if p != x4.Args[0] {
+			break
+		}
+		x4_1 := x4.Args[1]
+		if x4_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x4_1.AuxInt != 24 {
+			break
+		}
+		if w != x4_1.Args[0] {
+			break
+		}
+		x3 := x4.Args[2]
+		if x3.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x3.AuxInt != i-4 {
+			break
+		}
+		if x3.Aux != s {
+			break
+		}
+		if p != x3.Args[0] {
+			break
+		}
+		x3_1 := x3.Args[1]
+		if x3_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x3_1.AuxInt != 32 {
+			break
+		}
+		if w != x3_1.Args[0] {
+			break
+		}
+		x2 := x3.Args[2]
+		if x2.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x2.AuxInt != i-5 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		x2_1 := x2.Args[1]
+		if x2_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x2_1.AuxInt != 40 {
+			break
+		}
+		if w != x2_1.Args[0] {
+			break
+		}
+		x1 := x2.Args[2]
+		if x1.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x1.AuxInt != i-6 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		x1_1 := x1.Args[1]
+		if x1_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x1_1.AuxInt != 48 {
+			break
+		}
+		if w != x1_1.Args[0] {
+			break
+		}
+		x0 := x1.Args[2]
+		if x0.Op != OpAMD64MOVBstore {
+			break
+		}
+		if x0.AuxInt != i-7 {
+			break
+		}
+		if x0.Aux != s {
+			break
+		}
+		if p != x0.Args[0] {
+			break
+		}
+		x0_1 := x0.Args[1]
+		if x0_1.Op != OpAMD64SHRQconst {
+			break
+		}
+		if x0_1.AuxInt != 56 {
+			break
+		}
+		if w != x0_1.Args[0] {
+			break
+		}
+		mem := x0.Args[2]
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6)) {
+			break
+		}
+		v.reset(OpAMD64MOVQstore)
+		v.AuxInt = i - 7
+		v.Aux = s
+		v.AddArg(p)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, w.Type)
+		v0.AddArg(w)
+		v.AddArg(v0)
+		v.AddArg(mem)
+		return true
+	}
 	// match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
 	// cond: x.Uses == 1 && clobber(x)
 	// result: (MOVWstore [i-1] {s} p w mem)
@@ -10881,6 +11155,225 @@ func rewriteValueAMD64_OpAMD64ORL(v *Value, config *Config) bool {
 		v0.AddArg(mem)
 		return true
 	}
+	// match: (ORL o1:(ORL o0:(ORL x0:(MOVBload [i] {s} p mem) s0:(SHLLconst [8] x1:(MOVBload [i-1] {s} p mem))) s1:(SHLLconst [16] x2:(MOVBload [i-2] {s} p mem))) s2:(SHLLconst [24] x3:(MOVBload [i-3] {s} p mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b,x0,x1,x2,x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)
+	// result: @mergePoint(b,x0,x1,x2,x3) (BSWAPL (MOVLload [i-3] {s} p mem))
+	for {
+		o1 := v.Args[0]
+		if o1.Op != OpAMD64ORL {
+			break
+		}
+		o0 := o1.Args[0]
+		if o0.Op != OpAMD64ORL {
+			break
+		}
+		x0 := o0.Args[0]
+		if x0.Op != OpAMD64MOVBload {
+			break
+		}
+		i := x0.AuxInt
+		s := x0.Aux
+		p := x0.Args[0]
+		mem := x0.Args[1]
+		s0 := o0.Args[1]
+		if s0.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s0.AuxInt != 8 {
+			break
+		}
+		x1 := s0.Args[0]
+		if x1.Op != OpAMD64MOVBload {
+			break
+		}
+		if x1.AuxInt != i-1 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		if mem != x1.Args[1] {
+			break
+		}
+		s1 := o1.Args[1]
+		if s1.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s1.AuxInt != 16 {
+			break
+		}
+		x2 := s1.Args[0]
+		if x2.Op != OpAMD64MOVBload {
+			break
+		}
+		if x2.AuxInt != i-2 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		if mem != x2.Args[1] {
+			break
+		}
+		s2 := v.Args[1]
+		if s2.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s2.AuxInt != 24 {
+			break
+		}
+		x3 := s2.Args[0]
+		if x3.Op != OpAMD64MOVBload {
+			break
+		}
+		if x3.AuxInt != i-3 {
+			break
+		}
+		if x3.Aux != s {
+			break
+		}
+		if p != x3.Args[0] {
+			break
+		}
+		if mem != x3.Args[1] {
+			break
+		}
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b, x0, x1, x2, x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)) {
+			break
+		}
+		b = mergePoint(b, x0, x1, x2, x3)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, v.Type)
+		v.reset(OpCopy)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64MOVLload, config.fe.TypeUInt32())
+		v1.AuxInt = i - 3
+		v1.Aux = s
+		v1.AddArg(p)
+		v1.AddArg(mem)
+		v0.AddArg(v1)
+		return true
+	}
+	// match: (ORL o1:(ORL o0:(ORL x0:(MOVBloadidx1 [i] {s} p idx mem) s0:(SHLLconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem))) s1:(SHLLconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem))) s2:(SHLLconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b,x0,x1,x2,x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)
+	// result: @mergePoint(b,x0,x1,x2,x3) (BSWAPL (MOVLloadidx1 [i-3] {s} p idx mem))
+	for {
+		o1 := v.Args[0]
+		if o1.Op != OpAMD64ORL {
+			break
+		}
+		o0 := o1.Args[0]
+		if o0.Op != OpAMD64ORL {
+			break
+		}
+		x0 := o0.Args[0]
+		if x0.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		i := x0.AuxInt
+		s := x0.Aux
+		p := x0.Args[0]
+		idx := x0.Args[1]
+		mem := x0.Args[2]
+		s0 := o0.Args[1]
+		if s0.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s0.AuxInt != 8 {
+			break
+		}
+		x1 := s0.Args[0]
+		if x1.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x1.AuxInt != i-1 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		if idx != x1.Args[1] {
+			break
+		}
+		if mem != x1.Args[2] {
+			break
+		}
+		s1 := o1.Args[1]
+		if s1.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s1.AuxInt != 16 {
+			break
+		}
+		x2 := s1.Args[0]
+		if x2.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x2.AuxInt != i-2 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		if idx != x2.Args[1] {
+			break
+		}
+		if mem != x2.Args[2] {
+			break
+		}
+		s2 := v.Args[1]
+		if s2.Op != OpAMD64SHLLconst {
+			break
+		}
+		if s2.AuxInt != 24 {
+			break
+		}
+		x3 := s2.Args[0]
+		if x3.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x3.AuxInt != i-3 {
+			break
+		}
+		if x3.Aux != s {
+			break
+		}
+		if p != x3.Args[0] {
+			break
+		}
+		if idx != x3.Args[1] {
+			break
+		}
+		if mem != x3.Args[2] {
+			break
+		}
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && mergePoint(b, x0, x1, x2, x3) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(o0) && clobber(o1)) {
+			break
+		}
+		b = mergePoint(b, x0, x1, x2, x3)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPL, v.Type)
+		v.reset(OpCopy)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64MOVLloadidx1, v.Type)
+		v1.AuxInt = i - 3
+		v1.Aux = s
+		v1.AddArg(p)
+		v1.AddArg(idx)
+		v1.AddArg(mem)
+		v0.AddArg(v1)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64ORLconst(v *Value, config *Config) bool {
@@ -11423,6 +11916,453 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value, config *Config) bool {
 		v0.AddArg(mem)
 		return true
 	}
+	// match: (ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ x0:(MOVBload [i] {s} p mem) s0:(SHLQconst [8] x1:(MOVBload [i-1] {s} p mem))) s1:(SHLQconst [16] x2:(MOVBload [i-2] {s} p mem))) s2:(SHLQconst [24] x3:(MOVBload [i-3] {s} p mem))) s3:(SHLQconst [32] x4:(MOVBload [i-4] {s} p mem))) s4:(SHLQconst [40] x5:(MOVBload [i-5] {s} p mem))) s5:(SHLQconst [48] x6:(MOVBload [i-6] {s} p mem))) s6:(SHLQconst [56] x7:(MOVBload [i-7] {s} p mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)
+	// result: @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ (MOVQload [i-7] {s} p mem))
+	for {
+		o5 := v.Args[0]
+		if o5.Op != OpAMD64ORQ {
+			break
+		}
+		o4 := o5.Args[0]
+		if o4.Op != OpAMD64ORQ {
+			break
+		}
+		o3 := o4.Args[0]
+		if o3.Op != OpAMD64ORQ {
+			break
+		}
+		o2 := o3.Args[0]
+		if o2.Op != OpAMD64ORQ {
+			break
+		}
+		o1 := o2.Args[0]
+		if o1.Op != OpAMD64ORQ {
+			break
+		}
+		o0 := o1.Args[0]
+		if o0.Op != OpAMD64ORQ {
+			break
+		}
+		x0 := o0.Args[0]
+		if x0.Op != OpAMD64MOVBload {
+			break
+		}
+		i := x0.AuxInt
+		s := x0.Aux
+		p := x0.Args[0]
+		mem := x0.Args[1]
+		s0 := o0.Args[1]
+		if s0.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s0.AuxInt != 8 {
+			break
+		}
+		x1 := s0.Args[0]
+		if x1.Op != OpAMD64MOVBload {
+			break
+		}
+		if x1.AuxInt != i-1 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		if mem != x1.Args[1] {
+			break
+		}
+		s1 := o1.Args[1]
+		if s1.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s1.AuxInt != 16 {
+			break
+		}
+		x2 := s1.Args[0]
+		if x2.Op != OpAMD64MOVBload {
+			break
+		}
+		if x2.AuxInt != i-2 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		if mem != x2.Args[1] {
+			break
+		}
+		s2 := o2.Args[1]
+		if s2.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s2.AuxInt != 24 {
+			break
+		}
+		x3 := s2.Args[0]
+		if x3.Op != OpAMD64MOVBload {
+			break
+		}
+		if x3.AuxInt != i-3 {
+			break
+		}
+		if x3.Aux != s {
+			break
+		}
+		if p != x3.Args[0] {
+			break
+		}
+		if mem != x3.Args[1] {
+			break
+		}
+		s3 := o3.Args[1]
+		if s3.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s3.AuxInt != 32 {
+			break
+		}
+		x4 := s3.Args[0]
+		if x4.Op != OpAMD64MOVBload {
+			break
+		}
+		if x4.AuxInt != i-4 {
+			break
+		}
+		if x4.Aux != s {
+			break
+		}
+		if p != x4.Args[0] {
+			break
+		}
+		if mem != x4.Args[1] {
+			break
+		}
+		s4 := o4.Args[1]
+		if s4.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s4.AuxInt != 40 {
+			break
+		}
+		x5 := s4.Args[0]
+		if x5.Op != OpAMD64MOVBload {
+			break
+		}
+		if x5.AuxInt != i-5 {
+			break
+		}
+		if x5.Aux != s {
+			break
+		}
+		if p != x5.Args[0] {
+			break
+		}
+		if mem != x5.Args[1] {
+			break
+		}
+		s5 := o5.Args[1]
+		if s5.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s5.AuxInt != 48 {
+			break
+		}
+		x6 := s5.Args[0]
+		if x6.Op != OpAMD64MOVBload {
+			break
+		}
+		if x6.AuxInt != i-6 {
+			break
+		}
+		if x6.Aux != s {
+			break
+		}
+		if p != x6.Args[0] {
+			break
+		}
+		if mem != x6.Args[1] {
+			break
+		}
+		s6 := v.Args[1]
+		if s6.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s6.AuxInt != 56 {
+			break
+		}
+		x7 := s6.Args[0]
+		if x7.Op != OpAMD64MOVBload {
+			break
+		}
+		if x7.AuxInt != i-7 {
+			break
+		}
+		if x7.Aux != s {
+			break
+		}
+		if p != x7.Args[0] {
+			break
+		}
+		if mem != x7.Args[1] {
+			break
+		}
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)) {
+			break
+		}
+		b = mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, v.Type)
+		v.reset(OpCopy)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64MOVQload, config.fe.TypeUInt64())
+		v1.AuxInt = i - 7
+		v1.Aux = s
+		v1.AddArg(p)
+		v1.AddArg(mem)
+		v0.AddArg(v1)
+		return true
+	}
+	// match: (ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ x0:(MOVBloadidx1 [i] {s} p idx mem) s0:(SHLQconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem))) s1:(SHLQconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem))) s2:(SHLQconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem))) s3:(SHLQconst [32] x4:(MOVBloadidx1 [i-4] {s} p idx mem))) s4:(SHLQconst [40] x5:(MOVBloadidx1 [i-5] {s} p idx mem))) s5:(SHLQconst [48] x6:(MOVBloadidx1 [i-6] {s} p idx mem))) s6:(SHLQconst [56] x7:(MOVBloadidx1 [i-7] {s} p idx mem)))
+	// cond: x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)
+	// result: @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ (MOVQloadidx1 [i-7] {s} p idx mem))
+	for {
+		o5 := v.Args[0]
+		if o5.Op != OpAMD64ORQ {
+			break
+		}
+		o4 := o5.Args[0]
+		if o4.Op != OpAMD64ORQ {
+			break
+		}
+		o3 := o4.Args[0]
+		if o3.Op != OpAMD64ORQ {
+			break
+		}
+		o2 := o3.Args[0]
+		if o2.Op != OpAMD64ORQ {
+			break
+		}
+		o1 := o2.Args[0]
+		if o1.Op != OpAMD64ORQ {
+			break
+		}
+		o0 := o1.Args[0]
+		if o0.Op != OpAMD64ORQ {
+			break
+		}
+		x0 := o0.Args[0]
+		if x0.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		i := x0.AuxInt
+		s := x0.Aux
+		p := x0.Args[0]
+		idx := x0.Args[1]
+		mem := x0.Args[2]
+		s0 := o0.Args[1]
+		if s0.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s0.AuxInt != 8 {
+			break
+		}
+		x1 := s0.Args[0]
+		if x1.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x1.AuxInt != i-1 {
+			break
+		}
+		if x1.Aux != s {
+			break
+		}
+		if p != x1.Args[0] {
+			break
+		}
+		if idx != x1.Args[1] {
+			break
+		}
+		if mem != x1.Args[2] {
+			break
+		}
+		s1 := o1.Args[1]
+		if s1.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s1.AuxInt != 16 {
+			break
+		}
+		x2 := s1.Args[0]
+		if x2.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x2.AuxInt != i-2 {
+			break
+		}
+		if x2.Aux != s {
+			break
+		}
+		if p != x2.Args[0] {
+			break
+		}
+		if idx != x2.Args[1] {
+			break
+		}
+		if mem != x2.Args[2] {
+			break
+		}
+		s2 := o2.Args[1]
+		if s2.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s2.AuxInt != 24 {
+			break
+		}
+		x3 := s2.Args[0]
+		if x3.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x3.AuxInt != i-3 {
+			break
+		}
+		if x3.Aux != s {
+			break
+		}
+		if p != x3.Args[0] {
+			break
+		}
+		if idx != x3.Args[1] {
+			break
+		}
+		if mem != x3.Args[2] {
+			break
+		}
+		s3 := o3.Args[1]
+		if s3.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s3.AuxInt != 32 {
+			break
+		}
+		x4 := s3.Args[0]
+		if x4.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x4.AuxInt != i-4 {
+			break
+		}
+		if x4.Aux != s {
+			break
+		}
+		if p != x4.Args[0] {
+			break
+		}
+		if idx != x4.Args[1] {
+			break
+		}
+		if mem != x4.Args[2] {
+			break
+		}
+		s4 := o4.Args[1]
+		if s4.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s4.AuxInt != 40 {
+			break
+		}
+		x5 := s4.Args[0]
+		if x5.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x5.AuxInt != i-5 {
+			break
+		}
+		if x5.Aux != s {
+			break
+		}
+		if p != x5.Args[0] {
+			break
+		}
+		if idx != x5.Args[1] {
+			break
+		}
+		if mem != x5.Args[2] {
+			break
+		}
+		s5 := o5.Args[1]
+		if s5.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s5.AuxInt != 48 {
+			break
+		}
+		x6 := s5.Args[0]
+		if x6.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x6.AuxInt != i-6 {
+			break
+		}
+		if x6.Aux != s {
+			break
+		}
+		if p != x6.Args[0] {
+			break
+		}
+		if idx != x6.Args[1] {
+			break
+		}
+		if mem != x6.Args[2] {
+			break
+		}
+		s6 := v.Args[1]
+		if s6.Op != OpAMD64SHLQconst {
+			break
+		}
+		if s6.AuxInt != 56 {
+			break
+		}
+		x7 := s6.Args[0]
+		if x7.Op != OpAMD64MOVBloadidx1 {
+			break
+		}
+		if x7.AuxInt != i-7 {
+			break
+		}
+		if x7.Aux != s {
+			break
+		}
+		if p != x7.Args[0] {
+			break
+		}
+		if idx != x7.Args[1] {
+			break
+		}
+		if mem != x7.Args[2] {
+			break
+		}
+		if !(x0.Uses == 1 && x1.Uses == 1 && x2.Uses == 1 && x3.Uses == 1 && x4.Uses == 1 && x5.Uses == 1 && x6.Uses == 1 && x7.Uses == 1 && s0.Uses == 1 && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && s4.Uses == 1 && s5.Uses == 1 && s6.Uses == 1 && o0.Uses == 1 && o1.Uses == 1 && o2.Uses == 1 && o3.Uses == 1 && o4.Uses == 1 && o5.Uses == 1 && mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7) != nil && clobber(x0) && clobber(x1) && clobber(x2) && clobber(x3) && clobber(x4) && clobber(x5) && clobber(x6) && clobber(x7) && clobber(s0) && clobber(s1) && clobber(s2) && clobber(s3) && clobber(s4) && clobber(s5) && clobber(s6) && clobber(o0) && clobber(o1) && clobber(o2) && clobber(o3) && clobber(o4) && clobber(o5)) {
+			break
+		}
+		b = mergePoint(b, x0, x1, x2, x3, x4, x5, x6, x7)
+		v0 := b.NewValue0(v.Line, OpAMD64BSWAPQ, v.Type)
+		v.reset(OpCopy)
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Line, OpAMD64MOVQloadidx1, v.Type)
+		v1.AuxInt = i - 7
+		v1.Aux = s
+		v1.AddArg(p)
+		v1.AddArg(idx)
+		v1.AddArg(mem)
+		v0.AddArg(v1)
+		return true
+	}
 	return false
 }
 func rewriteValueAMD64_OpAMD64ORQconst(v *Value, config *Config) bool {
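
(Reviewer note, illustration only and not part of the patch: one way to see
the effect is to compile a small file with -S and grep for BSWAP on amd64.
The file name bswap_demo.go and package name below are hypothetical:

	// bswap_demo.go
	package demo

	import "encoding/binary"

	// Get should compile to MOVQload+BSWAPQ on amd64 with this CL,
	// matching the new asm_test cases above.
	func Get(b []byte) uint64 {
		return binary.BigEndian.Uint64(b)
	}

	// Put should likewise reduce to BSWAPQ+MOVQstore wherever
	// PutUint64's byte stores are visible to the rewrite, e.g. after
	// inlining; the rewrite also fires inside encoding/binary itself.
	func Put(b []byte, x uint64) {
		binary.BigEndian.PutUint64(b, x)
	}

For example: go build -gcflags=-S bswap_demo.go 2>&1 | grep BSWAP)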