1
0
mirror of https://github.com/golang/go synced 2024-11-19 11:04:47 -07:00

cmd/compile: add some LEAL{1,2,4,8} rewrite rules for AMD64

This should improve some 32-bit arithmetic operations.

During make.bash, this increases the number of
rules firing by 15518:

$ wc -l rulelog-*
 13490514 rulelog-head
 13474996 rulelog-master

compress/flate benchmarks:

name                             old time/op    new time/op    delta
Decode/Digits/Huffman/1e4-8         103µs ± 4%     102µs ± 0%  -0.95%  (p=0.000 n=30+27)
Decode/Digits/Huffman/1e5-8         962µs ± 2%     954µs ± 1%  -0.80%  (p=0.000 n=25+25)
Decode/Digits/Huffman/1e6-8        9.55ms ± 1%    9.50ms ± 1%  -0.57%  (p=0.000 n=29+29)
Decode/Digits/Speed/1e4-8           110µs ± 2%     110µs ± 2%  -0.41%  (p=0.003 n=28+30)
Decode/Digits/Speed/1e5-8          1.15ms ± 1%    1.14ms ± 1%  -0.85%  (p=0.000 n=29+28)
Decode/Digits/Speed/1e6-8          11.5ms ± 2%    11.4ms ± 1%  -1.26%  (p=0.000 n=28+27)
Decode/Digits/Default/1e4-8         113µs ± 1%     112µs ± 1%  -0.49%  (p=0.001 n=27+30)
Decode/Digits/Default/1e5-8        1.13ms ± 0%    1.12ms ± 1%  -0.75%  (p=0.000 n=26+24)
Decode/Digits/Default/1e6-8        11.1ms ± 1%    11.1ms ± 1%  -0.47%  (p=0.000 n=28+27)
Decode/Digits/Compression/1e4-8     113µs ± 1%     112µs ± 1%  -0.70%  (p=0.000 n=28+29)
Decode/Digits/Compression/1e5-8    1.13ms ± 2%    1.12ms ± 1%  -1.41%  (p=0.000 n=28+26)
Decode/Digits/Compression/1e6-8    11.1ms ± 1%    11.1ms ± 1%  -0.33%  (p=0.002 n=29+27)
Decode/Twain/Huffman/1e4-8          115µs ± 1%     115µs ± 1%  -0.40%  (p=0.000 n=28+26)
Decode/Twain/Huffman/1e5-8         1.05ms ± 1%    1.04ms ± 0%  -0.41%  (p=0.000 n=27+25)
Decode/Twain/Huffman/1e6-8         10.4ms ± 1%    10.4ms ± 1%    ~     (p=0.993 n=28+24)
Decode/Twain/Speed/1e4-8            118µs ± 2%     116µs ± 1%  -1.08%  (p=0.000 n=27+29)
Decode/Twain/Speed/1e5-8           1.07ms ± 1%    1.07ms ± 1%  -0.23%  (p=0.041 n=26+27)
Decode/Twain/Speed/1e6-8           10.6ms ± 1%    10.5ms ± 0%  -0.68%  (p=0.000 n=29+27)
Decode/Twain/Default/1e4-8          110µs ± 1%     109µs ± 0%  -0.49%  (p=0.000 n=29+26)
Decode/Twain/Default/1e5-8          906µs ± 1%     902µs ± 1%  -0.48%  (p=0.000 n=27+28)
Decode/Twain/Default/1e6-8         8.75ms ± 1%    8.68ms ± 2%  -0.73%  (p=0.000 n=28+28)
Decode/Twain/Compression/1e4-8      110µs ± 1%     109µs ± 1%  -0.80%  (p=0.000 n=27+28)
Decode/Twain/Compression/1e5-8      905µs ± 1%     906µs ± 5%    ~     (p=0.065 n=27+29)
Decode/Twain/Compression/1e6-8     8.75ms ± 2%    8.68ms ± 1%  -0.76%  (p=0.000 n=26+26)
Encode/Digits/Huffman/1e4-8        31.8µs ± 1%    32.3µs ± 2%  +1.43%  (p=0.000 n=28+27)
Encode/Digits/Huffman/1e5-8         299µs ± 2%     296µs ± 1%  -1.05%  (p=0.000 n=29+29)
Encode/Digits/Huffman/1e6-8        2.99ms ± 3%    2.96ms ± 1%  -1.00%  (p=0.000 n=29+28)
Encode/Digits/Speed/1e4-8           149µs ± 1%     152µs ± 4%  +2.18%  (p=0.000 n=30+30)
Encode/Digits/Speed/1e5-8          1.39ms ± 1%    1.40ms ± 2%  +1.02%  (p=0.000 n=27+27)
Encode/Digits/Speed/1e6-8          13.7ms ± 0%    13.8ms ± 1%  +0.81%  (p=0.000 n=27+27)
Encode/Digits/Default/1e4-8         297µs ± 7%     297µs ± 7%    ~     (p=1.000 n=30+30)
Encode/Digits/Default/1e5-8        4.51ms ± 1%    4.42ms ± 1%  -2.06%  (p=0.000 n=29+29)
Encode/Digits/Default/1e6-8        47.5ms ± 1%    46.6ms ± 1%  -1.90%  (p=0.000 n=27+25)
Encode/Digits/Compression/1e4-8     302µs ± 7%     303µs ± 9%    ~     (p=0.854 n=30+30)
Encode/Digits/Compression/1e5-8    4.52ms ± 1%    4.43ms ± 2%  -1.91%  (p=0.000 n=26+25)
Encode/Digits/Compression/1e6-8    47.5ms ± 1%    46.7ms ± 1%  -1.70%  (p=0.000 n=26+27)
Encode/Twain/Huffman/1e4-8         46.6µs ± 2%    46.8µs ± 2%    ~     (p=0.114 n=30+30)
Encode/Twain/Huffman/1e5-8          357µs ± 3%     352µs ± 2%  -1.13%  (p=0.000 n=29+28)
Encode/Twain/Huffman/1e6-8         3.58ms ± 4%    3.52ms ± 1%  -1.43%  (p=0.003 n=30+28)
Encode/Twain/Speed/1e4-8            173µs ± 1%     174µs ± 1%  +0.65%  (p=0.000 n=27+28)
Encode/Twain/Speed/1e5-8           1.39ms ± 1%    1.40ms ± 1%  +0.92%  (p=0.000 n=28+27)
Encode/Twain/Speed/1e6-8           13.6ms ± 1%    13.7ms ± 1%  +0.51%  (p=0.000 n=25+26)
Encode/Twain/Default/1e4-8          364µs ± 5%     361µs ± 5%    ~     (p=0.219 n=30+30)
Encode/Twain/Default/1e5-8         5.41ms ± 1%    5.43ms ± 5%    ~     (p=0.655 n=27+27)
Encode/Twain/Default/1e6-8         57.2ms ± 1%    58.4ms ± 4%  +2.15%  (p=0.000 n=22+28)
Encode/Twain/Compression/1e4-8      371µs ± 9%     373µs ± 6%    ~     (p=0.503 n=30+29)
Encode/Twain/Compression/1e5-8     5.97ms ± 2%    5.92ms ± 1%  -0.75%  (p=0.000 n=28+26)
Encode/Twain/Compression/1e6-8     64.0ms ± 1%    63.8ms ± 1%  -0.36%  (p=0.036 n=27+25)
[Geo mean]                         1.37ms         1.36ms       -0.38%


Change-Id: I3df4de63f06eaf121c38821bd889453a8de1b199
Reviewed-on: https://go-review.googlesource.com/101276
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Josh Bleecher Snyder 2018-02-26 07:05:19 -08:00
parent 44286b17c5
commit b1df8d6ffa
2 changed files with 1458 additions and 102 deletions

View File

@@ -852,10 +852,8 @@
(CMPB (MOVLconst [c]) x) -> (InvertFlags (CMPBconst x [int64(int8(c))]))
// Using MOVZX instead of AND is cheaper.
(ANDLconst [0xFF] x) -> (MOVBQZX x)
(ANDLconst [0xFFFF] x) -> (MOVWQZX x)
(ANDQconst [0xFF] x) -> (MOVBQZX x)
(ANDQconst [0xFFFF] x) -> (MOVWQZX x)
(AND(Q|L)const [ 0xFF] x) -> (MOVBQZX x)
(AND(Q|L)const [0xFFFF] x) -> (MOVWQZX x)
(ANDQconst [0xFFFFFFFF] x) -> (MOVLQZX x)
// strength reduction
@@ -867,75 +865,75 @@
// which can require a register-register move
// to preserve the original value,
// so it must be used with care.
(MULQconst [-9] x) -> (NEGQ (LEAQ8 <v.Type> x x))
(MULQconst [-5] x) -> (NEGQ (LEAQ4 <v.Type> x x))
(MULQconst [-3] x) -> (NEGQ (LEAQ2 <v.Type> x x))
(MULQconst [-1] x) -> (NEGQ x)
(MULQconst [0] _) -> (MOVQconst [0])
(MULQconst [1] x) -> x
(MULQconst [3] x) -> (LEAQ2 x x)
(MULQconst [5] x) -> (LEAQ4 x x)
(MULQconst [7] x) -> (LEAQ2 x (LEAQ2 <v.Type> x x))
(MULQconst [9] x) -> (LEAQ8 x x)
(MULQconst [11] x) -> (LEAQ2 x (LEAQ4 <v.Type> x x))
(MULQconst [13] x) -> (LEAQ4 x (LEAQ2 <v.Type> x x))
(MULQconst [19] x) -> (LEAQ2 x (LEAQ8 <v.Type> x x))
(MULQconst [21] x) -> (LEAQ4 x (LEAQ4 <v.Type> x x))
(MULQconst [25] x) -> (LEAQ8 x (LEAQ2 <v.Type> x x))
(MULQconst [27] x) -> (LEAQ8 (LEAQ2 <v.Type> x x) (LEAQ2 <v.Type> x x))
(MULQconst [37] x) -> (LEAQ4 x (LEAQ8 <v.Type> x x))
(MULQconst [41] x) -> (LEAQ8 x (LEAQ4 <v.Type> x x))
(MULQconst [45] x) -> (LEAQ8 (LEAQ4 <v.Type> x x) (LEAQ4 <v.Type> x x))
(MULQconst [73] x) -> (LEAQ8 x (LEAQ8 <v.Type> x x))
(MULQconst [81] x) -> (LEAQ8 (LEAQ8 <v.Type> x x) (LEAQ8 <v.Type> x x))
(MUL(Q|L)const [-9] x) -> (NEG(Q|L) (LEA(Q|L)8 <v.Type> x x))
(MUL(Q|L)const [-5] x) -> (NEG(Q|L) (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [-3] x) -> (NEG(Q|L) (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [-1] x) -> (NEG(Q|L) x)
(MUL(Q|L)const [ 0] _) -> (MOV(Q|L)const [0])
(MUL(Q|L)const [ 1] x) -> x
(MUL(Q|L)const [ 3] x) -> (LEA(Q|L)2 x x)
(MUL(Q|L)const [ 5] x) -> (LEA(Q|L)4 x x)
(MUL(Q|L)const [ 7] x) -> (LEA(Q|L)2 x (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [ 9] x) -> (LEA(Q|L)8 x x)
(MUL(Q|L)const [11] x) -> (LEA(Q|L)2 x (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [13] x) -> (LEA(Q|L)4 x (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [19] x) -> (LEA(Q|L)2 x (LEA(Q|L)8 <v.Type> x x))
(MUL(Q|L)const [21] x) -> (LEA(Q|L)4 x (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [25] x) -> (LEA(Q|L)8 x (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [27] x) -> (LEA(Q|L)8 (LEA(Q|L)2 <v.Type> x x) (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [37] x) -> (LEA(Q|L)4 x (LEA(Q|L)8 <v.Type> x x))
(MUL(Q|L)const [41] x) -> (LEA(Q|L)8 x (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [45] x) -> (LEA(Q|L)8 (LEA(Q|L)4 <v.Type> x x) (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [73] x) -> (LEA(Q|L)8 x (LEA(Q|L)8 <v.Type> x x))
(MUL(Q|L)const [81] x) -> (LEA(Q|L)8 (LEA(Q|L)8 <v.Type> x x) (LEA(Q|L)8 <v.Type> x x))
(MULQconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBQ (SHLQconst <v.Type> [log2(c+1)] x) x)
(MULQconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (LEAQ1 (SHLQconst <v.Type> [log2(c-1)] x) x)
(MULQconst [c] x) && isPowerOfTwo(c-2) && c >= 34 -> (LEAQ2 (SHLQconst <v.Type> [log2(c-2)] x) x)
(MULQconst [c] x) && isPowerOfTwo(c-4) && c >= 68 -> (LEAQ4 (SHLQconst <v.Type> [log2(c-4)] x) x)
(MULQconst [c] x) && isPowerOfTwo(c-8) && c >= 136 -> (LEAQ8 (SHLQconst <v.Type> [log2(c-8)] x) x)
(MULQconst [c] x) && c%3 == 0 && isPowerOfTwo(c/3) -> (SHLQconst [log2(c/3)] (LEAQ2 <v.Type> x x))
(MULQconst [c] x) && c%5 == 0 && isPowerOfTwo(c/5) -> (SHLQconst [log2(c/5)] (LEAQ4 <v.Type> x x))
(MULQconst [c] x) && c%9 == 0 && isPowerOfTwo(c/9) -> (SHLQconst [log2(c/9)] (LEAQ8 <v.Type> x x))
(MUL(Q|L)const [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUB(Q|L) (SHL(Q|L)const <v.Type> [log2(c+1)] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (LEA(Q|L)1 (SHL(Q|L)const <v.Type> [log2(c-1)] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo(c-2) && c >= 34 -> (LEA(Q|L)2 (SHL(Q|L)const <v.Type> [log2(c-2)] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo(c-4) && c >= 68 -> (LEA(Q|L)4 (SHL(Q|L)const <v.Type> [log2(c-4)] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo(c-8) && c >= 136 -> (LEA(Q|L)8 (SHL(Q|L)const <v.Type> [log2(c-8)] x) x)
(MUL(Q|L)const [c] x) && c%3 == 0 && isPowerOfTwo(c/3) -> (SHL(Q|L)const [log2(c/3)] (LEA(Q|L)2 <v.Type> x x))
(MUL(Q|L)const [c] x) && c%5 == 0 && isPowerOfTwo(c/5) -> (SHL(Q|L)const [log2(c/5)] (LEA(Q|L)4 <v.Type> x x))
(MUL(Q|L)const [c] x) && c%9 == 0 && isPowerOfTwo(c/9) -> (SHL(Q|L)const [log2(c/9)] (LEA(Q|L)8 <v.Type> x x))
// combine add/shift into LEAQ
(ADDQ x (SHLQconst [3] y)) -> (LEAQ8 x y)
(ADDQ x (SHLQconst [2] y)) -> (LEAQ4 x y)
(ADDQ x (SHLQconst [1] y)) -> (LEAQ2 x y)
(ADDQ x (ADDQ y y)) -> (LEAQ2 x y)
(ADDQ x (ADDQ x y)) -> (LEAQ2 y x)
// combine add/shift into LEAQ/LEAL
(ADD(L|Q) x (SHL(L|Q)const [3] y)) -> (LEA(L|Q)8 x y)
(ADD(L|Q) x (SHL(L|Q)const [2] y)) -> (LEA(L|Q)4 x y)
(ADD(L|Q) x (SHL(L|Q)const [1] y)) -> (LEA(L|Q)2 x y)
(ADD(L|Q) x (ADD(L|Q) y y)) -> (LEA(L|Q)2 x y)
(ADD(L|Q) x (ADD(L|Q) x y)) -> (LEA(L|Q)2 y x)
// combine ADDQ/ADDQconst into LEAQ1
(ADDQconst [c] (ADDQ x y)) -> (LEAQ1 [c] x y)
(ADDQ (ADDQconst [c] x) y) -> (LEAQ1 [c] x y)
// combine ADDQ/ADDQconst into LEAQ1/LEAL1
(ADD(Q|L)const [c] (ADD(Q|L) x y)) -> (LEA(Q|L)1 [c] x y)
(ADD(Q|L) (ADD(Q|L)const [c] x) y) -> (LEA(Q|L)1 [c] x y)
(ADD(Q|L)const [c] (SHL(Q|L)const [1] x)) -> (LEA(Q|L)1 [c] x x)
// fold ADDQ into LEAQ
(ADDQconst [c] (LEAQ [d] {s} x)) && is32Bit(c+d) -> (LEAQ [c+d] {s} x)
(LEAQ [c] {s} (ADDQconst [d] x)) && is32Bit(c+d) -> (LEAQ [c+d] {s} x)
(LEAQ [c] {s} (ADDQ x y)) && x.Op != OpSB && y.Op != OpSB -> (LEAQ1 [c] {s} x y)
(ADDQ x (LEAQ [c] {s} y)) && x.Op != OpSB && y.Op != OpSB -> (LEAQ1 [c] {s} x y)
// fold ADDQ/ADDL into LEAQ/LEAL
(ADD(Q|L)const [c] (LEA(Q|L) [d] {s} x)) && is32Bit(c+d) -> (LEA(Q|L) [c+d] {s} x)
(LEA(Q|L) [c] {s} (ADD(Q|L)const [d] x)) && is32Bit(c+d) -> (LEA(Q|L) [c+d] {s} x)
(LEA(Q|L) [c] {s} (ADD(Q|L) x y)) && x.Op != OpSB && y.Op != OpSB -> (LEA(Q|L)1 [c] {s} x y)
(ADD(Q|L) x (LEA(Q|L) [c] {s} y)) && x.Op != OpSB && y.Op != OpSB -> (LEA(Q|L)1 [c] {s} x y)
// fold ADDQconst into LEAQx
(ADDQconst [c] (LEAQ1 [d] {s} x y)) && is32Bit(c+d) -> (LEAQ1 [c+d] {s} x y)
(ADDQconst [c] (LEAQ2 [d] {s} x y)) && is32Bit(c+d) -> (LEAQ2 [c+d] {s} x y)
(ADDQconst [c] (LEAQ4 [d] {s} x y)) && is32Bit(c+d) -> (LEAQ4 [c+d] {s} x y)
(ADDQconst [c] (LEAQ8 [d] {s} x y)) && is32Bit(c+d) -> (LEAQ8 [c+d] {s} x y)
(LEAQ1 [c] {s} (ADDQconst [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEAQ1 [c+d] {s} x y)
(LEAQ2 [c] {s} (ADDQconst [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEAQ2 [c+d] {s} x y)
(LEAQ2 [c] {s} x (ADDQconst [d] y)) && is32Bit(c+2*d) && y.Op != OpSB -> (LEAQ2 [c+2*d] {s} x y)
(LEAQ4 [c] {s} (ADDQconst [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEAQ4 [c+d] {s} x y)
(LEAQ4 [c] {s} x (ADDQconst [d] y)) && is32Bit(c+4*d) && y.Op != OpSB -> (LEAQ4 [c+4*d] {s} x y)
(LEAQ8 [c] {s} (ADDQconst [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEAQ8 [c+d] {s} x y)
(LEAQ8 [c] {s} x (ADDQconst [d] y)) && is32Bit(c+8*d) && y.Op != OpSB -> (LEAQ8 [c+8*d] {s} x y)
// fold ADDQconst/ADDLconst into LEAQx/LEALx
(ADD(Q|L)const [c] (LEA(Q|L)1 [d] {s} x y)) && is32Bit(c+d) -> (LEA(Q|L)1 [c+d] {s} x y)
(ADD(Q|L)const [c] (LEA(Q|L)2 [d] {s} x y)) && is32Bit(c+d) -> (LEA(Q|L)2 [c+d] {s} x y)
(ADD(Q|L)const [c] (LEA(Q|L)4 [d] {s} x y)) && is32Bit(c+d) -> (LEA(Q|L)4 [c+d] {s} x y)
(ADD(Q|L)const [c] (LEA(Q|L)8 [d] {s} x y)) && is32Bit(c+d) -> (LEA(Q|L)8 [c+d] {s} x y)
(LEA(Q|L)1 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEA(Q|L)1 [c+d] {s} x y)
(LEA(Q|L)2 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEA(Q|L)2 [c+d] {s} x y)
(LEA(Q|L)2 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(c+2*d) && y.Op != OpSB -> (LEA(Q|L)2 [c+2*d] {s} x y)
(LEA(Q|L)4 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEA(Q|L)4 [c+d] {s} x y)
(LEA(Q|L)4 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(c+4*d) && y.Op != OpSB -> (LEA(Q|L)4 [c+4*d] {s} x y)
(LEA(Q|L)8 [c] {s} (ADD(Q|L)const [d] x) y) && is32Bit(c+d) && x.Op != OpSB -> (LEA(Q|L)8 [c+d] {s} x y)
(LEA(Q|L)8 [c] {s} x (ADD(Q|L)const [d] y)) && is32Bit(c+8*d) && y.Op != OpSB -> (LEA(Q|L)8 [c+8*d] {s} x y)
// fold shifts into LEAQx
(LEAQ1 [c] {s} x (SHLQconst [1] y)) -> (LEAQ2 [c] {s} x y)
(LEAQ1 [c] {s} x (SHLQconst [2] y)) -> (LEAQ4 [c] {s} x y)
(LEAQ1 [c] {s} x (SHLQconst [3] y)) -> (LEAQ8 [c] {s} x y)
(LEAQ2 [c] {s} x (SHLQconst [1] y)) -> (LEAQ4 [c] {s} x y)
(LEAQ2 [c] {s} x (SHLQconst [2] y)) -> (LEAQ8 [c] {s} x y)
(LEAQ4 [c] {s} x (SHLQconst [1] y)) -> (LEAQ8 [c] {s} x y)
// fold shifts into LEAQx/LEALx
(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [1] y)) -> (LEA(Q|L)2 [c] {s} x y)
(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [2] y)) -> (LEA(Q|L)4 [c] {s} x y)
(LEA(Q|L)1 [c] {s} x (SHL(Q|L)const [3] y)) -> (LEA(Q|L)8 [c] {s} x y)
(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [1] y)) -> (LEA(Q|L)4 [c] {s} x y)
(LEA(Q|L)2 [c] {s} x (SHL(Q|L)const [2] y)) -> (LEA(Q|L)8 [c] {s} x y)
(LEA(Q|L)4 [c] {s} x (SHL(Q|L)const [1] y)) -> (LEA(Q|L)8 [c] {s} x y)
// reverse ordering of compare instruction
(SETL (InvertFlags x)) -> (SETG x)
@@ -2219,12 +2217,6 @@
&& clobber(mem2)
-> (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
// amd64p32 rules
// same as the rules above, but with 32 instead of 64 bit pointer arithmetic.
// LEAQ,ADDQ -> LEAL,ADDL
(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
(MOVQload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(off1+off2) ->
(MOVQload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVLload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && canMergeSym(sym1, sym2) && is32Bit(off1+off2) ->
@@ -2410,8 +2402,8 @@
(MOVLi2f <t> (Arg [off] {sym})) -> @b.Func.Entry (Arg <t> [off] {sym})
// LEAQ is rematerializeable, so this helps to avoid register spill.
// See isuue 22947 for details
(ADDQconst [off] x:(SP)) -> (LEAQ [off] x)
// See issue 22947 for details
(ADD(Q|L)const [off] x:(SP)) -> (LEA(Q|L) [off] x)
// Fold loads into compares
// Note: these may be undone by the flagalloc pass.

File diff suppressed because it is too large Load Diff