cmd/compile: switch to typed aux for 386 optimization rules

Convert first section of 386 optimization rules to the typed aux form. Adds addOffset{32,64} functions that returns ValAndOffs and a ValAndOff.canAdd32 function that takes an int32. Passes GOARCH=386 gotip build -toolexec 'toolstash -cmp' -a std Change-Id: I69d2a8ace6936d5e8ba6ba047183002bf07dd5be Reviewed-on: https://go-review.googlesource.com/c/go/+/228825 Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2024-11-18 18:04:46 -07:00 · 2020-04-19 10:45:04 +02:00 · 2020-04-19 10:45:04 +02:00 · 17fbc818ff
commit 17fbc818ff
parent 4974ac6874
4 changed files with 1614 additions and 1588 deletions
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
@ -272,8 +272,8 @@
 		(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))

 // Strip off any fractional word zeroing.
-(Zero [s] destptr mem) && s%4 != 0 && s > 4 ->
-	(Zero [s-s%4] (ADDLconst destptr [s%4])
+(Zero [s] destptr mem) && s%4 != 0 && s > 4 =>
+	(Zero [s-s%4] (ADDLconst destptr [int32(s%4)])
 		(MOVLstoreconst [0] destptr mem))

 // Zero small numbers of words directly.
@ -370,113 +370,114 @@
 // TODO: Should the optimizations be a separate pass?

 // Fold boolean tests into blocks
-(NE (TESTB (SETL  cmp) (SETL  cmp)) yes no) -> (LT  cmp yes no)
-(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) -> (LE  cmp yes no)
-(NE (TESTB (SETG  cmp) (SETG  cmp)) yes no) -> (GT  cmp yes no)
-(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) -> (GE  cmp yes no)
-(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) -> (EQ  cmp yes no)
-(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) -> (NE  cmp yes no)
-(NE (TESTB (SETB  cmp) (SETB  cmp)) yes no) -> (ULT cmp yes no)
-(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) -> (ULE cmp yes no)
-(NE (TESTB (SETA  cmp) (SETA  cmp)) yes no) -> (UGT cmp yes no)
-(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) -> (UGE cmp yes no)
-(NE (TESTB (SETO cmp) (SETO cmp)) yes no) -> (OS cmp yes no)
+(NE (TESTB (SETL  cmp) (SETL  cmp)) yes no) => (LT  cmp yes no)
+(NE (TESTB (SETLE cmp) (SETLE cmp)) yes no) => (LE  cmp yes no)
+(NE (TESTB (SETG  cmp) (SETG  cmp)) yes no) => (GT  cmp yes no)
+(NE (TESTB (SETGE cmp) (SETGE cmp)) yes no) => (GE  cmp yes no)
+(NE (TESTB (SETEQ cmp) (SETEQ cmp)) yes no) => (EQ  cmp yes no)
+(NE (TESTB (SETNE cmp) (SETNE cmp)) yes no) => (NE  cmp yes no)
+(NE (TESTB (SETB  cmp) (SETB  cmp)) yes no) => (ULT cmp yes no)
+(NE (TESTB (SETBE cmp) (SETBE cmp)) yes no) => (ULE cmp yes no)
+(NE (TESTB (SETA  cmp) (SETA  cmp)) yes no) => (UGT cmp yes no)
+(NE (TESTB (SETAE cmp) (SETAE cmp)) yes no) => (UGE cmp yes no)
+(NE (TESTB (SETO cmp) (SETO cmp)) yes no) => (OS cmp yes no)

 // Special case for floating point - LF/LEF not generated
-(NE (TESTB (SETGF  cmp) (SETGF  cmp)) yes no) -> (UGT  cmp yes no)
-(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) -> (UGE  cmp yes no)
-(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) -> (EQF  cmp yes no)
-(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) -> (NEF  cmp yes no)
+(NE (TESTB (SETGF  cmp) (SETGF  cmp)) yes no) => (UGT  cmp yes no)
+(NE (TESTB (SETGEF cmp) (SETGEF cmp)) yes no) => (UGE  cmp yes no)
+(NE (TESTB (SETEQF cmp) (SETEQF cmp)) yes no) => (EQF  cmp yes no)
+(NE (TESTB (SETNEF cmp) (SETNEF cmp)) yes no) => (NEF  cmp yes no)

 // fold constants into instructions
-(ADDL x (MOVLconst [c])) -> (ADDLconst [c] x)
-(ADDLcarry x (MOVLconst [c])) -> (ADDLconstcarry [c] x)
-(ADCL x (MOVLconst [c]) f) -> (ADCLconst [c] x f)
+(ADDL x (MOVLconst [c])) => (ADDLconst [c] x)
+(ADDLcarry x (MOVLconst [c])) => (ADDLconstcarry [c] x)
+(ADCL x (MOVLconst [c]) f) => (ADCLconst [c] x f)

-(SUBL x (MOVLconst [c])) -> (SUBLconst x [c])
-(SUBL (MOVLconst [c]) x) -> (NEGL (SUBLconst <v.Type> x [c]))
-(SUBLcarry x (MOVLconst [c])) -> (SUBLconstcarry [c] x)
-(SBBL x (MOVLconst [c]) f) -> (SBBLconst [c] x f)
+(SUBL x (MOVLconst [c])) => (SUBLconst x [c])
+(SUBL (MOVLconst [c]) x) => (NEGL (SUBLconst <v.Type> x [c]))
+(SUBLcarry x (MOVLconst [c])) => (SUBLconstcarry [c] x)
+(SBBL x (MOVLconst [c]) f) => (SBBLconst [c] x f)

-(MULL x (MOVLconst [c])) -> (MULLconst [c] x)
+(MULL x (MOVLconst [c])) => (MULLconst [c] x)
+(ANDL x (MOVLconst [c])) => (ANDLconst [c] x)

-(ANDL x (MOVLconst [c])) -> (ANDLconst [c] x)
+(ANDLconst [c] (ANDLconst [d] x)) => (ANDLconst [c & d] x)
+(XORLconst [c] (XORLconst [d] x)) => (XORLconst [c ^ d] x)
+(MULLconst [c] (MULLconst [d] x)) => (MULLconst [c * d] x)

-(ANDLconst [c] (ANDLconst [d] x)) -> (ANDLconst [c & d] x)
+(ORL x (MOVLconst [c])) => (ORLconst [c] x)
+(XORL x (MOVLconst [c])) => (XORLconst [c] x)

-(XORLconst [c] (XORLconst [d] x)) -> (XORLconst [c ^ d] x)
+(SHLL x (MOVLconst [c])) => (SHLLconst [c&31] x)
+(SHRL x (MOVLconst [c])) => (SHRLconst [c&31] x)
+(SHRW x (MOVLconst [c])) && c&31 < 16 => (SHRWconst [int16(c&31)] x)
+(SHRW _ (MOVLconst [c])) && c&31 >= 16 => (MOVLconst [0])
+(SHRB x (MOVLconst [c])) && c&31 < 8 => (SHRBconst [int8(c&31)] x)
+(SHRB _ (MOVLconst [c])) && c&31 >= 8 => (MOVLconst [0])

-(MULLconst [c] (MULLconst [d] x)) -> (MULLconst [int64(int32(c * d))] x)
+(SARL x (MOVLconst [c])) => (SARLconst [c&31] x)
+(SARW x (MOVLconst [c])) => (SARWconst [int16(min(int64(c&31),15))] x)
+(SARB x (MOVLconst [c])) => (SARBconst [int8(min(int64(c&31),7))] x)

-(ORL x (MOVLconst [c])) -> (ORLconst [c] x)
-
-(XORL x (MOVLconst [c])) -> (XORLconst [c] x)
-
-(SHLL x (MOVLconst [c])) -> (SHLLconst [c&31] x)
-(SHRL x (MOVLconst [c])) -> (SHRLconst [c&31] x)
-(SHRW x (MOVLconst [c])) && c&31 < 16 -> (SHRWconst [c&31] x)
-(SHRW _ (MOVLconst [c])) && c&31 >= 16 -> (MOVLconst [0])
-(SHRB x (MOVLconst [c])) && c&31 < 8 -> (SHRBconst [c&31] x)
-(SHRB _ (MOVLconst [c])) && c&31 >= 8 -> (MOVLconst [0])
-
-(SARL x (MOVLconst [c])) -> (SARLconst [c&31] x)
-(SARW x (MOVLconst [c])) -> (SARWconst [min(c&31,15)] x)
-(SARB x (MOVLconst [c])) -> (SARBconst [min(c&31,7)] x)
-
-(SARL x (ANDLconst [31] y)) -> (SARL x y)
-
-(SHLL x (ANDLconst [31] y)) -> (SHLL x y)
-
-(SHRL x (ANDLconst [31] y)) -> (SHRL x y)
+(SARL x (ANDLconst [31] y)) => (SARL x y)
+(SHLL x (ANDLconst [31] y)) => (SHLL x y)
+(SHRL x (ANDLconst [31] y)) => (SHRL x y)

 // Rotate instructions

-(ADDL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c -> (ROLLconst [c] x)
-( ORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c -> (ROLLconst [c] x)
-(XORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c -> (ROLLconst [c] x)
+(ADDL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)
+( ORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)
+(XORL (SHLLconst [c] x) (SHRLconst [d] x)) && d == 32-c => (ROLLconst [c] x)

-(ADDL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == 16-c && t.Size() == 2 -> (ROLWconst x [c])
-( ORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == 16-c && t.Size() == 2 -> (ROLWconst x [c])
-(XORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == 16-c && t.Size() == 2 -> (ROLWconst x [c])
+(ADDL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+  => (ROLWconst x [int16(c)])
+( ORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+  => (ROLWconst x [int16(c)])
+(XORL <t> (SHLLconst x [c]) (SHRWconst x [d])) && c < 16 && d == int16(16-c) && t.Size() == 2
+  => (ROLWconst x [int16(c)])

-(ADDL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == 8-c && t.Size() == 1 -> (ROLBconst x [c])
-( ORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == 8-c && t.Size() == 1 -> (ROLBconst x [c])
-(XORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == 8-c && t.Size() == 1 -> (ROLBconst x [c])
+(ADDL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+  => (ROLBconst x [int8(c)])
+( ORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+  => (ROLBconst x [int8(c)])
+(XORL <t> (SHLLconst x [c]) (SHRBconst x [d])) && c < 8 && d == int8(8-c) && t.Size() == 1
+  => (ROLBconst x [int8(c)])
+
+(ROLLconst [c] (ROLLconst [d] x)) => (ROLLconst [(c+d)&31] x)
+(ROLWconst [c] (ROLWconst [d] x)) => (ROLWconst [(c+d)&15] x)
+(ROLBconst [c] (ROLBconst [d] x)) => (ROLBconst [(c+d)& 7] x)

-(ROLLconst [c] (ROLLconst [d] x)) -> (ROLLconst [(c+d)&31] x)
-(ROLWconst [c] (ROLWconst [d] x)) -> (ROLWconst [(c+d)&15] x)
-(ROLBconst [c] (ROLBconst [d] x)) -> (ROLBconst [(c+d)& 7] x)

 // Constant shift simplifications

-(SHLLconst x [0]) -> x
-(SHRLconst x [0]) -> x
-(SARLconst x [0]) -> x
+(SHLLconst x [0]) => x
+(SHRLconst x [0]) => x
+(SARLconst x [0]) => x

-(SHRWconst x [0]) -> x
-(SARWconst x [0]) -> x
+(SHRWconst x [0]) => x
+(SARWconst x [0]) => x

-(SHRBconst x [0]) -> x
-(SARBconst x [0]) -> x
+(SHRBconst x [0]) => x
+(SARBconst x [0]) => x

-(ROLLconst [0] x) -> x
-(ROLWconst [0] x) -> x
-(ROLBconst [0] x) -> x
+(ROLLconst [0] x) => x
+(ROLWconst [0] x) => x
+(ROLBconst [0] x) => x

 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
 // because the x86 instructions are defined to use all 5 bits of the shift even
 // for the small shifts. I don't think we'll ever generate a weird shift (e.g.
 // (SHRW x (MOVLconst [24])), but just in case.

-(CMPL x (MOVLconst [c])) -> (CMPLconst x [c])
-(CMPL (MOVLconst [c]) x) -> (InvertFlags (CMPLconst x [c]))
-(CMPW x (MOVLconst [c])) -> (CMPWconst x [int64(int16(c))])
-(CMPW (MOVLconst [c]) x) -> (InvertFlags (CMPWconst x [int64(int16(c))]))
-(CMPB x (MOVLconst [c])) -> (CMPBconst x [int64(int8(c))])
-(CMPB (MOVLconst [c]) x) -> (InvertFlags (CMPBconst x [int64(int8(c))]))
+(CMPL x (MOVLconst [c])) => (CMPLconst x [c])
+(CMPL (MOVLconst [c]) x) => (InvertFlags (CMPLconst x [c]))
+(CMPW x (MOVLconst [c])) => (CMPWconst x [int16(c)])
+(CMPW (MOVLconst [c]) x) => (InvertFlags (CMPWconst x [int16(c)]))
+(CMPB x (MOVLconst [c])) => (CMPBconst x [int8(c)])
+(CMPB (MOVLconst [c]) x) => (InvertFlags (CMPBconst x [int8(c)]))

 // Canonicalize the order of arguments to comparisons - helps with CSE.
-(CMP(L|W|B) x y) && x.ID > y.ID -> (InvertFlags (CMP(L|W|B) y x))
+(CMP(L|W|B) x y) && x.ID > y.ID => (InvertFlags (CMP(L|W|B) y x))

 // strength reduction
 // Assumes that the following costs from https://gmplib.org/~tege/x86-timing.pdf:
@ -487,86 +488,86 @@
 // which can require a register-register move
 // to preserve the original value,
 // so it must be used with care.
-(MULLconst [-9] x) -> (NEGL (LEAL8 <v.Type> x x))
-(MULLconst [-5] x) -> (NEGL (LEAL4 <v.Type> x x))
-(MULLconst [-3] x) -> (NEGL (LEAL2 <v.Type> x x))
-(MULLconst [-1] x) -> (NEGL x)
-(MULLconst [0] _) -> (MOVLconst [0])
-(MULLconst [1] x) -> x
-(MULLconst [3] x) -> (LEAL2 x x)
-(MULLconst [5] x) -> (LEAL4 x x)
-(MULLconst [7] x) -> (LEAL2 x (LEAL2 <v.Type> x x))
-(MULLconst [9] x) -> (LEAL8 x x)
-(MULLconst [11] x) -> (LEAL2 x (LEAL4 <v.Type> x x))
-(MULLconst [13] x) -> (LEAL4 x (LEAL2 <v.Type> x x))
-(MULLconst [19] x) -> (LEAL2 x (LEAL8 <v.Type> x x))
-(MULLconst [21] x) -> (LEAL4 x (LEAL4 <v.Type> x x))
-(MULLconst [25] x) -> (LEAL8 x (LEAL2 <v.Type> x x))
-(MULLconst [27] x) -> (LEAL8 (LEAL2 <v.Type> x x) (LEAL2 <v.Type> x x))
-(MULLconst [37] x) -> (LEAL4 x (LEAL8 <v.Type> x x))
-(MULLconst [41] x) -> (LEAL8 x (LEAL4 <v.Type> x x))
-(MULLconst [45] x) -> (LEAL8 (LEAL4 <v.Type> x x) (LEAL4 <v.Type> x x))
-(MULLconst [73] x) -> (LEAL8 x (LEAL8 <v.Type> x x))
-(MULLconst [81] x) -> (LEAL8 (LEAL8 <v.Type> x x) (LEAL8 <v.Type> x x))
+(MULLconst [-9] x) => (NEGL (LEAL8 <v.Type> x x))
+(MULLconst [-5] x) => (NEGL (LEAL4 <v.Type> x x))
+(MULLconst [-3] x) => (NEGL (LEAL2 <v.Type> x x))
+(MULLconst [-1] x) => (NEGL x)
+(MULLconst [0] _) => (MOVLconst [0])
+(MULLconst [1] x) => x
+(MULLconst [3] x) => (LEAL2 x x)
+(MULLconst [5] x) => (LEAL4 x x)
+(MULLconst [7] x) => (LEAL2 x (LEAL2 <v.Type> x x))
+(MULLconst [9] x) => (LEAL8 x x)
+(MULLconst [11] x) => (LEAL2 x (LEAL4 <v.Type> x x))
+(MULLconst [13] x) => (LEAL4 x (LEAL2 <v.Type> x x))
+(MULLconst [19] x) => (LEAL2 x (LEAL8 <v.Type> x x))
+(MULLconst [21] x) => (LEAL4 x (LEAL4 <v.Type> x x))
+(MULLconst [25] x) => (LEAL8 x (LEAL2 <v.Type> x x))
+(MULLconst [27] x) => (LEAL8 (LEAL2 <v.Type> x x) (LEAL2 <v.Type> x x))
+(MULLconst [37] x) => (LEAL4 x (LEAL8 <v.Type> x x))
+(MULLconst [41] x) => (LEAL8 x (LEAL4 <v.Type> x x))
+(MULLconst [45] x) => (LEAL8 (LEAL4 <v.Type> x x) (LEAL4 <v.Type> x x))
+(MULLconst [73] x) => (LEAL8 x (LEAL8 <v.Type> x x))
+(MULLconst [81] x) => (LEAL8 (LEAL8 <v.Type> x x) (LEAL8 <v.Type> x x))

-(MULLconst [c] x) && isPowerOfTwo(c+1) && c >= 15 -> (SUBL (SHLLconst <v.Type> [log2(c+1)] x) x)
-(MULLconst [c] x) && isPowerOfTwo(c-1) && c >= 17 -> (LEAL1 (SHLLconst <v.Type> [log2(c-1)] x) x)
-(MULLconst [c] x) && isPowerOfTwo(c-2) && c >= 34 -> (LEAL2 (SHLLconst <v.Type> [log2(c-2)] x) x)
-(MULLconst [c] x) && isPowerOfTwo(c-4) && c >= 68 -> (LEAL4 (SHLLconst <v.Type> [log2(c-4)] x) x)
-(MULLconst [c] x) && isPowerOfTwo(c-8) && c >= 136 -> (LEAL8 (SHLLconst <v.Type> [log2(c-8)] x) x)
-(MULLconst [c] x) && c%3 == 0 && isPowerOfTwo(c/3) -> (SHLLconst [log2(c/3)] (LEAL2 <v.Type> x x))
-(MULLconst [c] x) && c%5 == 0 && isPowerOfTwo(c/5) -> (SHLLconst [log2(c/5)] (LEAL4 <v.Type> x x))
-(MULLconst [c] x) && c%9 == 0 && isPowerOfTwo(c/9) -> (SHLLconst [log2(c/9)] (LEAL8 <v.Type> x x))
+(MULLconst [c] x) && isPowerOfTwo32(c+1) && c >= 15 => (SUBL (SHLLconst <v.Type> [int32(log32(c+1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEAL1 (SHLLconst <v.Type> [int32(log32(c-1))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEAL2 (SHLLconst <v.Type> [int32(log32(c-2))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEAL4 (SHLLconst <v.Type> [int32(log32(c-4))] x) x)
+(MULLconst [c] x) && isPowerOfTwo32(c-8) && c >= 136 => (LEAL8 (SHLLconst <v.Type> [int32(log32(c-8))] x) x)
+(MULLconst [c] x) && c%3 == 0 && isPowerOfTwo32(c/3) => (SHLLconst [int32(log32(c/3))] (LEAL2 <v.Type> x x))
+(MULLconst [c] x) && c%5 == 0 && isPowerOfTwo32(c/5) => (SHLLconst [int32(log32(c/5))] (LEAL4 <v.Type> x x))
+(MULLconst [c] x) && c%9 == 0 && isPowerOfTwo32(c/9) => (SHLLconst [int32(log32(c/9))] (LEAL8 <v.Type> x x))

 // combine add/shift into LEAL
-(ADDL x (SHLLconst [3] y)) -> (LEAL8 x y)
-(ADDL x (SHLLconst [2] y)) -> (LEAL4 x y)
-(ADDL x (SHLLconst [1] y)) -> (LEAL2 x y)
-(ADDL x (ADDL y y)) -> (LEAL2 x y)
-(ADDL x (ADDL x y)) -> (LEAL2 y x)
+(ADDL x (SHLLconst [3] y)) => (LEAL8 x y)
+(ADDL x (SHLLconst [2] y)) => (LEAL4 x y)
+(ADDL x (SHLLconst [1] y)) => (LEAL2 x y)
+(ADDL x (ADDL y y)) => (LEAL2 x y)
+(ADDL x (ADDL x y)) => (LEAL2 y x)

 // combine ADDL/ADDLconst into LEAL1
-(ADDLconst [c] (ADDL x y)) -> (LEAL1 [c] x y)
-(ADDL (ADDLconst [c] x) y) -> (LEAL1 [c] x y)
+(ADDLconst [c] (ADDL x y)) => (LEAL1 [c] x y)
+(ADDL (ADDLconst [c] x) y) => (LEAL1 [c] x y)

 // fold ADDL into LEAL
-(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
-(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(c+d) -> (LEAL [c+d] {s} x)
-(LEAL [c] {s} (ADDL x y)) && x.Op != OpSB && y.Op != OpSB -> (LEAL1 [c] {s} x y)
-(ADDL x (LEAL [c] {s} y)) && x.Op != OpSB && y.Op != OpSB -> (LEAL1 [c] {s} x y)
+(ADDLconst [c] (LEAL [d] {s} x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDLconst [d] x)) && is32Bit(int64(c)+int64(d)) => (LEAL [c+d] {s} x)
+(LEAL [c] {s} (ADDL x y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)
+(ADDL x (LEAL [c] {s} y)) && x.Op != OpSB && y.Op != OpSB => (LEAL1 [c] {s} x y)

 // fold ADDLconst into LEALx
-(ADDLconst [c] (LEAL1 [d] {s} x y)) && is32Bit(c+d) -> (LEAL1 [c+d] {s} x y)
-(ADDLconst [c] (LEAL2 [d] {s} x y)) && is32Bit(c+d) -> (LEAL2 [c+d] {s} x y)
-(ADDLconst [c] (LEAL4 [d] {s} x y)) && is32Bit(c+d) -> (LEAL4 [c+d] {s} x y)
-(ADDLconst [c] (LEAL8 [d] {s} x y)) && is32Bit(c+d) -> (LEAL8 [c+d] {s} x y)
-(LEAL1 [c] {s} (ADDLconst [d] x) y) && is32Bit(c+d)   && x.Op != OpSB -> (LEAL1 [c+d] {s} x y)
-(LEAL2 [c] {s} (ADDLconst [d] x) y) && is32Bit(c+d)   && x.Op != OpSB -> (LEAL2 [c+d] {s} x y)
-(LEAL2 [c] {s} x (ADDLconst [d] y)) && is32Bit(c+2*d) && y.Op != OpSB -> (LEAL2 [c+2*d] {s} x y)
-(LEAL4 [c] {s} (ADDLconst [d] x) y) && is32Bit(c+d)   && x.Op != OpSB -> (LEAL4 [c+d] {s} x y)
-(LEAL4 [c] {s} x (ADDLconst [d] y)) && is32Bit(c+4*d) && y.Op != OpSB -> (LEAL4 [c+4*d] {s} x y)
-(LEAL8 [c] {s} (ADDLconst [d] x) y) && is32Bit(c+d)   && x.Op != OpSB -> (LEAL8 [c+d] {s} x y)
-(LEAL8 [c] {s} x (ADDLconst [d] y)) && is32Bit(c+8*d) && y.Op != OpSB -> (LEAL8 [c+8*d] {s} x y)
+(ADDLconst [c] (LEAL1 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL1 [c+d] {s} x y)
+(ADDLconst [c] (LEAL2 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL2 [c+d] {s} x y)
+(ADDLconst [c] (LEAL4 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL4 [c+d] {s} x y)
+(ADDLconst [c] (LEAL8 [d] {s} x y)) && is32Bit(int64(c)+int64(d)) => (LEAL8 [c+d] {s} x y)
+(LEAL1 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d))   && x.Op != OpSB => (LEAL1 [c+d] {s} x y)
+(LEAL2 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d))   && x.Op != OpSB => (LEAL2 [c+d] {s} x y)
+(LEAL2 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+2*int64(d)) && y.Op != OpSB => (LEAL2 [c+2*d] {s} x y)
+(LEAL4 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d))   && x.Op != OpSB => (LEAL4 [c+d] {s} x y)
+(LEAL4 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+4*int64(d)) && y.Op != OpSB => (LEAL4 [c+4*d] {s} x y)
+(LEAL8 [c] {s} (ADDLconst [d] x) y) && is32Bit(int64(c)+int64(d))   && x.Op != OpSB => (LEAL8 [c+d] {s} x y)
+(LEAL8 [c] {s} x (ADDLconst [d] y)) && is32Bit(int64(c)+8*int64(d)) && y.Op != OpSB => (LEAL8 [c+8*d] {s} x y)

 // fold shifts into LEALx
-(LEAL1 [c] {s} x (SHLLconst [1] y)) -> (LEAL2 [c] {s} x y)
-(LEAL1 [c] {s} x (SHLLconst [2] y)) -> (LEAL4 [c] {s} x y)
-(LEAL1 [c] {s} x (SHLLconst [3] y)) -> (LEAL8 [c] {s} x y)
-(LEAL2 [c] {s} x (SHLLconst [1] y)) -> (LEAL4 [c] {s} x y)
-(LEAL2 [c] {s} x (SHLLconst [2] y)) -> (LEAL8 [c] {s} x y)
-(LEAL4 [c] {s} x (SHLLconst [1] y)) -> (LEAL8 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [1] y)) => (LEAL2 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [2] y)) => (LEAL4 [c] {s} x y)
+(LEAL1 [c] {s} x (SHLLconst [3] y)) => (LEAL8 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [1] y)) => (LEAL4 [c] {s} x y)
+(LEAL2 [c] {s} x (SHLLconst [2] y)) => (LEAL8 [c] {s} x y)
+(LEAL4 [c] {s} x (SHLLconst [1] y)) => (LEAL8 [c] {s} x y)

 // reverse ordering of compare instruction
-(SETL (InvertFlags x)) -> (SETG x)
-(SETG (InvertFlags x)) -> (SETL x)
-(SETB (InvertFlags x)) -> (SETA x)
-(SETA (InvertFlags x)) -> (SETB x)
-(SETLE (InvertFlags x)) -> (SETGE x)
-(SETGE (InvertFlags x)) -> (SETLE x)
-(SETBE (InvertFlags x)) -> (SETAE x)
-(SETAE (InvertFlags x)) -> (SETBE x)
-(SETEQ (InvertFlags x)) -> (SETEQ x)
-(SETNE (InvertFlags x)) -> (SETNE x)
+(SETL (InvertFlags x)) => (SETG x)
+(SETG (InvertFlags x)) => (SETL x)
+(SETB (InvertFlags x)) => (SETA x)
+(SETA (InvertFlags x)) => (SETB x)
+(SETLE (InvertFlags x)) => (SETGE x)
+(SETGE (InvertFlags x)) => (SETLE x)
+(SETBE (InvertFlags x)) => (SETAE x)
+(SETAE (InvertFlags x)) => (SETBE x)
+(SETEQ (InvertFlags x)) => (SETEQ x)
+(SETNE (InvertFlags x)) => (SETNE x)

 // sign extended loads
 // Note: The combined instruction must end up in the same block
@ -576,58 +577,60 @@
 // Make sure we don't combine these ops if the load has another use.
 // This prevents a single load from being split into multiple loads
 // which then might return different values.  See test/atomicload.go.
-(MOVBLSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBLSXload <v.Type> [off] {sym} ptr mem)
-(MOVBLZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
-(MOVWLSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWLSXload <v.Type> [off] {sym} ptr mem)
-(MOVWLZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) -> @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)
+(MOVBLSX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBLSXload <v.Type> [off] {sym} ptr mem)
+(MOVBLZX x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <v.Type> [off] {sym} ptr mem)
+(MOVWLSX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWLSXload <v.Type> [off] {sym} ptr mem)
+(MOVWLZX x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <v.Type> [off] {sym} ptr mem)

 // replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
-(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBLZX x)
-(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWLZX x)
-(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> x
-(MOVBLSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVBLSX x)
-(MOVWLSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) -> (MOVWLSX x)
+(MOVBload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLZX x)
+(MOVWload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLZX x)
+(MOVLload [off] {sym} ptr (MOVLstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => x
+(MOVBLSXload [off] {sym} ptr (MOVBstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVBLSX x)
+(MOVWLSXload [off] {sym} ptr (MOVWstore [off2] {sym2} ptr2 x _)) && sym == sym2 && off == off2 && isSamePtr(ptr, ptr2) => (MOVWLSX x)

 // Fold extensions and ANDs together.
-(MOVBLZX (ANDLconst [c] x)) -> (ANDLconst [c & 0xff] x)
-(MOVWLZX (ANDLconst [c] x)) -> (ANDLconst [c & 0xffff] x)
-(MOVBLSX (ANDLconst [c] x)) && c & 0x80 == 0 -> (ANDLconst [c & 0x7f] x)
-(MOVWLSX (ANDLconst [c] x)) && c & 0x8000 == 0 -> (ANDLconst [c & 0x7fff] x)
+(MOVBLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xff] x)
+(MOVWLZX (ANDLconst [c] x)) => (ANDLconst [c & 0xffff] x)
+(MOVBLSX (ANDLconst [c] x)) && c & 0x80 == 0 => (ANDLconst [c & 0x7f] x)
+(MOVWLSX (ANDLconst [c] x)) && c & 0x8000 == 0 => (ANDLconst [c & 0x7fff] x)

 // Don't extend before storing
-(MOVWstore [off] {sym} ptr (MOVWL(S|Z)X x) mem) -> (MOVWstore [off] {sym} ptr x mem)
-(MOVBstore [off] {sym} ptr (MOVBL(S|Z)X x) mem) -> (MOVBstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWL(S|Z)X x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBL(S|Z)X x) mem) => (MOVBstore [off] {sym} ptr x mem)

 // fold constants into memory operations
 // Note that this is not always a good idea because if not all the uses of
 // the ADDLconst get eliminated, we still have to compute the ADDLconst and we now
 // have potentially two live values (ptr and (ADDLconst [off] ptr)) instead of one.
 // Nevertheless, let's do it!
-(MOV(L|W|B|SS|SD)load  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOV(L|W|B|SS|SD)load  [off1+off2] {sym} ptr mem)
-(MOV(L|W|B|SS|SD)store  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOV(L|W|B|SS|SD)store  [off1+off2] {sym} ptr val mem)
+(MOV(L|W|B|SS|SD)load  [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) =>
+    (MOV(L|W|B|SS|SD)load  [off1+off2] {sym} ptr mem)
+(MOV(L|W|B|SS|SD)store  [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(int64(off1)+int64(off2)) =>
+    (MOV(L|W|B|SS|SD)store  [off1+off2] {sym} ptr val mem)

-((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
 	((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+((ADD|SUB|MUL|DIV)SSload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
 	((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem)
-((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(off1+off2) ->
+((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDLconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) =>
 	((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem)
-((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(off1+off2) ->
+((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDLconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) =>
 	((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem)
-((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDLconst [off2] base) mem) && ValAndOff(valoff1).canAdd(off2) ->
-	((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).add(off2)] {sym} base mem)
+((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDLconst [off2] base) mem) && valoff1.canAdd32(off2) =>
+	((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {sym} base mem)

 // Fold constants into stores.
-(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) ->
-	(MOVLstoreconst [makeValAndOff(int64(int32(c)),off)] {sym} ptr mem)
-(MOVWstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) ->
-	(MOVWstoreconst [makeValAndOff(int64(int16(c)),off)] {sym} ptr mem)
-(MOVBstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) ->
-	(MOVBstoreconst [makeValAndOff(int64(int8(c)),off)] {sym} ptr mem)
+(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+	(MOVLstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)
+(MOVWstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+	(MOVWstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)
+(MOVBstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(int64(off)) =>
+	(MOVBstoreconst [makeValAndOff32(c,off)] {sym} ptr mem)

 // Fold address offsets into constant stores.
-(MOV(L|W|B)storeconst [sc] {s} (ADDLconst [off] ptr) mem) && ValAndOff(sc).canAdd(off) ->
-	(MOV(L|W|B)storeconst [ValAndOff(sc).add(off)] {s} ptr mem)
+(MOV(L|W|B)storeconst [sc] {s} (ADDLconst [off] ptr) mem) && sc.canAdd32(off) =>
+	(MOV(L|W|B)storeconst [sc.addOffset32(off)] {s} ptr mem)

 // We need to fold LEAL into the MOVx ops so that the live variable analysis knows
 // what variables are being read/written by the ops.
@ -637,111 +640,113 @@
 // a separate instruction gives us that register.  Having the LEAL be
 // a separate instruction also allows it to be CSEd (which is good because
 // it compiles to a thunk call).
-(MOV(L|W|B|SS|SD|BLSX|WLSX)load  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2)
-  && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-        (MOV(L|W|B|SS|SD|BLSX|WLSX)load  [off1+off2] {mergeSym(sym1,sym2)} base mem)
+(MOV(L|W|B|SS|SD|BLSX|WLSX)load  [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+  && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+        (MOV(L|W|B|SS|SD|BLSX|WLSX)load  [off1+off2] {mergeSymTyped(sym1,sym2)} base mem)

-(MOV(L|W|B|SS|SD)store  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2)
-  && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	(MOV(L|W|B|SS|SD)store  [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+(MOV(L|W|B|SS|SD)store  [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2)
+  && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	(MOV(L|W|B|SS|SD)store  [off1+off2] {mergeSymTyped(sym1,sym2)} base val mem)

-(MOV(L|W|B)storeconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off)
-  && (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
-	(MOV(L|W|B)storeconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
+(MOV(L|W|B)storeconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && sc.canAdd32(off)
+  && (ptr.Op != OpSB || !config.ctxt.Flag_shared) =>
+	(MOV(L|W|B)storeconst [sc.addOffset32(off)] {mergeSymTyped(sym1, sym2)} ptr mem)

 ((ADD|SUB|MUL|AND|OR|XOR)Lload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
-	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	((ADD|SUB|MUL|AND|OR|XOR)Lload [off1+off2] {mergeSymTyped(sym1,sym2)} val base mem)
 ((ADD|SUB|MUL|DIV)SSload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
-	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	((ADD|SUB|MUL|DIV)SSload [off1+off2] {mergeSymTyped(sym1,sym2)} val base mem)
 ((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAL [off2] {sym2} base) mem)
-	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem)
+	&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSymTyped(sym1,sym2)} val base mem)
 ((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
-	&& is32Bit(off1+off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem)
+	&& is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSymTyped(sym1,sym2)} base val mem)
 ((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAL [off2] {sym2} base) mem)
-	&& ValAndOff(valoff1).canAdd(off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) ->
-	((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).add(off2)] {mergeSym(sym1,sym2)} base mem)
+	&& valoff1.canAdd32(off2) && canMergeSym(sym1, sym2) && (base.Op != OpSB || !config.ctxt.Flag_shared) =>
+	((ADD|AND|OR|XOR)Lconstmodify [valoff1.addOffset32(off2)] {mergeSymTyped(sym1,sym2)} base mem)

 // Merge load/store to op
-((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) -> ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
-((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
-((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) -> ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
-(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) -> ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
-(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) ->
+((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
+(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
 	((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
 (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr mem)) mem)
-	&& y.Uses==1 && l.Uses==1 && clobber(y, l) && validValAndOff(c,off) ->
-	((ADD|AND|OR|XOR)Lconstmodify [makeValAndOff(c,off)] {sym} ptr mem)
+	&& y.Uses==1 && l.Uses==1 && clobber(y, l) && validValAndOff(int64(c),int64(off)) =>
+	((ADD|AND|OR|XOR)Lconstmodify [makeValAndOff32(c,off)] {sym} ptr mem)

 // fold LEALs together
-(LEAL [off1] {sym1} (LEAL [off2] {sym2} x)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL [off1+off2] {mergeSym(sym1,sym2)} x)
+(LEAL [off1] {sym1} (LEAL [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL [off1+off2] {mergeSymTyped(sym1,sym2)} x)

 // LEAL into LEAL1
-(LEAL1 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
-       (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL1 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+       (LEAL1 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)

 // LEAL1 into LEAL
-(LEAL [off1] {sym1} (LEAL1 [off2] {sym2} x y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-       (LEAL1 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+       (LEAL1 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)

 // LEAL into LEAL[248]
-(LEAL2 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
-       (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
-(LEAL4 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
-       (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
-(LEAL8 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
-       (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL2 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+       (LEAL2 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)
+(LEAL4 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+       (LEAL4 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)
+(LEAL8 [off1] {sym1} (LEAL [off2] {sym2} x) y) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) && x.Op != OpSB =>
+       (LEAL8 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)

 // LEAL[248] into LEAL
-(LEAL [off1] {sym1} (LEAL2 [off2] {sym2} x y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL2 [off1+off2] {mergeSym(sym1,sym2)} x y)
-(LEAL [off1] {sym1} (LEAL4 [off2] {sym2} x y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL4 [off1+off2] {mergeSym(sym1,sym2)} x y)
-(LEAL [off1] {sym1} (LEAL8 [off2] {sym2} x y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL8 [off1+off2] {mergeSym(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL2 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL2 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL4 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL4 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)
+(LEAL [off1] {sym1} (LEAL8 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL8 [off1+off2] {mergeSymTyped(sym1,sym2)} x y)

 // LEAL[1248] into LEAL[1248]. Only some such merges are possible.
-(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} y y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} x y)
-(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} x y)) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
-      (LEAL2 [off1+off2] {mergeSym(sym1, sym2)} y x)
-(LEAL2 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(off1+2*off2) ->
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} y y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL2 [off1+off2] {mergeSymTyped(sym1, sym2)} x y)
+(LEAL1 [off1] {sym1} x (LEAL1 [off2] {sym2} x y)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) =>
+      (LEAL2 [off1+off2] {mergeSymTyped(sym1, sym2)} y x)
+(LEAL2 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+2*int64(off2)) =>
      (LEAL4 [off1+2*off2] {sym} x y)
-(LEAL4 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(off1+4*off2) ->
+(LEAL4 [off1] {sym} x (LEAL1 [off2] {nil} y y)) && is32Bit(int64(off1)+4*int64(off2)) =>
      (LEAL8 [off1+4*off2] {sym} x y)

 // Absorb InvertFlags into branches.
-(LT (InvertFlags cmp) yes no) -> (GT cmp yes no)
-(GT (InvertFlags cmp) yes no) -> (LT cmp yes no)
-(LE (InvertFlags cmp) yes no) -> (GE cmp yes no)
-(GE (InvertFlags cmp) yes no) -> (LE cmp yes no)
-(ULT (InvertFlags cmp) yes no) -> (UGT cmp yes no)
-(UGT (InvertFlags cmp) yes no) -> (ULT cmp yes no)
-(ULE (InvertFlags cmp) yes no) -> (UGE cmp yes no)
-(UGE (InvertFlags cmp) yes no) -> (ULE cmp yes no)
-(EQ (InvertFlags cmp) yes no) -> (EQ cmp yes no)
-(NE (InvertFlags cmp) yes no) -> (NE cmp yes no)
+(LT (InvertFlags cmp) yes no) => (GT cmp yes no)
+(GT (InvertFlags cmp) yes no) => (LT cmp yes no)
+(LE (InvertFlags cmp) yes no) => (GE cmp yes no)
+(GE (InvertFlags cmp) yes no) => (LE cmp yes no)
+(ULT (InvertFlags cmp) yes no) => (UGT cmp yes no)
+(UGT (InvertFlags cmp) yes no) => (ULT cmp yes no)
+(ULE (InvertFlags cmp) yes no) => (UGE cmp yes no)
+(UGE (InvertFlags cmp) yes no) => (ULE cmp yes no)
+(EQ (InvertFlags cmp) yes no) => (EQ cmp yes no)
+(NE (InvertFlags cmp) yes no) => (NE cmp yes no)

 // Constant comparisons.
-(CMPLconst (MOVLconst [x]) [y]) && int32(x)==int32(y) -> (FlagEQ)
-(CMPLconst (MOVLconst [x]) [y]) && int32(x)<int32(y) && uint32(x)<uint32(y) -> (FlagLT_ULT)
-(CMPLconst (MOVLconst [x]) [y]) && int32(x)<int32(y) && uint32(x)>uint32(y) -> (FlagLT_UGT)
-(CMPLconst (MOVLconst [x]) [y]) && int32(x)>int32(y) && uint32(x)<uint32(y) -> (FlagGT_ULT)
-(CMPLconst (MOVLconst [x]) [y]) && int32(x)>int32(y) && uint32(x)>uint32(y) -> (FlagGT_UGT)
-(CMPWconst (MOVLconst [x]) [y]) && int16(x)==int16(y) -> (FlagEQ)
-(CMPWconst (MOVLconst [x]) [y]) && int16(x)<int16(y) && uint16(x)<uint16(y) -> (FlagLT_ULT)
-(CMPWconst (MOVLconst [x]) [y]) && int16(x)<int16(y) && uint16(x)>uint16(y) -> (FlagLT_UGT)
-(CMPWconst (MOVLconst [x]) [y]) && int16(x)>int16(y) && uint16(x)<uint16(y) -> (FlagGT_ULT)
-(CMPWconst (MOVLconst [x]) [y]) && int16(x)>int16(y) && uint16(x)>uint16(y) -> (FlagGT_UGT)
-(CMPBconst (MOVLconst [x]) [y]) && int8(x)==int8(y) -> (FlagEQ)
-(CMPBconst (MOVLconst [x]) [y]) && int8(x)<int8(y) && uint8(x)<uint8(y) -> (FlagLT_ULT)
-(CMPBconst (MOVLconst [x]) [y]) && int8(x)<int8(y) && uint8(x)>uint8(y) -> (FlagLT_UGT)
-(CMPBconst (MOVLconst [x]) [y]) && int8(x)>int8(y) && uint8(x)<uint8(y) -> (FlagGT_ULT)
-(CMPBconst (MOVLconst [x]) [y]) && int8(x)>int8(y) && uint8(x)>uint8(y) -> (FlagGT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x==y                       => (FlagEQ)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)<uint32(y) => (FlagLT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x<y && uint32(x)>uint32(y) => (FlagLT_UGT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)<uint32(y) => (FlagGT_ULT)
+(CMPLconst (MOVLconst [x]) [y]) && x>y && uint32(x)>uint32(y) => (FlagGT_UGT)
+
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)==y                       => (FlagEQ)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)<uint16(y) => (FlagLT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)<y && uint16(x)>uint16(y) => (FlagLT_UGT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)<uint16(y) => (FlagGT_ULT)
+(CMPWconst (MOVLconst [x]) [y]) && int16(x)>y && uint16(x)>uint16(y) => (FlagGT_UGT)
+
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)==y                      => (FlagEQ)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)<uint8(y) => (FlagLT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)<y && uint8(x)>uint8(y) => (FlagLT_UGT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)<uint8(y) => (FlagGT_ULT)
+(CMPBconst (MOVLconst [x]) [y]) && int8(x)>y && uint8(x)>uint8(y) => (FlagGT_UGT)

 // Other known comparisons.
 (CMPLconst (SHRLconst _ [c]) [n]) && 0 <= n && 0 < c && c <= 32 && (1<<uint64(32-c)) <= uint64(n) -> (FlagLT_ULT)
--- a/src/cmd/compile/internal/ssa/op.go
+++ b/src/cmd/compile/internal/ssa/op.go
@ -190,6 +190,11 @@ func (x ValAndOff) canAdd(off int64) bool {
 	return newoff == int64(int32(newoff))
 }

+func (x ValAndOff) canAdd32(off int32) bool {
+	newoff := x.Off() + int64(off)
+	return newoff == int64(int32(newoff))
+}
+
 func (x ValAndOff) add(off int64) int64 {
 	if !x.canAdd(off) {
 		panic("invalid ValAndOff.add")
@ -197,6 +202,20 @@ func (x ValAndOff) add(off int64) int64 {
 	return makeValAndOff(x.Val(), x.Off()+off)
 }

+func (x ValAndOff) addOffset32(off int32) ValAndOff {
+	if !x.canAdd32(off) {
+		panic("invalid ValAndOff.add")
+	}
+	return ValAndOff(makeValAndOff(x.Val(), x.Off()+int64(off)))
+}
+
+func (x ValAndOff) addOffset64(off int64) ValAndOff {
+	if !x.canAdd(off) {
+		panic("invalid ValAndOff.add")
+	}
+	return ValAndOff(makeValAndOff(x.Val(), x.Off()+off))
+}
+
 // int128 is a type that stores a 128-bit constant.
 // The only allowed constant right now is 0, so we can cheat quite a bit.
 type int128 int64
--- a/src/cmd/compile/internal/ssa/rewrite.go
+++ b/src/cmd/compile/internal/ssa/rewrite.go
@ -207,9 +207,11 @@ func mergeSym(x, y interface{}) interface{} {
 	}
 	panic(fmt.Sprintf("mergeSym with two non-nil syms %s %s", x, y))
 }
+
 func canMergeSym(x, y interface{}) bool {
 	return x == nil || y == nil
 }
+
 func mergeSymTyped(x, y Sym) Sym {
 	if x == nil {
 		return y
--- a/src/cmd/compile/internal/ssa/rewrite386.go
+++ b/src/cmd/compile/internal/ssa/rewrite386.go