
[dev.ssa] cmd/compile/internal/ssa: implement lots of small (<8-byte) ops.

Lots and lots of ops!
Also XOR for good measure.

Add a pass to the compiler generator to check that all of the
architecture-specific opcodes are handled by genValue.  We will
catch any missing ones if we come across them during compilation,
but probably better to catch them statically.

Change-Id: Ic4adfbec55c8257f88117bc732fa664486262868
Reviewed-on: https://go-review.googlesource.com/12813
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
Keith Randall 2015-07-28 16:04:50 -07:00
parent 7402416a8b
commit 20550cbaf1
7 changed files with 2816 additions and 582 deletions

View File

@ -1628,10 +1628,12 @@ func genValue(v *ssa.Value) {
p.From.Index = regnum(v.Args[1])
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v)
// 2-address opcode arithmetic, symmetric
case ssa.OpAMD64ADDB,
ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL, ssa.OpAMD64ANDW, ssa.OpAMD64ANDB,
ssa.OpAMD64MULQ, ssa.OpAMD64MULL, ssa.OpAMD64MULW,
ssa.OpAMD64ORQ, ssa.OpAMD64ORL, ssa.OpAMD64ORW, ssa.OpAMD64ORB:
ssa.OpAMD64ORQ, ssa.OpAMD64ORL, ssa.OpAMD64ORW, ssa.OpAMD64ORB,
ssa.OpAMD64XORQ, ssa.OpAMD64XORL, ssa.OpAMD64XORW, ssa.OpAMD64XORB,
ssa.OpAMD64MULQ, ssa.OpAMD64MULL, ssa.OpAMD64MULW:
r := regnum(v)
x := regnum(v.Args[0])
y := regnum(v.Args[1])
@ -1652,59 +1654,47 @@ func genValue(v *ssa.Value) {
} else {
p.From.Reg = x
}
case ssa.OpAMD64ADDQconst:
// TODO: use addq instead of leaq if target is in the right register.
p := Prog(x86.ALEAQ)
p.From.Type = obj.TYPE_MEM
p.From.Reg = regnum(v.Args[0])
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v)
case ssa.OpAMD64MULQconst:
// 2-address opcode arithmetic, not symmetric
case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL, ssa.OpAMD64SUBW, ssa.OpAMD64SUBB:
r := regnum(v)
x := regnum(v.Args[0])
if r != x {
p := Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
p.To.Reg = r
y := regnum(v.Args[1])
var neg bool
if y == r {
// compute -(y-x) instead
x, y = y, x
neg = true
}
p := Prog(x86.AIMULQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
// instead of using the MOVQ above.
//p.From3 = new(obj.Addr)
//p.From3.Type = obj.TYPE_REG
//p.From3.Reg = regnum(v.Args[0])
case ssa.OpAMD64SUBQconst:
// This code compensates for the fact that the register allocator
// doesn't understand 2-address instructions yet. TODO: fix that.
x := regnum(v.Args[0])
r := regnum(v)
if x != r {
p := Prog(x86.AMOVQ)
p := Prog(regMoveAMD64(v.Type.Size()))
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
p := Prog(x86.ASUBQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p := Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64SHLQ, ssa.OpAMD64SHRQ, ssa.OpAMD64SARQ:
p.From.Reg = y
if neg {
p := Prog(x86.ANEGQ) // TODO: use correct size? This is mostly a hack until regalloc does 2-address correctly
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
case ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL, ssa.OpAMD64SHLW, ssa.OpAMD64SHLB,
ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB:
x := regnum(v.Args[0])
r := regnum(v)
if x != r {
if r == x86.REG_CX {
v.Fatalf("can't implement %s, target and shift both in CX", v.LongString())
}
p := Prog(x86.AMOVQ)
p := Prog(regMoveAMD64(v.Type.Size()))
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
@ -1715,11 +1705,57 @@ func genValue(v *ssa.Value) {
p.From.Reg = regnum(v.Args[1]) // should be CX
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64ANDQconst, ssa.OpAMD64SHLQconst, ssa.OpAMD64SHRQconst, ssa.OpAMD64SARQconst, ssa.OpAMD64XORQconst:
case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst, ssa.OpAMD64ADDWconst:
// TODO: use addq instead of leaq if target is in the right register.
var asm int
switch v.Op {
case ssa.OpAMD64ADDQconst:
asm = x86.ALEAQ
case ssa.OpAMD64ADDLconst:
asm = x86.ALEAL
case ssa.OpAMD64ADDWconst:
asm = x86.ALEAW
}
p := Prog(asm)
p.From.Type = obj.TYPE_MEM
p.From.Reg = regnum(v.Args[0])
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v)
case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst, ssa.OpAMD64MULWconst:
r := regnum(v)
x := regnum(v.Args[0])
if r != x {
p := Prog(regMoveAMD64(v.Type.Size()))
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
p := Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
// instead of using the MOVQ above.
//p.From3 = new(obj.Addr)
//p.From3.Type = obj.TYPE_REG
//p.From3.Reg = regnum(v.Args[0])
case ssa.OpAMD64ADDBconst,
ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst, ssa.OpAMD64ANDWconst, ssa.OpAMD64ANDBconst,
ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst, ssa.OpAMD64ORWconst, ssa.OpAMD64ORBconst,
ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst, ssa.OpAMD64XORWconst, ssa.OpAMD64XORBconst,
ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, ssa.OpAMD64SUBWconst, ssa.OpAMD64SUBBconst,
ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, ssa.OpAMD64SHLWconst, ssa.OpAMD64SHLBconst,
ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst:
// This code compensates for the fact that the register allocator
// doesn't understand 2-address instructions yet. TODO: fix that.
x := regnum(v.Args[0])
r := regnum(v)
if x != r {
p := Prog(x86.AMOVQ)
p := Prog(regMoveAMD64(v.Type.Size()))
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
@ -1732,7 +1768,7 @@ func genValue(v *ssa.Value) {
p.To.Reg = r
case ssa.OpAMD64SBBQcarrymask:
r := regnum(v)
p := Prog(x86.ASBBQ)
p := Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_REG
@ -1785,14 +1821,16 @@ func genValue(v *ssa.Value) {
addAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v)
case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB, ssa.OpAMD64TESTB, ssa.OpAMD64TESTQ:
case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB:
p := Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = regnum(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v.Args[1])
case ssa.OpAMD64CMPQconst:
p := Prog(x86.ACMPQ)
case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst,
ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
p := Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = regnum(v.Args[0])
p.To.Type = obj.TYPE_CONST
@ -1943,6 +1981,16 @@ func genValue(v *ssa.Value) {
p := Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = regnum(v)
case ssa.OpAMD64InvertFlags:
v.Fatalf("InvertFlags should never make it to codegen %v", v)
case ssa.OpAMD64REPSTOSQ:
Prog(x86.AREP)
Prog(x86.ASTOSQ)
v.Unimplementedf("REPSTOSQ clobbers not implemented: %s", v.LongString())
case ssa.OpAMD64REPMOVSB:
Prog(x86.AREP)
Prog(x86.AMOVSB)
v.Unimplementedf("REPMOVSB clobbers not implemented: %s", v.LongString())
default:
v.Unimplementedf("genValue not implemented: %s", v.LongString())
}
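The compensation sequences above repeatedly emit a register move with Prog(regMoveAMD64(v.Type.Size())), but the helper itself is outside this diff. A plausible sketch, assuming it does nothing more than pick a plain move opcode by operand width (not the actual implementation):

import "cmd/internal/obj/x86"

// regMoveAMD64 picks a register-to-register move opcode for a value
// of the given width in bytes (a sketch; the real helper lives
// elsewhere in ssa.go).
func regMoveAMD64(width int64) int {
	switch width {
	case 1:
		return x86.AMOVB
	case 2:
		return x86.AMOVW
	case 4:
		return x86.AMOVL
	case 8:
		return x86.AMOVQ
	default:
		panic("bad register width")
	}
}

This replaces the hardcoded x86.AMOVQ the old code used for every width, which is why so many cases in these hunks swap one Prog line for the other.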

View File

@ -16,6 +16,20 @@
(Add16 x y) -> (ADDW x y)
(Add8 x y) -> (ADDB x y)
(Sub64 x y) -> (SUBQ x y)
(Sub32 x y) -> (SUBL x y)
(Sub16 x y) -> (SUBW x y)
(Sub8 x y) -> (SUBB x y)
(Mul64 x y) -> (MULQ x y)
(MulPtr x y) -> (MULQ x y)
(Mul32 x y) -> (MULL x y)
(Mul16 x y) -> (MULW x y)
// Note: we use 16-bit multiply instructions for 8-bit multiplies because
// the 16-bit multiply instructions are more forgiving (they operate on
// any register instead of just AX/DX).
(Mul8 x y) -> (MULW x y)
(And64 x y) -> (ANDQ x y)
(And32 x y) -> (ANDL x y)
(And16 x y) -> (ANDW x y)
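The Mul8 note above is safe because the low 8 bits of a product depend only on the low 8 bits of its operands, so a 16-bit multiply truncated back to 8 bits gives exactly the 8-bit result. A one-line Go statement of that identity (illustrative, not compiler code):

// mul8ViaMulw mimics lowering Mul8 to MULW: multiply in 16 bits,
// then keep the low 8. For every pair of int8 inputs this equals x*y.
func mul8ViaMulw(x, y int8) int8 {
	return int8(int16(x) * int16(y))
}

The 16-bit form is preferred only because the one-operand 8-bit multiply is tied to AX/DX, as the note says; the arithmetic itself is unchanged.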
@ -26,25 +40,16 @@
(Or16 x y) -> (ORW x y)
(Or8 x y) -> (ORB x y)
(Sub64 x y) -> (SUBQ x y)
(Sub32 x y) -> (SUBL x y)
(Sub16 x y) -> (SUBW x y)
(Sub8 x y) -> (SUBB x y)
(Xor64 x y) -> (XORQ x y)
(Xor32 x y) -> (XORL x y)
(Xor16 x y) -> (XORW x y)
(Xor8 x y) -> (XORB x y)
(Neg64 x) -> (NEGQ x)
(Neg32 x) -> (NEGL x)
(Neg16 x) -> (NEGW x)
(Neg8 x) -> (NEGB x)
(Mul64 x y) -> (MULQ x y)
(MulPtr x y) -> (MULQ x y)
(Mul32 x y) -> (MULL x y)
(Mul16 x y) -> (MULW x y)
// Note: we use 16-bit multiply instructions for 8-bit multiplies because
// the 16-bit multiply instructions are more forgiving (they operate on
// any register instead of just AX/DX).
(Mul8 x y) -> (MULW x y)
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 x) -> (MOVBQSX x)
(SignExt8to32 x) -> (MOVBQSX x)
@ -76,8 +81,43 @@
// Note: unsigned shifts need to return 0 if shift amount is >= 64.
// mask = shift >= 64 ? 0 : 0xffffffffffffffff
// result = mask & arg << shift
// TODO: define ops per right-hand side size, like Lsh64x32 for int64(x)<<uint32(y)?
(Lsh64 <t> x y) && y.Type.Size() == 8 ->
(ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [64] y)))
(Lsh64 <t> x y) && y.Type.Size() == 4 ->
(ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPLconst <TypeFlags> [64] y)))
(Lsh64 <t> x y) && y.Type.Size() == 2 ->
(ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPWconst <TypeFlags> [64] y)))
(Lsh64 <t> x y) && y.Type.Size() == 1 ->
(ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPBconst <TypeFlags> [64] y)))
(Lsh32 <t> x y) && y.Type.Size() == 8 ->
(ANDL (SHLL <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [32] y)))
(Lsh32 <t> x y) && y.Type.Size() == 4 ->
(ANDL (SHLL <t> x y) (SBBQcarrymask <t> (CMPLconst <TypeFlags> [32] y)))
(Lsh32 <t> x y) && y.Type.Size() == 2 ->
(ANDL (SHLL <t> x y) (SBBQcarrymask <t> (CMPWconst <TypeFlags> [32] y)))
(Lsh32 <t> x y) && y.Type.Size() == 1 ->
(ANDL (SHLL <t> x y) (SBBQcarrymask <t> (CMPBconst <TypeFlags> [32] y)))
(Lsh16 <t> x y) && y.Type.Size() == 8 ->
(ANDW (SHLW <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [16] y)))
(Lsh16 <t> x y) && y.Type.Size() == 4 ->
(ANDW (SHLW <t> x y) (SBBQcarrymask <t> (CMPLconst <TypeFlags> [16] y)))
(Lsh16 <t> x y) && y.Type.Size() == 2 ->
(ANDW (SHLW <t> x y) (SBBQcarrymask <t> (CMPWconst <TypeFlags> [16] y)))
(Lsh16 <t> x y) && y.Type.Size() == 1 ->
(ANDW (SHLW <t> x y) (SBBQcarrymask <t> (CMPBconst <TypeFlags> [16] y)))
(Lsh8 <t> x y) && y.Type.Size() == 8 ->
(ANDB (SHLB <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [8] y)))
(Lsh8 <t> x y) && y.Type.Size() == 4 ->
(ANDB (SHLB <t> x y) (SBBQcarrymask <t> (CMPLconst <TypeFlags> [8] y)))
(Lsh8 <t> x y) && y.Type.Size() == 2 ->
(ANDB (SHLB <t> x y) (SBBQcarrymask <t> (CMPWconst <TypeFlags> [8] y)))
(Lsh8 <t> x y) && y.Type.Size() == 1 ->
(ANDB (SHLB <t> x y) (SBBQcarrymask <t> (CMPBconst <TypeFlags> [8] y)))
(Rsh64U <t> x y) && y.Type.Size() == 8 ->
(ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [64] y)))
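The mask recipe in the comment at the top of this hunk can be stated directly in Go. SBBQcarrymask materializes -1 exactly when the preceding compare sets the carry flag, i.e. when the unsigned shift amount is below the width, so ANDing it in zeroes the result for oversized shifts (a minimal sketch; names are illustrative):

// lsh64 models (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMPQconst <TypeFlags> [64] y))).
func lsh64(x, y uint64) uint64 {
	shifted := x << (y & 63) // SHLQ: the hardware shift amount is taken mod 64
	var mask uint64
	if y < 64 { // CMPQconst [64] y: carry set exactly when y < 64 (unsigned)
		mask = ^uint64(0) // SBBQcarrymask: -1 when carry set, 0 otherwise
	}
	return shifted & mask // ANDQ: forces the result to 0 when y >= 64
}

The narrower variants change only the constant (32, 16, 8) and the compare width, which is all the quadruples of rules above vary.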
@ -158,7 +198,7 @@
(Move [size] dst src mem) -> (REPMOVSB dst src (MOVQconst <TypeUInt64> [size]) mem)
(Not x) -> (XORQconst [1] x)
(Not x) -> (XORBconst [1] x)
(OffPtr [off] ptr) -> (ADDQconst [off] ptr)
@ -193,20 +233,86 @@
// TODO: Should this be a separate pass?
// fold constants into instructions
// TODO: restrict c to int32 range for all?
(ADDQ x (MOVQconst [c])) && is32Bit(c) -> (ADDQconst [c] x)
(ADDQ (MOVQconst [c]) x) && is32Bit(c) -> (ADDQconst [c] x)
(SUBQ x (MOVQconst [c])) -> (SUBQconst x [c])
(SUBQ <t> (MOVQconst [c]) x) -> (NEGQ (SUBQconst <t> x [c]))
(ADDL x (MOVLconst [c])) -> (ADDLconst [c] x)
(ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x)
(ADDW x (MOVWconst [c])) -> (ADDWconst [c] x)
(ADDW (MOVWconst [c]) x) -> (ADDWconst [c] x)
(ADDB x (MOVBconst [c])) -> (ADDBconst [c] x)
(ADDB (MOVBconst [c]) x) -> (ADDBconst [c] x)
(SUBQ x (MOVQconst [c])) && is32Bit(c) -> (SUBQconst x [c])
(SUBQ (MOVQconst [c]) x) && is32Bit(c) -> (NEGQ (SUBQconst <v.Type> x [c]))
(SUBL x (MOVLconst [c])) -> (SUBLconst x [c])
(SUBL (MOVLconst [c]) x) -> (NEGL (SUBLconst <v.Type> x [c]))
(SUBW x (MOVWconst [c])) -> (SUBWconst x [c])
(SUBW (MOVWconst [c]) x) -> (NEGW (SUBWconst <v.Type> x [c]))
(SUBB x (MOVBconst [c])) -> (SUBBconst x [c])
(SUBB (MOVBconst [c]) x) -> (NEGB (SUBBconst <v.Type> x [c]))
(MULQ x (MOVQconst [c])) && is32Bit(c) -> (MULQconst [c] x)
(MULQ (MOVQconst [c]) x) && is32Bit(c) -> (MULQconst [c] x)
(ANDQ x (MOVQconst [c])) -> (ANDQconst [c] x)
(ANDQ (MOVQconst [c]) x) -> (ANDQconst [c] x)
(SHLQ x (MOVQconst [c])) -> (SHLQconst [c] x)
(SHRQ x (MOVQconst [c])) -> (SHRQconst [c] x)
(SARQ x (MOVQconst [c])) -> (SARQconst [c] x)
(CMPQ x (MOVQconst [c])) -> (CMPQconst x [c])
(CMPQ (MOVQconst [c]) x) -> (InvertFlags (CMPQconst <TypeFlags> x [c]))
(MULL x (MOVLconst [c])) -> (MULLconst [c] x)
(MULL (MOVLconst [c]) x) -> (MULLconst [c] x)
(MULW x (MOVWconst [c])) -> (MULWconst [c] x)
(MULW (MOVWconst [c]) x) -> (MULWconst [c] x)
(ANDQ x (MOVQconst [c])) && is32Bit(c) -> (ANDQconst [c] x)
(ANDQ (MOVQconst [c]) x) && is32Bit(c) -> (ANDQconst [c] x)
(ANDL x (MOVLconst [c])) -> (ANDLconst [c] x)
(ANDL (MOVLconst [c]) x) -> (ANDLconst [c] x)
(ANDW x (MOVWconst [c])) -> (ANDWconst [c] x)
(ANDW (MOVWconst [c]) x) -> (ANDWconst [c] x)
(ANDB x (MOVBconst [c])) -> (ANDBconst [c] x)
(ANDB (MOVBconst [c]) x) -> (ANDBconst [c] x)
(ORQ x (MOVQconst [c])) && is32Bit(c) -> (ORQconst [c] x)
(ORQ (MOVQconst [c]) x) && is32Bit(c) -> (ORQconst [c] x)
(ORL x (MOVLconst [c])) -> (ORLconst [c] x)
(ORL (MOVLconst [c]) x) -> (ORLconst [c] x)
(ORW x (MOVWconst [c])) -> (ORWconst [c] x)
(ORW (MOVWconst [c]) x) -> (ORWconst [c] x)
(ORB x (MOVBconst [c])) -> (ORBconst [c] x)
(ORB (MOVBconst [c]) x) -> (ORBconst [c] x)
(XORQ x (MOVQconst [c])) && is32Bit(c) -> (XORQconst [c] x)
(XORQ (MOVQconst [c]) x) && is32Bit(c) -> (XORQconst [c] x)
(XORL x (MOVLconst [c])) -> (XORLconst [c] x)
(XORL (MOVLconst [c]) x) -> (XORLconst [c] x)
(XORW x (MOVWconst [c])) -> (XORWconst [c] x)
(XORW (MOVWconst [c]) x) -> (XORWconst [c] x)
(XORB x (MOVBconst [c])) -> (XORBconst [c] x)
(XORB (MOVBconst [c]) x) -> (XORBconst [c] x)
(SHLQ x (MOVQconst [c])) -> (SHLQconst [c&63] x)
(SHLL x (MOVLconst [c])) -> (SHLLconst [c&31] x)
(SHLW x (MOVWconst [c])) -> (SHLWconst [c&31] x)
(SHLB x (MOVBconst [c])) -> (SHLBconst [c&31] x)
(SHRQ x (MOVQconst [c])) -> (SHRQconst [c&63] x)
(SHRL x (MOVLconst [c])) -> (SHRLconst [c&31] x)
(SHRW x (MOVWconst [c])) -> (SHRWconst [c&31] x)
(SHRB x (MOVBconst [c])) -> (SHRBconst [c&31] x)
(SARQ x (MOVQconst [c])) -> (SARQconst [c&63] x)
(SARL x (MOVLconst [c])) -> (SARLconst [c&31] x)
(SARW x (MOVWconst [c])) -> (SARWconst [c&31] x)
(SARB x (MOVBconst [c])) -> (SARBconst [c&31] x)
// Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
// because the x86 instructions are defined to use all 5 bits of the shift even
// for the small shifts. I don't think we'll ever generate a weird shift (e.g.
// (SHLW x (MOVWconst [24]))), but just in case.
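To see why five bits are kept, take the note's own example: in (SHLW x (MOVWconst [24])) the hardware honors all of 24, so every bit of the 16-bit value is shifted out and the result is 0; masking the constant with 15 would instead turn it into a shift by 8. A tiny Go model (illustrative only):

// shlwConst models folding (SHLW x (MOVWconst [c])) into (SHLWconst [c&31] x).
func shlwConst(x uint16, c uint8) uint16 {
	// c&31 matches the hardware: c = 24 stays 24 and yields 0,
	// whereas c&15 would wrongly rewrite it as a shift by 8.
	return x << (c & 31)
}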
(CMPQ x (MOVQconst [c])) && is32Bit(c) -> (CMPQconst x [c])
(CMPQ (MOVQconst [c]) x) && is32Bit(c) -> (InvertFlags (CMPQconst <TypeFlags> x [c]))
(CMPL x (MOVLconst [c])) -> (CMPLconst x [c])
(CMPL (MOVLconst [c]) x) -> (InvertFlags (CMPLconst <TypeFlags> x [c]))
(CMPW x (MOVWconst [c])) -> (CMPWconst x [c])
(CMPW (MOVWconst [c]) x) -> (InvertFlags (CMPWconst <TypeFlags> x [c]))
(CMPB x (MOVBconst [c])) -> (CMPBconst x [c])
(CMPB (MOVBconst [c]) x) -> (InvertFlags (CMPBconst <TypeFlags> x [c]))
// strength reduction
(MULQconst [-1] x) -> (NEGQ x)

View File

@ -96,25 +96,110 @@ func init() {
// TODO: 2-address instructions. Mark ops as needing matching input/output regs.
var AMD64ops = []opData{
{name: "MULQ", reg: gp21, asm: "IMULQ"}, // arg0 * arg1
{name: "MULQconst", reg: gp11, asm: "IMULQ"}, // arg0 * auxint
{name: "SHLQ", reg: gp21shift, asm: "SHLQ"}, // arg0 << arg1, shift amount is mod 64
{name: "SHLQconst", reg: gp11, asm: "SHLQ"}, // arg0 << auxint, shift amount 0-63
{name: "SHRQ", reg: gp21shift, asm: "SHRQ"}, // unsigned arg0 >> arg1, shift amount is mod 64
{name: "SHRQconst", reg: gp11, asm: "SHRQ"}, // unsigned arg0 >> auxint, shift amount 0-63
{name: "SARQ", reg: gp21shift, asm: "SARQ"}, // signed arg0 >> arg1, shift amount is mod 64
{name: "SARQconst", reg: gp11, asm: "SARQ"}, // signed arg0 >> auxint, shift amount 0-63
// binary ops
{name: "ADDQ", reg: gp21, asm: "ADDQ"}, // arg0 + arg1
{name: "ADDL", reg: gp21, asm: "ADDL"}, // arg0 + arg1
{name: "ADDW", reg: gp21, asm: "ADDW"}, // arg0 + arg1
{name: "ADDB", reg: gp21, asm: "ADDB"}, // arg0 + arg1
{name: "ADDQconst", reg: gp11, asm: "ADDQ"}, // arg0 + auxint
{name: "ADDLconst", reg: gp11, asm: "ADDL"}, // arg0 + auxint
{name: "ADDWconst", reg: gp11, asm: "ADDW"}, // arg0 + auxint
{name: "ADDBconst", reg: gp11, asm: "ADDB"}, // arg0 + auxint
{name: "XORQconst", reg: gp11, asm: "XORQ"}, // arg0^auxint
{name: "SUBQ", reg: gp21, asm: "SUBQ"}, // arg0 - arg1
{name: "SUBL", reg: gp21, asm: "SUBL"}, // arg0 - arg1
{name: "SUBW", reg: gp21, asm: "SUBW"}, // arg0 - arg1
{name: "SUBB", reg: gp21, asm: "SUBB"}, // arg0 - arg1
{name: "SUBQconst", reg: gp11, asm: "SUBQ"}, // arg0 - auxint
{name: "SUBLconst", reg: gp11, asm: "SUBL"}, // arg0 - auxint
{name: "SUBWconst", reg: gp11, asm: "SUBW"}, // arg0 - auxint
{name: "SUBBconst", reg: gp11, asm: "SUBB"}, // arg0 - auxint
{name: "MULQ", reg: gp21, asm: "IMULQ"}, // arg0 * arg1
{name: "MULL", reg: gp21, asm: "IMULL"}, // arg0 * arg1
{name: "MULW", reg: gp21, asm: "IMULW"}, // arg0 * arg1
{name: "MULQconst", reg: gp11, asm: "IMULQ"}, // arg0 * auxint
{name: "MULLconst", reg: gp11, asm: "IMULL"}, // arg0 * auxint
{name: "MULWconst", reg: gp11, asm: "IMULW"}, // arg0 * auxint
{name: "ANDQ", reg: gp21, asm: "ANDQ"}, // arg0 & arg1
{name: "ANDL", reg: gp21, asm: "ANDL"}, // arg0 & arg1
{name: "ANDW", reg: gp21, asm: "ANDW"}, // arg0 & arg1
{name: "ANDB", reg: gp21, asm: "ANDB"}, // arg0 & arg1
{name: "ANDQconst", reg: gp11, asm: "ANDQ"}, // arg0 & auxint
{name: "ANDLconst", reg: gp11, asm: "ANDL"}, // arg0 & auxint
{name: "ANDWconst", reg: gp11, asm: "ANDW"}, // arg0 & auxint
{name: "ANDBconst", reg: gp11, asm: "ANDB"}, // arg0 & auxint
{name: "ORQ", reg: gp21, asm: "ORQ"}, // arg0 | arg1
{name: "ORL", reg: gp21, asm: "ORL"}, // arg0 | arg1
{name: "ORW", reg: gp21, asm: "ORW"}, // arg0 | arg1
{name: "ORB", reg: gp21, asm: "ORB"}, // arg0 | arg1
{name: "ORQconst", reg: gp11, asm: "ORQ"}, // arg0 | auxint
{name: "ORLconst", reg: gp11, asm: "ORL"}, // arg0 | auxint
{name: "ORWconst", reg: gp11, asm: "ORW"}, // arg0 | auxint
{name: "ORBconst", reg: gp11, asm: "ORB"}, // arg0 | auxint
{name: "XORQ", reg: gp21, asm: "XORQ"}, // arg0 ^ arg1
{name: "XORL", reg: gp21, asm: "XORL"}, // arg0 ^ arg1
{name: "XORW", reg: gp21, asm: "XORW"}, // arg0 ^ arg1
{name: "XORB", reg: gp21, asm: "XORB"}, // arg0 ^ arg1
{name: "XORQconst", reg: gp11, asm: "XORQ"}, // arg0 ^ auxint
{name: "XORLconst", reg: gp11, asm: "XORL"}, // arg0 ^ auxint
{name: "XORWconst", reg: gp11, asm: "XORW"}, // arg0 ^ auxint
{name: "XORBconst", reg: gp11, asm: "XORB"}, // arg0 ^ auxint
{name: "CMPQ", reg: gp2flags, asm: "CMPQ"}, // arg0 compare to arg1
{name: "CMPQconst", reg: gp1flags, asm: "CMPQ"}, // arg0 compare to auxint
{name: "CMPL", reg: gp2flags, asm: "CMPL"}, // arg0 compare to arg1
{name: "CMPW", reg: gp2flags, asm: "CMPW"}, // arg0 compare to arg1
{name: "CMPB", reg: gp2flags, asm: "CMPB"}, // arg0 compare to arg1
{name: "CMPQconst", reg: gp1flags, asm: "CMPQ"}, // arg0 compare to auxint
{name: "CMPLconst", reg: gp1flags, asm: "CMPL"}, // arg0 compare to auxint
{name: "CMPWconst", reg: gp1flags, asm: "CMPW"}, // arg0 compare to auxint
{name: "CMPBconst", reg: gp1flags, asm: "CMPB"}, // arg0 compare to auxint
{name: "TESTQ", reg: gp2flags, asm: "TESTQ"}, // (arg0 & arg1) compare to 0
{name: "TESTB", reg: gp2flags, asm: "TESTB"}, // (arg0 & arg1) compare to 0
{name: "TESTQ", reg: gp2flags, asm: "TESTQ"}, // (arg0 & arg1) compare to 0
{name: "TESTL", reg: gp2flags, asm: "TESTL"}, // (arg0 & arg1) compare to 0
{name: "TESTW", reg: gp2flags, asm: "TESTW"}, // (arg0 & arg1) compare to 0
{name: "TESTB", reg: gp2flags, asm: "TESTB"}, // (arg0 & arg1) compare to 0
{name: "TESTQconst", reg: gp1flags, asm: "TESTQ"}, // (arg0 & auxint) compare to 0
{name: "TESTLconst", reg: gp1flags, asm: "TESTL"}, // (arg0 & auxint) compare to 0
{name: "TESTWconst", reg: gp1flags, asm: "TESTW"}, // (arg0 & auxint) compare to 0
{name: "TESTBconst", reg: gp1flags, asm: "TESTB"}, // (arg0 & auxint) compare to 0
{name: "SHLQ", reg: gp21shift, asm: "SHLQ"}, // arg0 << arg1, shift amount is mod 64
{name: "SHLL", reg: gp21shift, asm: "SHLL"}, // arg0 << arg1, shift amount is mod 32
{name: "SHLW", reg: gp21shift, asm: "SHLW"}, // arg0 << arg1, shift amount is mod 32
{name: "SHLB", reg: gp21shift, asm: "SHLB"}, // arg0 << arg1, shift amount is mod 32
{name: "SHLQconst", reg: gp11, asm: "SHLQ"}, // arg0 << auxint, shift amount 0-63
{name: "SHLLconst", reg: gp11, asm: "SHLL"}, // arg0 << auxint, shift amount 0-31
{name: "SHLWconst", reg: gp11, asm: "SHLW"}, // arg0 << auxint, shift amount 0-31
{name: "SHLBconst", reg: gp11, asm: "SHLB"}, // arg0 << auxint, shift amount 0-31
// Note: x86 is weird, the 16- and 8-bit shifts still use all 5 bits of shift amount!
{name: "SHRQ", reg: gp21shift, asm: "SHRQ"}, // unsigned arg0 >> arg1, shift amount is mod 64
{name: "SHRL", reg: gp21shift, asm: "SHRL"}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRW", reg: gp21shift, asm: "SHRW"}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRB", reg: gp21shift, asm: "SHRB"}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRQconst", reg: gp11, asm: "SHRQ"}, // unsigned arg0 >> auxint, shift amount 0-63
{name: "SHRLconst", reg: gp11, asm: "SHRL"}, // unsigned arg0 >> auxint, shift amount 0-31
{name: "SHRWconst", reg: gp11, asm: "SHRW"}, // unsigned arg0 >> auxint, shift amount 0-31
{name: "SHRBconst", reg: gp11, asm: "SHRB"}, // unsigned arg0 >> auxint, shift amount 0-31
{name: "SARQ", reg: gp21shift, asm: "SARQ"}, // signed arg0 >> arg1, shift amount is mod 64
{name: "SARL", reg: gp21shift, asm: "SARL"}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARW", reg: gp21shift, asm: "SARW"}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARB", reg: gp21shift, asm: "SARB"}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARQconst", reg: gp11, asm: "SARQ"}, // signed arg0 >> auxint, shift amount 0-63
{name: "SARLconst", reg: gp11, asm: "SARL"}, // signed arg0 >> auxint, shift amount 0-31
{name: "SARWconst", reg: gp11, asm: "SARW"}, // signed arg0 >> auxint, shift amount 0-31
{name: "SARBconst", reg: gp11, asm: "SARB"}, // signed arg0 >> auxint, shift amount 0-31
// unary ops
{name: "NEGQ", reg: gp11, asm: "NEGQ"}, // -arg0
{name: "NEGL", reg: gp11, asm: "NEGL"}, // -arg0
{name: "NEGW", reg: gp11, asm: "NEGW"}, // -arg0
{name: "NEGB", reg: gp11, asm: "NEGB"}, // -arg0
{name: "SBBQcarrymask", reg: flagsgp1, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
@ -166,49 +251,12 @@ func init() {
// TODO: implement this when register clobbering works
{name: "REPSTOSQ", reg: regInfo{[]regMask{buildReg("DI"), buildReg("CX")}, buildReg("DI AX CX"), nil}}, // store arg1 8-byte words containing zero into arg0 using STOSQ. arg2=mem.
// Load/store from global. Same as the above loads, but arg0 is missing and
// aux is a GlobalOffset instead of an int64.
{name: "MOVQloadglobal"}, // Load from aux.(GlobalOffset). arg0 = memory
{name: "MOVQstoreglobal"}, // store arg0 to aux.(GlobalOffset). arg1=memory, returns memory.
//TODO: set register clobber to everything?
{name: "CALLstatic"}, // call static function aux.(*gc.Sym). arg0=mem, returns mem
{name: "CALLclosure", reg: regInfo{[]regMask{gpsp, buildReg("DX"), 0}, 0, nil}}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem returns mem
{name: "REPMOVSB", reg: regInfo{[]regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")}, buildReg("DI SI CX"), nil}}, // move arg2 bytes from arg1 to arg0. arg3=mem, returns memory
{name: "ADDQ", reg: gp21}, // arg0 + arg1
{name: "ADDQconst", reg: gp11}, // arg0 + auxint
{name: "ADDL", reg: gp21, asm: "ADDL"}, // arg0 + arg1
{name: "ADDW", reg: gp21, asm: "ADDW"}, // arg0 + arg1
{name: "ADDB", reg: gp21, asm: "ADDB"}, // arg0 + arg1
{name: "SUBQ", reg: gp21, asm: "SUBQ"}, // arg0 - arg1
{name: "SUBQconst", reg: gp11, asm: "SUBQ"}, // arg0 - auxint
{name: "SUBL", reg: gp21, asm: "SUBL"}, // arg0 - arg1
{name: "SUBW", reg: gp21, asm: "SUBW"}, // arg0 - arg1
{name: "SUBB", reg: gp21, asm: "SUBB"}, // arg0 - arg1
{name: "NEGQ", reg: gp11, asm: "NEGQ"}, // -arg0
{name: "NEGL", reg: gp11, asm: "NEGL"}, // -arg0
{name: "NEGW", reg: gp11, asm: "NEGW"}, // -arg0
{name: "NEGB", reg: gp11, asm: "NEGB"}, // -arg0
{name: "MULL", reg: gp21, asm: "IMULL"}, // arg0*arg1
{name: "MULW", reg: gp21, asm: "IMULW"}, // arg0*arg1
{name: "ANDQ", reg: gp21, asm: "ANDQ"}, // arg0 & arg1
{name: "ANDQconst", reg: gp11, asm: "ANDQ"}, // arg0 & auxint
{name: "ANDL", reg: gp21, asm: "ANDL"}, // arg0 & arg1
{name: "ANDW", reg: gp21, asm: "ANDW"}, // arg0 & arg1
{name: "ANDB", reg: gp21, asm: "ANDB"}, // arg0 & arg1
{name: "ORQ", reg: gp21, asm: "ORQ"}, // arg0 | arg1
{name: "ORQconst", reg: gp11, asm: "ORQ"}, // arg0 | auxint
{name: "ORL", reg: gp21, asm: "ORL"}, // arg0 | arg1
{name: "ORW", reg: gp21, asm: "ORW"}, // arg0 | arg1
{name: "ORB", reg: gp21, asm: "ORB"}, // arg0 | arg1
// (InvertFlags (CMPQ a b)) == (CMPQ b a)
// So if we want (SETL (CMPQ a b)) but we can't do that because a is a constant,
// then we do (SETL (InvertFlags (CMPQ b a))) instead.
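A concrete trace, using the constant-compare rule from AMD64.rules above; the last step is presumably performed by a later condition-flipping rewrite, since genValue treats a surviving InvertFlags as a fatal error:

(SETL (CMPQ (MOVQconst [c]) x))
-> (SETL (InvertFlags (CMPQconst <TypeFlags> x [c])))
-> (SETG (CMPQconst <TypeFlags> x [c]))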

View File

@ -37,6 +37,11 @@ var genericOps = []opData{
{name: "Or32"},
{name: "Or64"},
{name: "Xor8"}, // arg0 ^ arg1
{name: "Xor16"},
{name: "Xor32"},
{name: "Xor64"},
{name: "Lsh8"}, // arg0 << arg1
{name: "Lsh16"},
{name: "Lsh32"},

View File

@ -13,6 +13,7 @@ import (
"go/format"
"io/ioutil"
"log"
"regexp"
)
type arch struct {
@ -164,6 +165,28 @@ func genOp() {
if err != nil {
log.Fatalf("can't write output: %v\n", err)
}
// Check that ../gc/ssa.go handles all the arch-specific opcodes.
// This is very much a hack, but it is better than nothing.
ssa, err := ioutil.ReadFile("../../gc/ssa.go")
if err != nil {
log.Fatalf("can't read ../../gc/ssa.go: %v", err)
}
for _, a := range archs {
if a.name == "generic" {
continue
}
for _, v := range a.ops {
pattern := fmt.Sprintf("\\Wssa[.]Op%s%s\\W", a.name, v.name)
match, err := regexp.Match(pattern, ssa)
if err != nil {
log.Fatalf("bad opcode regexp %s: %v", pattern, err)
}
if !match {
log.Fatalf("Op%s%s has no code generation in ../../gc/ssa.go", a.name, v.name)
}
}
}
}
// Name returns the name of the architecture for use in Op* and Block* enumerations.
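To make the \W guards in the generated pattern concrete, here is a small standalone demo (not part of the commit; the opcode and input strings are chosen for illustration):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// The pattern the generator builds for arch "AMD64", op "ADDQ".
	re := regexp.MustCompile(`\Wssa[.]OpAMD64ADDQ\W`)

	fmt.Println(re.MatchString("\tcase ssa.OpAMD64ADDQ,"))      // true: the opcode is handled
	fmt.Println(re.MatchString("\tcase ssa.OpAMD64ADDQconst:")) // false: the trailing \W keeps ADDQ from matching inside ADDQconst
}

Without the guards, any op whose name is a prefix of another (ADDQ vs ADDQconst) would be reported as handled even if only the longer one appears in ssa.go.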

File diff suppressed because it is too large

File diff suppressed because it is too large