
cmd/compile: convert more AMD64.rules lines to typed aux mode

Change-Id: Idded860128b1a23680520d8c2b9f6d8620dcfcc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/228077
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Keith Randall 2020-04-12 17:11:25 -07:00
parent 916ecbc731
commit dc9879e8fd
5 changed files with 408 additions and 375 deletions
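
The conversion is from the generator's untyped aux mode (rules written with "->") to typed aux mode ("=>"): aux and auxint literals in the rules become typed Go expressions, and the generated rewriter reads and writes v.AuxInt through per-type converter helpers instead of bare int64s. A minimal sketch of the difference, using local stand-ins for the converters that rewrite.go defines (not the compiler's actual generated code):

package main

import "fmt"

// Local stand-ins for rewrite.go's converters (auxIntToInt32, int32ToAuxInt, ...).
func int32ToAuxInt(i int32) int64 { return int64(i) }
func auxIntToInt32(i int64) int32 { return int32(i) }

func main() {
    // Untyped mode ("->"): rewrites store the raw constant.
    var auxInt int64 = 8

    // Typed mode ("=>"): the same constant goes through a converter named after the
    // op's declared aux-int type, so a mismatched type fails to compile in the
    // generated rewrite file rather than slipping through as an int64.
    auxInt = int32ToAuxInt(8)
    fmt.Println(auxIntToInt32(auxInt)) // 8
}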

View File

@@ -114,191 +114,191 @@
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to16 ...) -> (MOVBQSX ...)
(SignExt8to32 ...) -> (MOVBQSX ...)
(SignExt8to64 ...) -> (MOVBQSX ...)
(SignExt16to32 ...) -> (MOVWQSX ...)
(SignExt16to64 ...) -> (MOVWQSX ...)
(SignExt32to64 ...) -> (MOVLQSX ...)
(SignExt8to16 ...) => (MOVBQSX ...)
(SignExt8to32 ...) => (MOVBQSX ...)
(SignExt8to64 ...) => (MOVBQSX ...)
(SignExt16to32 ...) => (MOVWQSX ...)
(SignExt16to64 ...) => (MOVWQSX ...)
(SignExt32to64 ...) => (MOVLQSX ...)
(ZeroExt8to16 ...) -> (MOVBQZX ...)
(ZeroExt8to32 ...) -> (MOVBQZX ...)
(ZeroExt8to64 ...) -> (MOVBQZX ...)
(ZeroExt16to32 ...) -> (MOVWQZX ...)
(ZeroExt16to64 ...) -> (MOVWQZX ...)
(ZeroExt32to64 ...) -> (MOVLQZX ...)
(ZeroExt8to16 ...) => (MOVBQZX ...)
(ZeroExt8to32 ...) => (MOVBQZX ...)
(ZeroExt8to64 ...) => (MOVBQZX ...)
(ZeroExt16to32 ...) => (MOVWQZX ...)
(ZeroExt16to64 ...) => (MOVWQZX ...)
(ZeroExt32to64 ...) => (MOVLQZX ...)
(Slicemask <t> x) -> (SARQconst (NEGQ <t> x) [63])
(Slicemask <t> x) => (SARQconst (NEGQ <t> x) [63])
(SpectreIndex <t> x y) -> (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
(SpectreSliceIndex <t> x y) -> (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
(SpectreIndex <t> x y) => (CMOVQCC x (MOVQconst [0]) (CMPQ x y))
(SpectreSliceIndex <t> x y) => (CMOVQHI x (MOVQconst [0]) (CMPQ x y))
// Lowering truncation
// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 ...) -> (Copy ...)
(Trunc32to8 ...) -> (Copy ...)
(Trunc32to16 ...) -> (Copy ...)
(Trunc64to8 ...) -> (Copy ...)
(Trunc64to16 ...) -> (Copy ...)
(Trunc64to32 ...) -> (Copy ...)
(Trunc16to8 ...) => (Copy ...)
(Trunc32to8 ...) => (Copy ...)
(Trunc32to16 ...) => (Copy ...)
(Trunc64to8 ...) => (Copy ...)
(Trunc64to16 ...) => (Copy ...)
(Trunc64to32 ...) => (Copy ...)
// Lowering float <-> int
(Cvt32to32F ...) -> (CVTSL2SS ...)
(Cvt32to64F ...) -> (CVTSL2SD ...)
(Cvt64to32F ...) -> (CVTSQ2SS ...)
(Cvt64to64F ...) -> (CVTSQ2SD ...)
(Cvt32to32F ...) => (CVTSL2SS ...)
(Cvt32to64F ...) => (CVTSL2SD ...)
(Cvt64to32F ...) => (CVTSQ2SS ...)
(Cvt64to64F ...) => (CVTSQ2SD ...)
(Cvt32Fto32 ...) -> (CVTTSS2SL ...)
(Cvt32Fto64 ...) -> (CVTTSS2SQ ...)
(Cvt64Fto32 ...) -> (CVTTSD2SL ...)
(Cvt64Fto64 ...) -> (CVTTSD2SQ ...)
(Cvt32Fto32 ...) => (CVTTSS2SL ...)
(Cvt32Fto64 ...) => (CVTTSS2SQ ...)
(Cvt64Fto32 ...) => (CVTTSD2SL ...)
(Cvt64Fto64 ...) => (CVTTSD2SQ ...)
(Cvt32Fto64F ...) -> (CVTSS2SD ...)
(Cvt64Fto32F ...) -> (CVTSD2SS ...)
(Cvt32Fto64F ...) => (CVTSS2SD ...)
(Cvt64Fto32F ...) => (CVTSD2SS ...)
(Round(32|64)F ...) -> (Copy ...)
(Round(32|64)F ...) => (Copy ...)
(CvtBoolToUint8 ...) -> (Copy ...)
(CvtBoolToUint8 ...) => (Copy ...)
// Lowering shifts
// Unsigned shifts need to return 0 if shift amount is >= width of shifted value.
// result = (arg << shift) & (shift >= argbits ? 0 : 0xffffffffffffffff)
(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHLQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Lsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHLL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLQ x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) -> (SHLL x y)
(Lsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLQ x y)
(Lsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Lsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Lsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SHLL x y)
(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
(Rsh64Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDQ (SHRQ <t> x y) (SBBQcarrymask <t> (CMP(Q|L|W|B)const y [64])))
(Rsh32Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRL <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [32])))
(Rsh16Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRW <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [16])))
(Rsh8Ux(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (ANDL (SHRB <t> x y) (SBBLcarrymask <t> (CMP(Q|L|W|B)const y [8])))
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRQ x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRL x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRW x y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) -> (SHRB x y)
(Rsh64Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRQ x y)
(Rsh32Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRL x y)
(Rsh16Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRW x y)
(Rsh8Ux(64|32|16|8) x y) && shiftIsBounded(v) => (SHRB x y)
// Signed right shift needs to return 0/-1 if shift amount is >= width of shifted value.
// We implement this by setting the shift value to -1 (all ones) if the shift value is >= width.
(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) -> (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
(Rsh64x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARQ <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [64])))))
(Rsh32x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARL <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [32])))))
(Rsh16x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARW <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [16])))))
(Rsh8x(64|32|16|8) <t> x y) && !shiftIsBounded(v) => (SARB <t> x (OR(Q|L|L|L) <y.Type> y (NOT(Q|L|L|L) <y.Type> (SBB(Q|L|L|L)carrymask <y.Type> (CMP(Q|L|W|B)const y [8])))))
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARQ x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARL x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARW x y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) -> (SARB x y)
(Rsh64x(64|32|16|8) x y) && shiftIsBounded(v) => (SARQ x y)
(Rsh32x(64|32|16|8) x y) && shiftIsBounded(v) => (SARL x y)
(Rsh16x(64|32|16|8) x y) && shiftIsBounded(v) => (SARW x y)
(Rsh8x(64|32|16|8) x y) && shiftIsBounded(v) => (SARB x y)
// Lowering integer comparisons
(Less(64|32|16|8) x y) -> (SETL (CMP(Q|L|W|B) x y))
(Less(64|32|16|8)U x y) -> (SETB (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8) x y) -> (SETLE (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8)U x y) -> (SETBE (CMP(Q|L|W|B) x y))
(Eq(Ptr|64|32|16|8|B) x y) -> (SETEQ (CMP(Q|Q|L|W|B|B) x y))
(Neq(Ptr|64|32|16|8|B) x y) -> (SETNE (CMP(Q|Q|L|W|B|B) x y))
(Less(64|32|16|8) x y) => (SETL (CMP(Q|L|W|B) x y))
(Less(64|32|16|8)U x y) => (SETB (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8) x y) => (SETLE (CMP(Q|L|W|B) x y))
(Leq(64|32|16|8)U x y) => (SETBE (CMP(Q|L|W|B) x y))
(Eq(Ptr|64|32|16|8|B) x y) => (SETEQ (CMP(Q|Q|L|W|B|B) x y))
(Neq(Ptr|64|32|16|8|B) x y) => (SETNE (CMP(Q|Q|L|W|B|B) x y))
// Lowering floating point comparisons
// Note Go assembler gets UCOMISx operand order wrong, but it is right here
// and the operands are reversed when generating assembly language.
(Eq(32|64)F x y) -> (SETEQF (UCOMIS(S|D) x y))
(Neq(32|64)F x y) -> (SETNEF (UCOMIS(S|D) x y))
(Eq(32|64)F x y) => (SETEQF (UCOMIS(S|D) x y))
(Neq(32|64)F x y) => (SETNEF (UCOMIS(S|D) x y))
// Use SETGF/SETGEF with reversed operands to dodge NaN case.
(Less(32|64)F x y) -> (SETGF (UCOMIS(S|D) y x))
(Leq(32|64)F x y) -> (SETGEF (UCOMIS(S|D) y x))
(Less(32|64)F x y) => (SETGF (UCOMIS(S|D) y x))
(Leq(32|64)F x y) => (SETGEF (UCOMIS(S|D) y x))
// Lowering loads
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) -> (MOVQload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) -> (MOVLload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) -> (MOVWload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) -> (MOVBload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) -> (MOVSSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) -> (MOVSDload ptr mem)
(Load <t> ptr mem) && (is64BitInt(t) || isPtr(t)) => (MOVQload ptr mem)
(Load <t> ptr mem) && is32BitInt(t) => (MOVLload ptr mem)
(Load <t> ptr mem) && is16BitInt(t) => (MOVWload ptr mem)
(Load <t> ptr mem) && (t.IsBoolean() || is8BitInt(t)) => (MOVBload ptr mem)
(Load <t> ptr mem) && is32BitFloat(t) => (MOVSSload ptr mem)
(Load <t> ptr mem) && is64BitFloat(t) => (MOVSDload ptr mem)
// Lowering stores
// These more-specific FP versions of Store pattern should come first.
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 && is64BitFloat(val.Type) -> (MOVSDstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 && is32BitFloat(val.Type) -> (MOVSSstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (MOVSDstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (MOVSSstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 8 -> (MOVQstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 4 -> (MOVLstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 2 -> (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.(*types.Type).Size() == 1 -> (MOVBstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 => (MOVQstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 => (MOVLstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 2 => (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 1 => (MOVBstore ptr val mem)
// Lowering moves
(Move [0] _ _ mem) -> mem
(Move [1] dst src mem) -> (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) -> (MOVWstore dst (MOVWload src mem) mem)
(Move [4] dst src mem) -> (MOVLstore dst (MOVLload src mem) mem)
(Move [8] dst src mem) -> (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) && config.useSSE -> (MOVOstore dst (MOVOload src mem) mem)
(Move [16] dst src mem) && !config.useSSE ->
(Move [0] _ _ mem) => mem
(Move [1] dst src mem) => (MOVBstore dst (MOVBload src mem) mem)
(Move [2] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
(Move [4] dst src mem) => (MOVLstore dst (MOVLload src mem) mem)
(Move [8] dst src mem) => (MOVQstore dst (MOVQload src mem) mem)
(Move [16] dst src mem) && config.useSSE => (MOVOstore dst (MOVOload src mem) mem)
(Move [16] dst src mem) && !config.useSSE =>
(MOVQstore [8] dst (MOVQload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [32] dst src mem) ->
(Move [32] dst src mem) =>
(Move [16]
(OffPtr <dst.Type> dst [16])
(OffPtr <src.Type> src [16])
(Move [16] dst src mem))
(Move [48] dst src mem) && config.useSSE ->
(Move [48] dst src mem) && config.useSSE =>
(Move [32]
(OffPtr <dst.Type> dst [16])
(OffPtr <src.Type> src [16])
(Move [16] dst src mem))
(Move [64] dst src mem) && config.useSSE ->
(Move [64] dst src mem) && config.useSSE =>
(Move [32]
(OffPtr <dst.Type> dst [32])
(OffPtr <src.Type> src [32])
(Move [32] dst src mem))
(Move [3] dst src mem) ->
(Move [3] dst src mem) =>
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVWstore dst (MOVWload src mem) mem))
(Move [5] dst src mem) ->
(Move [5] dst src mem) =>
(MOVBstore [4] dst (MOVBload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [6] dst src mem) ->
(Move [6] dst src mem) =>
(MOVWstore [4] dst (MOVWload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [7] dst src mem) ->
(Move [7] dst src mem) =>
(MOVLstore [3] dst (MOVLload [3] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [9] dst src mem) ->
(Move [9] dst src mem) =>
(MOVBstore [8] dst (MOVBload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [10] dst src mem) ->
(Move [10] dst src mem) =>
(MOVWstore [8] dst (MOVWload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [12] dst src mem) ->
(Move [12] dst src mem) =>
(MOVLstore [8] dst (MOVLload [8] src mem)
(MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 ->
(MOVQstore [s-8] dst (MOVQload [s-8] src mem)
(Move [s] dst src mem) && s == 11 || s >= 13 && s <= 15 =>
(MOVQstore [int32(s-8)] dst (MOVQload [int32(s-8)] src mem)
(MOVQstore dst (MOVQload src mem) mem))
// Adjust moves to be a multiple of 16 bytes.
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 <= 8 ->
&& s > 16 && s%16 != 0 && s%16 <= 8 =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
(MOVQstore dst (MOVQload src mem) mem))
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE ->
&& s > 16 && s%16 != 0 && s%16 > 8 && config.useSSE =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
(MOVOstore dst (MOVOload src mem) mem))
(Move [s] dst src mem)
&& s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE ->
&& s > 16 && s%16 != 0 && s%16 > 8 && !config.useSSE =>
(Move [s-s%16]
(OffPtr <dst.Type> dst [s%16])
(OffPtr <src.Type> src [s%16])
@@ -308,7 +308,7 @@
// Medium copying uses a duff device.
(Move [s] dst src mem)
&& s > 64 && s <= 16*64 && s%16 == 0
&& !config.noDuffDevice && logLargeCopy(v, s) ->
&& !config.noDuffDevice && logLargeCopy(v, s) =>
(DUFFCOPY [14*(64-s/16)] dst src mem)
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
// MOVUPS (SI), X0
@@ -318,71 +318,71 @@
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
// Large copying uses REP MOVSQ.
(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) ->
(Move [s] dst src mem) && (s > 16*64 || config.noDuffDevice) && s%8 == 0 && logLargeCopy(v, s) =>
(REPMOVSQ dst src (MOVQconst [s/8]) mem)
// Lowering Zero instructions
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
(Zero [2] destptr mem) -> (MOVWstoreconst [0] destptr mem)
(Zero [4] destptr mem) -> (MOVLstoreconst [0] destptr mem)
(Zero [8] destptr mem) -> (MOVQstoreconst [0] destptr mem)
(Zero [0] _ mem) => mem
(Zero [1] destptr mem) => (MOVBstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [2] destptr mem) => (MOVWstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [4] destptr mem) => (MOVLstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [8] destptr mem) => (MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)
(Zero [3] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,2)] destptr
(MOVWstoreconst [0] destptr mem))
(Zero [5] destptr mem) ->
(MOVBstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [6] destptr mem) ->
(MOVWstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [7] destptr mem) ->
(MOVLstoreconst [makeValAndOff(0,3)] destptr
(MOVLstoreconst [0] destptr mem))
(Zero [3] destptr mem) =>
(MOVBstoreconst [makeValAndOff32(0,2)] destptr
(MOVWstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [5] destptr mem) =>
(MOVBstoreconst [makeValAndOff32(0,4)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [6] destptr mem) =>
(MOVWstoreconst [makeValAndOff32(0,4)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [7] destptr mem) =>
(MOVLstoreconst [makeValAndOff32(0,3)] destptr
(MOVLstoreconst [makeValAndOff32(0,0)] destptr mem))
// Strip off any fractional word zeroing.
(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE ->
(Zero [s] destptr mem) && s%8 != 0 && s > 8 && !config.useSSE =>
(Zero [s-s%8] (OffPtr <destptr.Type> destptr [s%8])
(MOVQstoreconst [0] destptr mem))
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
// Zero small numbers of words directly.
(Zero [16] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem))
(Zero [24] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,16)] destptr
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem)))
(Zero [32] destptr mem) && !config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,24)] destptr
(MOVQstoreconst [makeValAndOff(0,16)] destptr
(MOVQstoreconst [makeValAndOff(0,8)] destptr
(MOVQstoreconst [0] destptr mem))))
(Zero [16] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [24] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,16)] destptr
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem)))
(Zero [32] destptr mem) && !config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,24)] destptr
(MOVQstoreconst [makeValAndOff32(0,16)] destptr
(MOVQstoreconst [makeValAndOff32(0,8)] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))))
(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE ->
(MOVQstoreconst [makeValAndOff(0,s-8)] destptr
(MOVQstoreconst [0] destptr mem))
(Zero [s] destptr mem) && s > 8 && s < 16 && config.useSSE =>
(MOVQstoreconst [makeValAndOff32(0,int32(s-8))] destptr
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
// Adjust zeros to be a multiple of 16 bytes.
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE ->
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 > 8 && config.useSSE =>
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
(MOVOstore destptr (MOVOconst [0]) mem))
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE ->
(Zero [s] destptr mem) && s%16 != 0 && s > 16 && s%16 <= 8 && config.useSSE =>
(Zero [s-s%16] (OffPtr <destptr.Type> destptr [s%16])
(MOVQstoreconst [0] destptr mem))
(MOVQstoreconst [makeValAndOff32(0,0)] destptr mem))
(Zero [16] destptr mem) && config.useSSE ->
(Zero [16] destptr mem) && config.useSSE =>
(MOVOstore destptr (MOVOconst [0]) mem)
(Zero [32] destptr mem) && config.useSSE ->
(Zero [32] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
(MOVOstore destptr (MOVOconst [0]) mem))
(Zero [48] destptr mem) && config.useSSE ->
(Zero [48] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
(MOVOstore destptr (MOVOconst [0]) mem)))
(Zero [64] destptr mem) && config.useSSE ->
(Zero [64] destptr mem) && config.useSSE =>
(MOVOstore (OffPtr <destptr.Type> destptr [48]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [32]) (MOVOconst [0])
(MOVOstore (OffPtr <destptr.Type> destptr [16]) (MOVOconst [0])
@@ -390,13 +390,13 @@
// Medium zeroing uses a duff device.
(Zero [s] destptr mem)
&& s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice ->
&& s > 64 && s <= 1024 && s%16 == 0 && !config.noDuffDevice =>
(DUFFZERO [s] destptr (MOVOconst [0]) mem)
// Large zeroing uses REP STOSQ.
(Zero [s] destptr mem)
&& (s > 1024 || (config.noDuffDevice && s > 64 || !config.useSSE && s > 32))
&& s%8 == 0 ->
&& s%8 == 0 =>
(REPSTOSQ destptr (MOVQconst [s/8]) (MOVQconst [0]) mem)
// Lowering constants
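
The store-constant patterns that used a bare [0] or a rewrite-time makeValAndOff(...) now spell out makeValAndOff32(val,off), so the value/offset pair is typed as two int32s. A rough standalone sketch of the packing a ValAndOff performs, under the assumption that the value sits in the high 32 bits and the offset in the low 32 (the real type lives in op.go):

package main

import "fmt"

type valAndOff int64 // local stand-in for ssa.ValAndOff

func makeValAndOff32(val, off int32) valAndOff {
    return valAndOff(int64(val)<<32 | int64(uint32(off)))
}
func (x valAndOff) val() int32 { return int32(int64(x) >> 32) }
func (x valAndOff) off() int32 { return int32(x) }

func main() {
    vo := makeValAndOff32(0, 8) // as in (MOVQstoreconst [makeValAndOff32(0,8)] destptr ...)
    fmt.Println(vo.val(), vo.off()) // 0 8
}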

View File

@@ -1034,18 +1034,18 @@ func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int,
if !token.IsIdentifier(e.name) || rr.declared(e.name) {
switch e.field {
case "Aux":
rr.add(breakf("auxTo%s(%s.%s) != %s", strings.Title(e.dclType), v, e.field, e.name))
rr.add(breakf("auxTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
case "AuxInt":
rr.add(breakf("auxIntTo%s(%s.%s) != %s", strings.Title(e.dclType), v, e.field, e.name))
rr.add(breakf("auxIntTo%s(%s.%s) != %s", title(e.dclType), v, e.field, e.name))
case "Type":
rr.add(breakf("%s.%s != %s", v, e.field, e.name))
}
} else {
switch e.field {
case "Aux":
rr.add(declf(e.name, "auxTo%s(%s.%s)", strings.Title(e.dclType), v, e.field))
rr.add(declf(e.name, "auxTo%s(%s.%s)", title(e.dclType), v, e.field))
case "AuxInt":
rr.add(declf(e.name, "auxIntTo%s(%s.%s)", strings.Title(e.dclType), v, e.field))
rr.add(declf(e.name, "auxIntTo%s(%s.%s)", title(e.dclType), v, e.field))
case "Type":
rr.add(declf(e.name, "%s.%s", v, e.field))
}
@@ -1762,7 +1762,8 @@ func (op opData) auxIntType() string {
return "int32"
case "Int64":
return "int64"
//case "Int128":
case "Int128":
return "int128"
case "Float32":
return "float32"
case "Float64":
@@ -1780,6 +1781,16 @@ func (op opData) auxIntType() string {
}
}
func title(s string) string {
if i := strings.Index(s, "."); i >= 0 {
s = s[i+1:]
}
return strings.Title(s)
}
func unTitle(s string) string {
if i := strings.Index(s, "."); i >= 0 {
s = s[i+1:]
}
return strings.ToLower(s[:1]) + s[1:]
}
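
Both helpers strip a leading package qualifier ("types.Type" becomes "Type") before adjusting the case of the first letter; title builds the converter names used in the matchers above (auxToType, auxIntToInt32, ...), and unTitle presumably builds their lower-cased counterparts on the result side (typeToAux, int32ToAuxInt, ...). A standalone check, with the two definitions repeated so it runs on its own:

package main

import (
    "fmt"
    "strings"
)

func title(s string) string {
    if i := strings.Index(s, "."); i >= 0 {
        s = s[i+1:]
    }
    return strings.Title(s)
}

func unTitle(s string) string {
    if i := strings.Index(s, "."); i >= 0 {
        s = s[i+1:]
    }
    return strings.ToLower(s[:1]) + s[1:]
}

func main() {
    fmt.Println(title("types.Type"))  // Type
    fmt.Println(title("int32"))       // Int32
    fmt.Println(unTitle("ValAndOff")) // valAndOff
}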

View File

@@ -192,6 +192,10 @@ func (x ValAndOff) add(off int64) int64 {
return makeValAndOff(x.Val(), x.Off()+off)
}
// int128 is a type that stores a 128-bit constant.
// The only allowed constant right now is 0, so we can cheat quite a bit.
type int128 int64
type BoundsKind uint8
const (

View File

@@ -568,6 +568,12 @@ func auxIntToFloat64(i int64) float64 {
func auxIntToValAndOff(i int64) ValAndOff {
return ValAndOff(i)
}
func auxIntToInt128(x int64) int128 {
if x != 0 {
panic("nonzero int128 not allowed")
}
return 0
}
func boolToAuxInt(b bool) int64 {
if b {
@@ -596,6 +602,12 @@ func float64ToAuxInt(f float64) int64 {
func valAndOffToAuxInt(v ValAndOff) int64 {
return int64(v)
}
func int128ToAuxInt(x int128) int64 {
if x != 0 {
panic("nonzero int128 not allowed")
}
return 0
}
func auxToString(i interface{}) string {
return i.(string)
@@ -605,6 +617,9 @@ func auxToSym(i interface{}) Sym {
s, _ := i.(Sym)
return s
}
func auxToType(i interface{}) *types.Type {
return i.(*types.Type)
}
func stringToAux(s string) interface{} {
return s
@@ -612,6 +627,9 @@ func stringToAux(s string) interface{} {
func symToAux(s Sym) interface{} {
return s
}
func typeToAux(t *types.Type) interface{} {
return t
}
// uaddOvf reports whether unsigned a+b would overflow.
func uaddOvf(a, b int64) bool {
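
With these converters in place, the generated rewriters can bind a typed aux up front (t := auxToType(v.Aux)), which is why the Store rules in this CL drop the t.(*types.Type) assertions. The int128 case is the degenerate one: only zero can round-trip through the 64-bit AuxInt field. A standalone sketch of that round trip (helper bodies repeated from above; tying it to MOVOconst's [0] constant is an assumption):

package main

import "fmt"

type int128 int64 // only the constant 0 is representable

func auxIntToInt128(x int64) int128 {
    if x != 0 {
        panic("nonzero int128 not allowed")
    }
    return 0
}

func int128ToAuxInt(x int128) int64 {
    if x != 0 {
        panic("nonzero int128 not allowed")
    }
    return 0
}

func main() {
    // The [0] in a rule like (MOVOconst [0]) survives unchanged; any other
    // value would panic when the rewrite runs.
    fmt.Println(int128ToAuxInt(auxIntToInt128(0))) // 0
}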

File diff suppressed because it is too large