[dev.ssa] cmd/compile: lots of small rewrite optimizations

Small optimizations I noticed while looking at Giovanni's test cases. More shifts by constants. Indexed stores for smaller types. Fold LEA into loads/stores. More extending loads. CMP $0 of AND -> TEST. Fix order of TEST ops. Giovanni's test cases at https://gist.github.com/rasky/62fba94e3a20d1b05b2a Change-Id: I7077bc0b5319bf05767eeb39f401f4bb4b39f635 Reviewed-on: https://go-review.googlesource.com/19086 Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Todd Neal <todd@tneal.org> Reviewed-by: David Chase <drchase@google.com>
2024-11-12 09:20:22 -07:00 · 2016-01-30 11:25:38 -08:00 · 2016-01-30 11:25:38 -08:00 · 1cc5789df9
commit 1cc5789df9
parent f962f33035
6 changed files with 1969 additions and 13 deletions
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@ -4003,13 +4003,18 @@ func (s *genState) genValue(v *ssa.Value) {
 		// Go assembler has swapped operands for UCOMISx relative to CMP,
 		// must account for that right here.
 		opregreg(v.Op.Asm(), regnum(v.Args[0]), regnum(v.Args[1]))
-	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst,
-		ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
+	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
 		p := Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = regnum(v.Args[0])
 		p.To.Type = obj.TYPE_CONST
 		p.To.Offset = v.AuxInt
+	case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
+		p := Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = v.AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = regnum(v.Args[0])
 	case ssa.OpAMD64MOVBconst, ssa.OpAMD64MOVWconst, ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
 		x := regnum(v)
 		p := Prog(v.Op.Asm())
@ -4040,7 +4045,7 @@ func (s *genState) genValue(v *ssa.Value) {
 		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = x
-	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVBQZXload, ssa.OpAMD64MOVOload:
+	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVBQZXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVWQZXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVLQZXload, ssa.OpAMD64MOVOload:
 		p := Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_MEM
 		p.From.Reg = regnum(v.Args[0])
@ -4081,7 +4086,7 @@ func (s *genState) genValue(v *ssa.Value) {
 		p.To.Scale = 8
 		p.To.Index = regnum(v.Args[1])
 		addAux(&p.To, v)
-	case ssa.OpAMD64MOVSSstoreidx4:
+	case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
 		p := Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
 		p.From.Reg = regnum(v.Args[2])
@ -4090,6 +4095,24 @@ func (s *genState) genValue(v *ssa.Value) {
 		p.To.Scale = 4
 		p.To.Index = regnum(v.Args[1])
 		addAux(&p.To, v)
+	case ssa.OpAMD64MOVWstoreidx2:
+		p := Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = regnum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = regnum(v.Args[0])
+		p.To.Scale = 2
+		p.To.Index = regnum(v.Args[1])
+		addAux(&p.To, v)
+	case ssa.OpAMD64MOVBstoreidx1:
+		p := Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = regnum(v.Args[2])
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = regnum(v.Args[0])
+		p.To.Scale = 1
+		p.To.Index = regnum(v.Args[1])
+		addAux(&p.To, v)
 	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
 		p := Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_CONST
@ -4365,7 +4388,9 @@ func (s *genState) genValue(v *ssa.Value) {
 			}
 			switch w.Op {
 			case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload,
-				ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore:
+				ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore,
+				ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVBQZXload, ssa.OpAMD64MOVWQSXload,
+				ssa.OpAMD64MOVWQZXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVLQZXload:
 				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
 					if Debug_checknil != 0 && int(v.Line) > 1 {
 						Warnl(int(v.Line), "removed nil check")
--- a/src/cmd/compile/internal/ssa/TODO
+++ b/src/cmd/compile/internal/ssa/TODO
@ -51,6 +51,8 @@ Optimizations (better compiled code)
  Note that this is challenging for ops that generate flags
  because flagalloc wants to move those instructions around for
  flag regeneration.
+- In forms like if ... { call } else { no call }, mark the call branch as unlikely.
+- Non-constant rotate detection.

 Optimizations (better compiler)
 -------------------------------
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@ -464,18 +464,63 @@
 (XORB (MOVBconst [c]) x) -> (XORBconst [c] x)

 (SHLQ x (MOVQconst [c])) -> (SHLQconst [c&63] x)
+(SHLQ x (MOVLconst [c])) -> (SHLQconst [c&63] x)
+(SHLQ x (MOVWconst [c])) -> (SHLQconst [c&63] x)
+(SHLQ x (MOVBconst [c])) -> (SHLQconst [c&63] x)
+
+(SHLL x (MOVQconst [c])) -> (SHLLconst [c&31] x)
 (SHLL x (MOVLconst [c])) -> (SHLLconst [c&31] x)
+(SHLL x (MOVWconst [c])) -> (SHLLconst [c&31] x)
+(SHLL x (MOVBconst [c])) -> (SHLLconst [c&31] x)
+
+(SHLW x (MOVQconst [c])) -> (SHLWconst [c&31] x)
+(SHLW x (MOVLconst [c])) -> (SHLWconst [c&31] x)
 (SHLW x (MOVWconst [c])) -> (SHLWconst [c&31] x)
+(SHLW x (MOVBconst [c])) -> (SHLWconst [c&31] x)
+
+(SHLB x (MOVQconst [c])) -> (SHLBconst [c&31] x)
+(SHLB x (MOVLconst [c])) -> (SHLBconst [c&31] x)
+(SHLB x (MOVWconst [c])) -> (SHLBconst [c&31] x)
 (SHLB x (MOVBconst [c])) -> (SHLBconst [c&31] x)

 (SHRQ x (MOVQconst [c])) -> (SHRQconst [c&63] x)
+(SHRQ x (MOVLconst [c])) -> (SHRQconst [c&63] x)
+(SHRQ x (MOVWconst [c])) -> (SHRQconst [c&63] x)
+(SHRQ x (MOVBconst [c])) -> (SHRQconst [c&63] x)
+
+(SHRL x (MOVQconst [c])) -> (SHRLconst [c&31] x)
 (SHRL x (MOVLconst [c])) -> (SHRLconst [c&31] x)
+(SHRL x (MOVWconst [c])) -> (SHRLconst [c&31] x)
+(SHRL x (MOVBconst [c])) -> (SHRLconst [c&31] x)
+
+(SHRW x (MOVQconst [c])) -> (SHRWconst [c&31] x)
+(SHRW x (MOVLconst [c])) -> (SHRWconst [c&31] x)
 (SHRW x (MOVWconst [c])) -> (SHRWconst [c&31] x)
+(SHRW x (MOVBconst [c])) -> (SHRWconst [c&31] x)
+
+(SHRB x (MOVQconst [c])) -> (SHRBconst [c&31] x)
+(SHRB x (MOVLconst [c])) -> (SHRBconst [c&31] x)
+(SHRB x (MOVWconst [c])) -> (SHRBconst [c&31] x)
 (SHRB x (MOVBconst [c])) -> (SHRBconst [c&31] x)

 (SARQ x (MOVQconst [c])) -> (SARQconst [c&63] x)
+(SARQ x (MOVLconst [c])) -> (SARQconst [c&63] x)
+(SARQ x (MOVWconst [c])) -> (SARQconst [c&63] x)
+(SARQ x (MOVBconst [c])) -> (SARQconst [c&63] x)
+
+(SARL x (MOVQconst [c])) -> (SARLconst [c&31] x)
 (SARL x (MOVLconst [c])) -> (SARLconst [c&31] x)
+(SARL x (MOVWconst [c])) -> (SARLconst [c&31] x)
+(SARL x (MOVBconst [c])) -> (SARLconst [c&31] x)
+
+(SARW x (MOVQconst [c])) -> (SARWconst [c&31] x)
+(SARW x (MOVLconst [c])) -> (SARWconst [c&31] x)
 (SARW x (MOVWconst [c])) -> (SARWconst [c&31] x)
+(SARW x (MOVBconst [c])) -> (SARWconst [c&31] x)
+
+(SARB x (MOVQconst [c])) -> (SARBconst [c&31] x)
+(SARB x (MOVLconst [c])) -> (SARBconst [c&31] x)
+(SARB x (MOVWconst [c])) -> (SARBconst [c&31] x)
 (SARB x (MOVBconst [c])) -> (SARBconst [c&31] x)

 // Note: the word and byte shifts keep the low 5 bits (not the low 4 or 3 bits)
@ -524,7 +569,18 @@
 // multiple memory values alive simultaneously.
 (MOVBQSX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQSXload <v.Type> [off] {sym} ptr mem)
 (MOVBQZX (MOVBload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVBQZXload <v.Type> [off] {sym} ptr mem)
-// TODO: more
+(MOVWQSX (MOVWload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVWQSXload <v.Type> [off] {sym} ptr mem)
+(MOVWQZX (MOVWload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVWQZXload <v.Type> [off] {sym} ptr mem)
+(MOVLQSX (MOVLload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVLQSXload <v.Type> [off] {sym} ptr mem)
+(MOVLQZX (MOVLload [off] {sym} ptr mem)) -> @v.Args[0].Block (MOVLQZXload <v.Type> [off] {sym} ptr mem)
+
+// Fold extensions and ANDs together.
+(MOVBQZX (ANDBconst [c] x)) -> (ANDQconst [c & 0xff] x)
+(MOVWQZX (ANDWconst [c] x)) -> (ANDQconst [c & 0xffff] x)
+(MOVLQZX (ANDLconst [c] x)) -> (ANDQconst [c & 0xffffffff] x)
+(MOVBQSX (ANDBconst [c] x)) && c & 0x80 == 0 -> (ANDQconst [c & 0x7f] x)
+(MOVWQSX (ANDWconst [c] x)) && c & 0x8000 == 0 -> (ANDQconst [c & 0x7fff] x)
+(MOVLQSX (ANDLconst [c] x)) && c & 0x80000000 == 0 -> (ANDQconst [c & 0x7fffffff] x)

 // Don't extend before storing
 (MOVLstore [off] {sym} ptr (MOVLQSX x) mem) -> (MOVLstore [off] {sym} ptr x mem)
@ -623,22 +679,63 @@
 (MOVSSstoreidx4 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx val mem) -> (MOVSSstoreidx4 [addOff(off1, off2)] {sym} ptr idx val mem)
 (MOVSDloadidx8 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx mem) -> (MOVSDloadidx8 [addOff(off1, off2)] {sym} ptr idx mem)
 (MOVSDstoreidx8 [off1] {sym} (ADDQconst [off2] {sym} ptr) idx val mem) -> (MOVSDstoreidx8 [addOff(off1, off2)] {sym} ptr idx val mem)
+(MOVLstoreidx4 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVLstoreidx4 [addOff(off1, off2)] {sym} ptr idx val mem)
+(MOVWstoreidx2 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVWstoreidx2 [addOff(off1, off2)] {sym} ptr idx val mem)
+(MOVBstoreidx1 [off1] {sym} (ADDQconst [off2] ptr) idx val mem) -> (MOVBstoreidx1 [addOff(off1, off2)] {sym} ptr idx val mem)

 (MOVQload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVQloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
-(MOVQstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
-	(MOVQstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
-
 (MOVSSload [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSSloadidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
-(MOVSSstore [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
-	(MOVSSstoreidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
-
 (MOVSDload [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDloadidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx mem)
+
+(MOVBstore [off1] {sym1} (LEAQ1 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVBstoreidx1 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
+(MOVWstore [off1] {sym1} (LEAQ2 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVWstoreidx2 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
+(MOVLstore [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVLstoreidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
+(MOVQstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVQstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
+(MOVSSstore [off1] {sym1} (LEAQ4 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
+	(MOVSSstoreidx4 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)
 (MOVSDstore [off1] {sym1} (LEAQ8 [off2] {sym2} ptr idx) val mem) && canMergeSym(sym1, sym2) ->
 	(MOVSDstoreidx8 [addOff(off1, off2)] {mergeSym(sym1,sym2)} ptr idx val mem)

+(MOVBstore [off] {sym} (ADDQ ptr idx) val mem) -> (MOVBstoreidx1 [off] {sym} ptr idx val mem)
+
+// fold LEAQs together
+(LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && canMergeSym(sym1, sym2) ->
+      (LEAQ [addOff(off1,off2)] {mergeSym(sym1,sym2)} x)
+
+// LEAQ into LEAQ1
+(LEAQ1 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
+       (LEAQ1 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+(LEAQ1 [off1] {sym1} x (LEAQ [off2] {sym2} y)) && canMergeSym(sym1, sym2) && y.Op != OpSB ->
+       (LEAQ1 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ1 into LEAQ
+(LEAQ [off1] {sym1} (LEAQ1 [off2] {sym2} x y)) && canMergeSym(sym1, sym2) ->
+       (LEAQ1 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ into LEAQ[248]
+(LEAQ2 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
+       (LEAQ2 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+(LEAQ4 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
+       (LEAQ4 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+(LEAQ8 [off1] {sym1} (LEAQ [off2] {sym2} x) y) && canMergeSym(sym1, sym2) && x.Op != OpSB ->
+       (LEAQ8 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+
+// LEAQ[248] into LEAQ
+(LEAQ [off1] {sym1} (LEAQ2 [off2] {sym2} x y)) && canMergeSym(sym1, sym2) ->
+      (LEAQ2 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ4 [off2] {sym2} x y)) && canMergeSym(sym1, sym2) ->
+      (LEAQ4 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+(LEAQ [off1] {sym1} (LEAQ8 [off2] {sym2} x y)) && canMergeSym(sym1, sym2) ->
+      (LEAQ8 [addOff(off1,off2)] {mergeSym(sym1,sym2)} x y)
+
+
 // lower Zero instructions with word sizes
 (Zero [0] _ mem) -> mem
 (Zero [1] destptr mem) -> (MOVBstoreconst [0] destptr mem)
@ -963,3 +1060,12 @@
 (XORW x x) -> (MOVWconst [0])
 (XORB x x) -> (MOVBconst [0])

+// checking AND against 0.
+(CMPQconst (ANDQ x y) [0]) -> (TESTQ x y)
+(CMPLconst (ANDL x y) [0]) -> (TESTL x y)
+(CMPWconst (ANDW x y) [0]) -> (TESTW x y)
+(CMPBconst (ANDB x y) [0]) -> (TESTB x y)
+(CMPQconst (ANDQconst [c] x) [0]) -> (TESTQconst [c] x)
+(CMPLconst (ANDLconst [c] x) [0]) -> (TESTLconst [c] x)
+(CMPWconst (ANDWconst [c] x) [0]) -> (TESTWconst [c] x)
+(CMPBconst (ANDBconst [c] x) [0]) -> (TESTBconst [c] x)
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@ -368,14 +368,22 @@ func init() {
 		{name: "MOVBQSXload", reg: gpload, asm: "MOVBQSX"},          // ditto, extend to int64
 		{name: "MOVBQZXload", reg: gpload, asm: "MOVBQZX"},          // ditto, extend to uint64
 		{name: "MOVWload", reg: gpload, asm: "MOVW", typ: "UInt16"}, // load 2 bytes from arg0+auxint+aux. arg1=mem
+		{name: "MOVWQSXload", reg: gpload, asm: "MOVWQSX"},          // ditto, extend to int64
+		{name: "MOVWQZXload", reg: gpload, asm: "MOVWQZX"},          // ditto, extend to uint64
 		{name: "MOVLload", reg: gpload, asm: "MOVL", typ: "UInt32"}, // load 4 bytes from arg0+auxint+aux. arg1=mem
+		{name: "MOVLQSXload", reg: gpload, asm: "MOVLQSX"},          // ditto, extend to int64
+		{name: "MOVLQZXload", reg: gpload, asm: "MOVLQZX"},          // ditto, extend to uint64
 		{name: "MOVQload", reg: gpload, asm: "MOVQ", typ: "UInt64"}, // load 8 bytes from arg0+auxint+aux. arg1=mem
 		{name: "MOVQloadidx8", reg: gploadidx, asm: "MOVQ"},         // load 8 bytes from arg0+8*arg1+auxint+aux. arg2=mem
 		{name: "MOVBstore", reg: gpstore, asm: "MOVB", typ: "Mem"},  // store byte in arg1 to arg0+auxint+aux. arg2=mem
 		{name: "MOVWstore", reg: gpstore, asm: "MOVW", typ: "Mem"},  // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
 		{name: "MOVLstore", reg: gpstore, asm: "MOVL", typ: "Mem"},  // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
 		{name: "MOVQstore", reg: gpstore, asm: "MOVQ", typ: "Mem"},  // store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
-		{name: "MOVQstoreidx8", reg: gpstoreidx, asm: "MOVQ"},       // store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem
+
+		{name: "MOVBstoreidx1", reg: gpstoreidx, asm: "MOVB"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
+		{name: "MOVWstoreidx2", reg: gpstoreidx, asm: "MOVW"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
+		{name: "MOVLstoreidx4", reg: gpstoreidx, asm: "MOVL"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
+		{name: "MOVQstoreidx8", reg: gpstoreidx, asm: "MOVQ"}, // store 8 bytes in arg2 to arg0+8*arg1+auxint+aux. arg3=mem

 		{name: "MOVOload", reg: fpload, asm: "MOVUPS", typ: "Int128"}, // load 16 bytes from arg0+auxint+aux. arg1=mem
 		{name: "MOVOstore", reg: fpstore, asm: "MOVUPS", typ: "Mem"},  // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@ -254,13 +254,20 @@ const (
 	OpAMD64MOVBQSXload
 	OpAMD64MOVBQZXload
 	OpAMD64MOVWload
+	OpAMD64MOVWQSXload
+	OpAMD64MOVWQZXload
 	OpAMD64MOVLload
+	OpAMD64MOVLQSXload
+	OpAMD64MOVLQZXload
 	OpAMD64MOVQload
 	OpAMD64MOVQloadidx8
 	OpAMD64MOVBstore
 	OpAMD64MOVWstore
 	OpAMD64MOVLstore
 	OpAMD64MOVQstore
+	OpAMD64MOVBstoreidx1
+	OpAMD64MOVWstoreidx2
+	OpAMD64MOVLstoreidx4
 	OpAMD64MOVQstoreidx8
 	OpAMD64MOVOload
 	OpAMD64MOVOstore
@ -2966,6 +2973,30 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name: "MOVWQSXload",
+		asm:  x86.AMOVWQSX,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+			outputs: []regMask{
+				65519, // .AX .CX .DX .BX .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+			},
+		},
+	},
+	{
+		name: "MOVWQZXload",
+		asm:  x86.AMOVWQZX,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+			outputs: []regMask{
+				65519, // .AX .CX .DX .BX .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+			},
+		},
+	},
 	{
 		name: "MOVLload",
 		asm:  x86.AMOVL,
@ -2978,6 +3009,30 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name: "MOVLQSXload",
+		asm:  x86.AMOVLQSX,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+			outputs: []regMask{
+				65519, // .AX .CX .DX .BX .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+			},
+		},
+	},
+	{
+		name: "MOVLQZXload",
+		asm:  x86.AMOVLQZX,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+			outputs: []regMask{
+				65519, // .AX .CX .DX .BX .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+			},
+		},
+	},
 	{
 		name: "MOVQload",
 		asm:  x86.AMOVQ,
@ -3043,6 +3098,39 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name: "MOVBstoreidx1",
+		asm:  x86.AMOVB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{2, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+		},
+	},
+	{
+		name: "MOVWstoreidx2",
+		asm:  x86.AMOVW,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{2, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+		},
+	},
+	{
+		name: "MOVLstoreidx4",
+		asm:  x86.AMOVL,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{1, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{2, 65535},      // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15
+				{0, 4295032831}, // .AX .CX .DX .BX .SP .BP .SI .DI .R8 .R9 .R10 .R11 .R12 .R13 .R14 .R15 .SB
+			},
+		},
+	},
 	{
 		name: "MOVQstoreidx8",
 		asm:  x86.AMOVQ,
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go