From 7d9f1067d1c8a2d0252fa2a115f1d016f94f7087 Mon Sep 17 00:00:00 2001
From: Keith Randall
Date: Thu, 17 Dec 2015 10:01:24 -0800
Subject: [PATCH] [dev.ssa] cmd/compile: better register allocator

Reorder how register & stack allocation is done.
We used to allocate registers, then fix up merge edges,
then allocate stack slots.  This led to lots of unnecessary
copies on merge edges:

    v2 = LoadReg v1
    v3 = StoreReg v2

If v1 and v3 are allocated to the same stack slot, then this code is
unnecessary.  But at regalloc time we didn't know the homes of v1
and v3.

To fix this problem, allocate all the stack slots before fixing up the
merge edges.  That way, we know what stack slots values use, so we know
what copies are required.

Use a good technique for shuffling values around on merge edges.

Improves performance of the go1 TimeParse benchmark by ~12%

Change-Id: I731f43e4ff1a7e0dc4cd4aa428fcdb97812b86fa
Reviewed-on: https://go-review.googlesource.com/17915
Reviewed-by: David Chase
---
 src/cmd/compile/internal/ssa/compile.go    |   3 -
 src/cmd/compile/internal/ssa/flagalloc.go  |   9 -
 src/cmd/compile/internal/ssa/regalloc.go   | 900 +++++++++++++++------
 src/cmd/compile/internal/ssa/stackalloc.go | 301 +++----
 4 files changed, 796 insertions(+), 417 deletions(-)

diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 767b774ab0..20af6fd5bd 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -102,7 +102,6 @@ var passes = [...]pass{
 	{"schedule", schedule},   // schedule values
 	{"flagalloc", flagalloc}, // allocate flags register
 	{"regalloc", regalloc},
-	{"stackalloc", stackalloc},
 }
 
 // Double-check phase ordering constraints.
@@ -138,8 +137,6 @@ var passOrder = [...]constraint{
 	{"critical", "regalloc"},
 	// regalloc requires all the values in a block to be scheduled
 	{"schedule", "regalloc"},
-	// stack allocation requires register allocation
-	{"regalloc", "stackalloc"},
 	// checkLower must run after lowering & subsequent dead code elim
 	{"lower", "checkLower"},
 	{"lowered deadcode", "checkLower"},
diff --git a/src/cmd/compile/internal/ssa/flagalloc.go b/src/cmd/compile/internal/ssa/flagalloc.go
index 714ac016a2..c088158057 100644
--- a/src/cmd/compile/internal/ssa/flagalloc.go
+++ b/src/cmd/compile/internal/ssa/flagalloc.go
@@ -21,15 +21,6 @@ func flagalloc(f *Func) {
 	// Walk blocks backwards.  Poor-man's postorder traversal.
 	for i := len(f.Blocks) - 1; i >= 0; i-- {
 		b := f.Blocks[i]
-		if len(b.Preds) > 1 {
-			// Don't use any flags register at the start
-			// of a merge block.  This causes problems
-			// in regalloc because some of the rematerialization
-			// instructions used on incoming merge edges clobber
-			// the flags register.
-			// TODO: only for architectures where this matters?
-			continue
-		}
 		// Walk values backwards to figure out what flag
 		// value we want in the flag register at the start
 		// of the block.
diff --git a/src/cmd/compile/internal/ssa/regalloc.go b/src/cmd/compile/internal/ssa/regalloc.go
index 2690b6188e..0f1068a337 100644
--- a/src/cmd/compile/internal/ssa/regalloc.go
+++ b/src/cmd/compile/internal/ssa/regalloc.go
@@ -99,7 +99,7 @@ import (
 	"unsafe"
 )
 
-const regDebug = false
+const regDebug = false // TODO: compiler flag
 const logSpills = false
 
 // regalloc performs register allocation on f.
 // It sets f.RegAlloc
@@ -201,12 +201,12 @@ type use struct {
 }
 
 type valState struct {
-	regs       regMask // the set of registers holding a Value (usually just one)
-	uses       *use    // list of uses in this block
-	spill      *Value  // spilled copy of the Value
-	spill2     *Value  // special alternate spill location used for phi resolution
-	spillUsed  bool
-	spill2used bool
+	regs              regMask // the set of registers holding a Value (usually just one)
+	uses              *use    // list of uses in this block
+	spill             *Value  // spilled copy of the Value
+	spillUsed         bool
+	needReg           bool // cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags()
+	rematerializeable bool // cached value of v.rematerializeable()
 }
 
 type regState struct {
@@ -218,10 +218,6 @@ type regAllocState struct {
 	f *Func
 
-	// For each value, whether it needs a register or not.
-	// Cached value of !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags().
-	needReg []bool
-
 	// for each block, its primary predecessor.
 	// A predecessor of b is primary if it is the closest
 	// predecessor that appears before b in the layout order.
@@ -249,14 +245,33 @@ type regAllocState struct {
 	// mask of registers currently in use
 	used regMask
 
-	// Home locations (registers) for Values
-	home []Location
-
 	// current block we're working on
 	curBlock *Block
 
 	// cache of use records
 	freeUseRecords *use
+
+	// endRegs[blockid] is the register state at the end of each block.
+	// encoded as a set of endReg records.
+	endRegs [][]endReg
+
+	// startRegs[blockid] is the register state at the start of merge blocks.
+	// saved state does not include the state of phi ops in the block.
+	startRegs [][]startReg
+
+	// spillLive[blockid] is the set of live spills at the end of each block
+	spillLive [][]ID
+}
+
+type endReg struct {
+	r register
+	v *Value // pre-regalloc value held in this register (TODO: can we use ID here?)
+	c *Value // cached version of the value
+}
+
+type startReg struct {
+	r   register
+	vid ID // pre-regalloc value needed in this register
 }
 
 // freeReg frees up register r.  Any current user of r is kicked out.
@@ -268,7 +283,7 @@ func (s *regAllocState) freeReg(r register) {
 
 	// Mark r as unused.
 	if regDebug {
-		fmt.Printf("freeReg %d (dump %s/%s)\n", r, v, s.regs[r].c)
+		fmt.Printf("freeReg %s (dump %s/%s)\n", registers[r].Name(), v, s.regs[r].c)
 	}
 	s.regs[r] = regState{}
 	s.values[v.ID].regs &^= regMask(1) << r
@@ -282,21 +297,6 @@
 	}
 }
 
-func (s *regAllocState) setHome(v *Value, r register) {
-	// Remember assignment.
-	for int(v.ID) >= len(s.home) {
-		s.home = append(s.home, nil)
-		s.home = s.home[:cap(s.home)]
-	}
-	s.home[v.ID] = &registers[r]
-}
-func (s *regAllocState) getHome(v *Value) register {
-	if int(v.ID) >= len(s.home) || s.home[v.ID] == nil {
-		return noRegister
-	}
-	return register(s.home[v.ID].(*Register).Num)
-}
-
 // setOrig records that c's original value is the same as
 // v's original value.
 func (s *regAllocState) setOrig(c *Value, v *Value) {
@@ -313,7 +313,7 @@
 // r must be unused.
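+// (Callers guarantee this: allocReg evicts any existing user of the register
+// it returns, setState frees all registers before reassigning them, and the
+// phi-assignment code calls freeReg first.)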
 func (s *regAllocState) assignReg(r register, v *Value, c *Value) {
 	if regDebug {
-		fmt.Printf("assignReg %d %s/%s\n", r, v, c)
+		fmt.Printf("assignReg %s %s/%s\n", registers[r].Name(), v, c)
 	}
 	if s.regs[r].v != nil {
 		s.f.Fatalf("tried to assign register %d to %s/%s but it is already used by %s", r, v, c, s.regs[r].v)
 	}
@@ -323,7 +323,7 @@
 	s.regs[r] = regState{v, c}
 	s.values[v.ID].regs |= regMask(1) << r
 	s.used |= regMask(1) << r
-	s.setHome(c, r)
+	s.f.setHome(c, &registers[r])
 }
 
 // allocReg picks an unused register from regmask.  If there is no unused register,
@@ -361,16 +361,6 @@
 			continue
 		}
 		v := s.regs[t].v
-
-		if s.values[v.ID].uses == nil {
-			// No subsequent use.
-			// This can happen when fixing up merge blocks at the end.
-			// We've already run through the use lists so they are empty.
-			// Any register would be ok at this point.
-			r = t
-			maxuse = 0
-			break
-		}
 		if n := s.values[v.ID].uses.dist; n > maxuse {
 			// v's next use is farther in the future than any value
 			// we've seen so far.  A new best spill candidate.
@@ -432,12 +422,6 @@
 	} else {
 		switch {
 		// Load v from its spill location.
-		case vi.spill2 != nil:
-			if logSpills {
-				fmt.Println("regalloc: load spill2")
-			}
-			c = s.curBlock.NewValue1(v.Line, OpLoadReg, v.Type, vi.spill2)
-			vi.spill2used = true
 		case vi.spill != nil:
 			if logSpills {
 				fmt.Println("regalloc: load spill")
@@ -462,17 +446,16 @@
 	}
 
 	s.f = f
-	s.needReg = make([]bool, f.NumValues())
 	s.regs = make([]regState, numRegs)
 	s.values = make([]valState, f.NumValues())
 	s.orig = make([]*Value, f.NumValues())
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
-			if v.Type.IsMemory() || v.Type.IsVoid() || v.Type.IsFlags() {
-				continue
+			if !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() {
+				s.values[v.ID].needReg = true
+				s.values[v.ID].rematerializeable = v.rematerializeable()
+				s.orig[v.ID] = v
 			}
-			s.needReg[v.ID] = true
-			s.orig[v.ID] = v
 		}
 	}
 	s.computeLive()
@@ -498,6 +481,10 @@
 		}
 		s.primary[b.ID] = int32(best)
 	}
+
+	s.endRegs = make([][]endReg, f.NumBlocks())
+	s.startRegs = make([][]startReg, f.NumBlocks())
+	s.spillLive = make([][]ID, f.NumBlocks())
 }
 
 // Adds a use record for id at distance dist from the start of the block.
@@ -521,7 +508,7 @@ func (s *regAllocState) addUse(id ID, dist int32) {
 // Any values which have no more uses are deallocated from registers.
 func (s *regAllocState) advanceUses(v *Value) {
 	for _, a := range v.Args {
-		if !s.needReg[a.ID] {
+		if !s.values[a.ID].needReg {
 			continue
 		}
 		ai := &s.values[a.ID]
@@ -536,21 +523,18 @@
 	}
 }
 
-// Sets the state of the registers to that encoded in state.
-func (s *regAllocState) setState(state []regState) {
+// Sets the state of the registers to that encoded in regs.
+func (s *regAllocState) setState(regs []endReg) {
 	s.freeRegs(s.used)
-	for r, x := range state {
-		if x.c == nil {
-			continue
-		}
-		s.assignReg(register(r), x.v, x.c)
+	for _, x := range regs {
+		s.assignReg(x.r, x.v, x.c)
 	}
 }
 
-// compatRegs returns the set of registers which can store v.
-func (s *regAllocState) compatRegs(v *Value) regMask {
+// compatRegs returns the set of registers which can store a value of type t.
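+// For example, a float type maps to the X0-X15 mask and any other
+// register-resident type to the AX-R15 mask (minus SP), per the masks below.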
+func (s *regAllocState) compatRegs(t Type) regMask {
 	var m regMask
-	if v.Type.IsFloat() {
+	if t.IsFloat() {
 		m = 0xffff << 16 // X0-X15
 	} else {
 		m = 0xffef << 0 // AX-R15, except SP
 	}
@@ -560,11 +544,8 @@
 func (s *regAllocState) regalloc(f *Func) {
 	liveSet := newSparseSet(f.NumValues())
-	argset := newSparseSet(f.NumValues())
 	var oldSched []*Value
 	var phis []*Value
-	var stackPhis []*Value
-	var regPhis []*Value
 	var phiRegs []register
 	var args []*Value
 
 	if f.Entry != f.Blocks[0] {
 		f.Fatalf("entry block must be first")
 	}
 
-	// For each merge block, we record the starting register state (after phi ops)
-	// for that merge block.  Indexed by blockid/regnum.
-	startRegs := make([][]*Value, f.NumBlocks())
-	// end state of registers for each block, idexed by blockid/regnum.
-	endRegs := make([][]regState, f.NumBlocks())
 	for _, b := range f.Blocks {
 		s.curBlock = b
 
 			s.addUse(e.ID, int32(len(b.Values))+e.dist) // pseudo-uses from beyond end of block
 			liveSet.add(e.ID)
 		}
-		if c := b.Control; c != nil && s.needReg[c.ID] {
-			s.addUse(c.ID, int32(len(b.Values))) // psuedo-use by control value
-			liveSet.add(c.ID)
+		if v := b.Control; v != nil && s.values[v.ID].needReg {
+			s.addUse(v.ID, int32(len(b.Values))) // pseudo-use by control value
+			liveSet.add(v.ID)
 		}
 		for i := len(b.Values) - 1; i >= 0; i-- {
 			v := b.Values[i]
-			if v.Op == OpPhi {
-				break // Don't process phi ops.
-			}
 			liveSet.remove(v.ID)
+			if v.Op == OpPhi {
+				// Remove v from the live set, but don't add
+				// any inputs.  This is the state the len(b.Preds)>1
+				// case below desires; it wants to process phis specially.
+				continue
+			}
 			for _, a := range v.Args {
-				if !s.needReg[a.ID] {
+				if !s.values[a.ID].needReg {
 					continue
 				}
 				s.addUse(a.ID, int32(i))
@@ -613,7 +592,7 @@
 				if u == nil {
 					continue
 				}
-				fmt.Printf("v%d:", i)
+				fmt.Printf(" v%d:", i)
 				for u != nil {
 					fmt.Printf(" %d", u.dist)
 					u = u.next
@@ -643,7 +622,7 @@
 			}
 		} else if len(b.Preds) == 1 {
 			// Start regalloc state with the end state of the previous block.
-			s.setState(endRegs[b.Preds[0].ID])
+			s.setState(s.endRegs[b.Preds[0].ID])
 			if nphi > 0 {
 				f.Fatalf("phis in single-predecessor block")
 			}
@@ -669,52 +648,83 @@
 				f.Fatalf("block with no primary predecessor %s", b)
 			}
 			p := b.Preds[idx]
-			s.setState(endRegs[p.ID])
+			s.setState(s.endRegs[p.ID])
+
+			if regDebug {
+				fmt.Printf("starting merge block %s with end state of %s:\n", b, p)
+				for _, x := range s.endRegs[p.ID] {
+					fmt.Printf(" %s: orig:%s cache:%s\n", registers[x.r].Name(), x.v, x.c)
+				}
+			}
 
 			// Decide on registers for phi ops.  Use the registers determined
 			// by the primary predecessor if we can.
 			// TODO: pick best of (already processed) predecessors?
 			// Majority vote?  Deepest nesting level?
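+			// For example, if the primary predecessor ends with the
+			// phi's input in AX, keeping the phi in AX makes the edge
+			// from that predecessor a no-op.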
 			phiRegs = phiRegs[:0]
-			var used regMask
+			var phiUsed regMask
 			for _, v := range phis {
-				if v.Type.IsMemory() {
+				if !s.values[v.ID].needReg {
 					phiRegs = append(phiRegs, noRegister)
 					continue
 				}
-				regs := s.values[v.Args[idx].ID].regs
-				m := regs &^ used
+				a := v.Args[idx]
+				m := s.values[a.ID].regs &^ phiUsed
 				var r register
 				if m != 0 {
 					r = pickReg(m)
-					used |= regMask(1) << r
+					s.freeReg(r)
+					phiUsed |= regMask(1) << r
+					phiRegs = append(phiRegs, r)
 				} else {
-					r = noRegister
+					phiRegs = append(phiRegs, noRegister)
 				}
-				phiRegs = append(phiRegs, r)
 			}
-			// Change register user from phi input to phi.  Add phi spill code.
+
+			// Second pass - deallocate any phi inputs which are now dead.
+			for _, v := range phis {
+				if !s.values[v.ID].needReg {
+					continue
+				}
+				a := v.Args[idx]
+				if !liveSet.contains(a.ID) {
+					// Input is dead beyond the phi, deallocate
+					// anywhere else it might live.
+					s.freeRegs(s.values[a.ID].regs)
+				}
+			}
+
+			// Third pass - pick registers for phis whose inputs
+			// were not in a register.
 			for i, v := range phis {
-				if v.Type.IsMemory() {
+				if !s.values[v.ID].needReg {
+					continue
+				}
+				if phiRegs[i] != noRegister {
+					continue
+				}
+				m := s.compatRegs(v.Type) &^ phiUsed &^ s.used
+				if m != 0 {
+					r := pickReg(m)
+					phiRegs[i] = r
+					phiUsed |= regMask(1) << r
+				}
+			}
+
+			// Set registers for phis.  Add phi spill code.
+			for i, v := range phis {
+				if !s.values[v.ID].needReg {
 					continue
 				}
 				r := phiRegs[i]
 				if r == noRegister {
-					m := s.compatRegs(v) & ^s.used
-					if m == 0 {
-						// stack-based phi
-						// Spills will be inserted in all the predecessors below.
-						s.values[v.ID].spill = v         // v starts life spilled
-						s.values[v.ID].spillUsed = true  // use is guaranteed
-						continue
-					}
-					// Allocate phi to an unused register.
-					r = pickReg(m)
-				} else {
-					s.freeReg(r)
+					// stack-based phi
+					// Spills will be inserted in all the predecessors below.
+					s.values[v.ID].spill = v        // v starts life spilled
+					s.values[v.ID].spillUsed = true // use is guaranteed
+					continue
 				}
 				// register-based phi
-				// Transfer ownership of register from input arg to phi.
 				s.assignReg(r, v, v)
 				// Spill the phi in case we need to restore it later.
 				spill := b.NewValue1(v.Line, OpStoreReg, v.Type, v)
@@ -723,15 +733,35 @@
 				s.setOrig(spill, v)
 				s.values[v.ID].spill = spill
 				s.values[v.ID].spillUsed = false
 			}
 
-			// Save the starting state for use by incoming edges below.
-			startRegs[b.ID] = make([]*Value, numRegs)
+			// Save the starting state for use by merge edges.
+			var regList []startReg
 			for r := register(0); r < numRegs; r++ {
-				startRegs[b.ID][r] = s.regs[r].v
+				v := s.regs[r].v
+				if v == nil {
+					continue
+				}
+				if phiUsed>>r&1 != 0 {
+					// Skip registers that phis used, we'll handle those
+					// specially during merge edge processing.
+					continue
+				}
+				regList = append(regList, startReg{r, v.ID})
+			}
+			s.startRegs[b.ID] = regList
+
+			if regDebug {
+				fmt.Printf("after phis\n")
+				for _, x := range s.startRegs[b.ID] {
+					fmt.Printf(" %s: v%d\n", registers[x.r].Name(), x.vid)
+				}
 			}
 		}
 
 		// Process all the non-phi values.
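+		// For each value: dead inputs are released (advanceUses), live
+		// inputs are moved into registers the instruction accepts, an
+		// output register is picked from the op's regspec, and the result
+		// is spilled once so a later use can restore it if evicted.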
-		for idx, v := range oldSched {
+		for _, v := range oldSched {
+			if regDebug {
+				fmt.Printf(" processing %s\n", v.LongString())
+			}
 			if v.Op == OpPhi {
 				f.Fatalf("phi %s not at start of block", v)
 			}
@@ -758,9 +788,6 @@
 				continue
 			}
 			regspec := opcodeTable[v.Op].reg
-			if regDebug {
-				fmt.Printf("%d: working on %s %s %v\n", idx, v, v.LongString(), regspec)
-			}
 			if len(regspec.inputs) == 0 && len(regspec.outputs) == 0 {
 				// No register allocation required (or none specified yet)
 				s.freeRegs(regspec.clobbers)
 				s.advanceUses(v)
 				continue
 			}
 
-			if v.rematerializeable() {
+			if s.values[v.ID].rematerializeable {
 				// Value is rematerializeable, don't issue it here.
 				// It will get issued just before each use (see
 				// allocValueToReg).
@@ -800,7 +827,7 @@
 			// Pick register for output.
 			var r register
 			var mask regMask
-			if s.needReg[v.ID] {
+			if s.values[v.ID].needReg {
 				mask = regspec.outputs[0] &^ s.reserved()
 				if mask>>33&1 != 0 {
 					s.f.Fatalf("bad mask %s\n", v.LongString())
 				}
@@ -827,7 +854,7 @@
 			//    f()
 			//  }
 			// It would be good to have both spill and restore inside the IF.
-			if s.needReg[v.ID] {
+			if s.values[v.ID].needReg {
 				spill := b.NewValue1(v.Line, OpStoreReg, v.Type, v)
 				s.setOrig(spill, v)
 				s.values[v.ID].spill = spill
@@ -835,21 +862,70 @@
 			}
 		}
 
-		if c := b.Control; c != nil && s.needReg[c.ID] {
+		if v := b.Control; v != nil && s.values[v.ID].needReg {
+			if regDebug {
+				fmt.Printf(" processing control %s\n", v.LongString())
+			}
 			// Load control value into reg.
 			// TODO: regspec for block control values, instead of using
 			// register set from the control op's output.
-			s.allocValToReg(c, opcodeTable[c.Op].reg.outputs[0], false)
+			s.allocValToReg(v, opcodeTable[v.Op].reg.outputs[0], false)
 			// Remove this use from the uses list.
-			u := s.values[c.ID].uses
-			s.values[c.ID].uses = u.next
+			vi := &s.values[v.ID]
+			u := vi.uses
+			vi.uses = u.next
+			if u.next == nil {
+				s.freeRegs(vi.regs) // value is dead
+			}
 			u.next = s.freeUseRecords
 			s.freeUseRecords = u
 		}
 
-		// Record endRegs
-		endRegs[b.ID] = make([]regState, numRegs)
-		copy(endRegs[b.ID], s.regs)
+		// Save end-of-block register state.
+		var regList []endReg
+		for r := register(0); r < numRegs; r++ {
+			v := s.regs[r].v
+			if v == nil {
+				continue
+			}
+			regList = append(regList, endReg{r, v, s.regs[r].c})
+		}
+		s.endRegs[b.ID] = regList
+
+		// Check.  TODO: remove
+		{
+			liveSet.clear()
+			for _, x := range s.live[b.ID] {
+				liveSet.add(x.ID)
+			}
+			for r := register(0); r < numRegs; r++ {
+				v := s.regs[r].v
+				if v == nil {
+					continue
+				}
+				if !liveSet.contains(v.ID) {
+					s.f.Fatalf("val %s is in reg but not live at end of %s", v, b)
+				}
+			}
+		}
+
+		// If a value is live at the end of the block and
+		// isn't in a register, remember that its spill location
+		// is live.  We need to remember this information so that
+		// the liveness analysis in stackalloc is correct.
+		for _, e := range s.live[b.ID] {
+			if s.values[e.ID].regs != 0 {
+				// in a register, we'll use that source for the merge.
+				continue
+			}
+			spill := s.values[e.ID].spill
+			if spill == nil {
+				// rematerializeable values will have spill==nil.
+				continue
+			}
+			s.spillLive[b.ID] = append(s.spillLive[b.ID], spill.ID)
+			s.values[e.ID].spillUsed = true
+		}
 
 		// Clear any final uses.
 		// All that is left should be the pseudo-uses added for values which
 		// are live at the end of the block.
@@ -868,137 +944,6 @@ func (s *regAllocState) regalloc(f *Func) {
 		}
 	}
 
-	// Process merge block input edges.  They are the tricky ones.
-	dst := make([]*Value, numRegs)
-	for _, b := range f.Blocks {
-		if len(b.Preds) <= 1 {
-			continue
-		}
-		for i, p := range b.Preds {
-			if regDebug {
-				fmt.Printf("processing %s->%s\n", p, b)
-			}
-
-			// Find phis, separate them into stack & register classes.
-			stackPhis = stackPhis[:0]
-			regPhis = regPhis[:0]
-			for _, v := range b.Values {
-				if v.Op != OpPhi {
-					break
-				}
-				if v.Type.IsMemory() {
-					continue
-				}
-				if s.getHome(v) != noRegister {
-					regPhis = append(regPhis, v)
-				} else {
-					stackPhis = append(stackPhis, v)
-				}
-			}
-
-			// Start with the state that exists at the end of the
-			// predecessor block.  We'll be adding instructions here
-			// to shuffle registers & stack phis into the right spot.
-			s.setState(endRegs[p.ID])
-			s.curBlock = p
-
-			// Handle stack-based phi ops first.  We need to handle them
-			// first because we need a register with which to copy them.
-
-			// We must be careful not to overwrite any stack phis which are
-			// themselves args of other phis.  For example:
-			//  v1 = phi(v2, v3) : 8(SP)
-			//  v2 = phi(v4, v5) : 16(SP)
-			// Here we must not write v2 until v2 is read and written to v1.
-			// The situation could be even more complicated, with cycles, etc.
-			// So in the interest of being simple, we find all the phis which
-			// are arguments of other phis and copy their values to a temporary
-			// location first.  This temporary location is called "spill2" and
-			// represents a higher-priority but temporary spill location for the value.
-			// Note this is not a problem for register-based phis because
-			// if needed we will use the spilled location as the source, and
-			// the spill location is not clobbered by the code generated here.
-			argset.clear()
-			for _, v := range stackPhis {
-				argset.add(v.Args[i].ID)
-			}
-			for _, v := range regPhis {
-				argset.add(v.Args[i].ID)
-			}
-			for _, v := range stackPhis {
-				if !argset.contains(v.ID) {
-					continue
-				}
-
-				// This stack-based phi is the argument of some other
-				// phi in this block.  We must make a copy of its
-				// value so that we don't clobber it prematurely.
-				c := s.allocValToReg(v, s.compatRegs(v), false)
-				d := p.NewValue1(v.Line, OpStoreReg, v.Type, c)
-				s.setOrig(d, v)
-				s.values[v.ID].spill2 = d
-			}
-
-			// Assign to stack-based phis.  We do stack phis first because
-			// we might need a register to do the assignment.
-			for _, v := range stackPhis {
-				// Load phi arg into a register, then store it with a StoreReg.
-				// If already in a register, use that.  If not, pick a compatible
-				// register.
-				w := v.Args[i]
-				c := s.allocValToReg(w, s.compatRegs(w), false)
-				v.Args[i] = p.NewValue1(v.Line, OpStoreReg, v.Type, c)
-				s.setOrig(v.Args[i], w)
-			}
-			// Figure out what value goes in each register.
-			for r := register(0); r < numRegs; r++ {
-				dst[r] = startRegs[b.ID][r]
-			}
-			// Handle register-based phi ops.
-			for _, v := range regPhis {
-				r := s.getHome(v)
-				if dst[r] != v {
-					f.Fatalf("dst not right")
-				}
-				v.Args[i] = s.allocValToReg(v.Args[i], regMask(1)<<r, false)
-			}
-			// ... if we have A->CX and CX->DX, do the latter first.  Now if we do the
-			// former first then the latter must be a restore instead of a register move.
-
 	// Erase any spills we never used
 	for i := range s.values {
 		vi := s.values[i]
@@ -1031,24 +976,450 @@
 		// Not important now because this is the last phase that manipulates Values
 	}
 
-	// Set final regalloc result.
-	f.RegAlloc = s.home
+	// Anything that didn't get a register gets a stack location here.
+	// (StoreReg, stack-based phis, inputs, ...)
+	stacklive := stackalloc(s.f, s.spillLive)
+
+	// Fix up all merge edges.
+	s.shuffle(stacklive)
+}
+
+// shuffle fixes up all the merge edges (those going into blocks of indegree > 1).
+func (s *regAllocState) shuffle(stacklive [][]ID) {
+	var e edgeState
+	e.s = s
+	e.cache = map[ID][]*Value{}
+	e.contents = map[Location]contentRecord{}
+	if regDebug {
+		fmt.Printf("shuffle %s\n", s.f.Name)
+		fmt.Println(s.f.String())
+	}
+
+	for _, b := range s.f.Blocks {
+		if len(b.Preds) <= 1 {
+			continue
+		}
+		e.b = b
+		for i, p := range b.Preds {
+			e.p = p
+			e.setup(i, s.endRegs[p.ID], s.startRegs[b.ID], stacklive[p.ID])
+			e.process()
+		}
+	}
+}
+
+type edgeState struct {
+	s    *regAllocState
+	p, b *Block // edge goes from p->b.
+
+	// for each pre-regalloc value, a list of equivalent cached values
+	cache map[ID][]*Value
+
+	// map from location to the value it contains
+	contents map[Location]contentRecord
+
+	// desired destination locations
+	destinations []dstRecord
+	extra        []dstRecord
+
+	usedRegs   regMask // registers currently holding something
+	uniqueRegs regMask // registers holding the only copy of a value
+	finalRegs  regMask // registers holding final target
+}
+
+type contentRecord struct {
+	vid   ID     // pre-regalloc value
+	c     *Value // cached value
+	final bool   // this is a satisfied destination
+}
+
+type dstRecord struct {
+	loc    Location // register or stack slot
+	vid    ID       // pre-regalloc value it should contain
+	splice **Value  // place to store reference to the generating instruction
+}
+
+// setup initializes the edge state for shuffling.
+func (e *edgeState) setup(idx int, srcReg []endReg, dstReg []startReg, stacklive []ID) {
+	if regDebug {
+		fmt.Printf("edge %s->%s\n", e.p, e.b)
+	}
+
+	// Clear state.
+	for k := range e.cache {
+		delete(e.cache, k)
+	}
+	for k := range e.contents {
+		delete(e.contents, k)
+	}
+
+	// Live registers can be sources.
+	for _, x := range srcReg {
+		e.set(&registers[x.r], x.v.ID, x.c, false)
+	}
+	// So can all of the spill locations.
+	for _, spillID := range stacklive {
+		v := e.s.orig[spillID]
+		spill := e.s.values[v.ID].spill
+		e.set(e.s.f.getHome(spillID), v.ID, spill, false)
+	}
+
+	// Figure out all the destinations we need.
+	dsts := e.destinations[:0]
+	for _, x := range dstReg {
+		dsts = append(dsts, dstRecord{&registers[x.r], x.vid, nil})
+	}
+	// Phis need their args to end up in a specific location.
+	for _, v := range e.b.Values {
+		if v.Op != OpPhi {
+			break
+		}
+		loc := e.s.f.getHome(v.ID)
+		if loc == nil {
+			continue
+		}
+		dsts = append(dsts, dstRecord{loc, v.Args[idx].ID, &v.Args[idx]})
+	}
+	e.destinations = dsts
+
+	if regDebug {
+		for vid, a := range e.cache {
+			for _, c := range a {
+				fmt.Printf("src %s: v%d cache=%s\n", e.s.f.getHome(c.ID).Name(), vid, c)
+			}
+		}
+		for _, d := range e.destinations {
+			fmt.Printf("dst %s: v%d\n", d.loc.Name(), d.vid)
+		}
+	}
+}
+
+// process generates code to move all the values to the right destination locations.
+func (e *edgeState) process() {
+	dsts := e.destinations
+
+	// Process the destinations until they are all satisfied.
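+	// Each pass below attempts every remaining destination; processDest
+	// refuses (returns false) when satisfying a destination now would
+	// clobber the last remaining copy of some other needed value.  When
+	// a whole pass makes no progress, the leftover moves form cycles,
+	// which are broken just below by copying one entry to a temp.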
+	for len(dsts) > 0 {
+		i := 0
+		for _, d := range dsts {
+			if !e.processDest(d.loc, d.vid, d.splice) {
+				// Failed - save for next iteration.
+				dsts[i] = d
+				i++
+			}
+		}
+		if i < len(dsts) {
+			// Made some progress.  Go around again.
+			dsts = dsts[:i]
+
+			// Append any extra destinations we generated.
+			dsts = append(dsts, e.extra...)
+			e.extra = e.extra[:0]
+			continue
+		}
+
+		// We made no progress.  That means that any
+		// remaining unsatisfied moves are in simple cycles.
+		// For example, A -> B -> C -> D -> A.
+		//   A ----> B
+		//   ^       |
+		//   |       |
+		//   |       v
+		//   D <---- C
+
+		// To break the cycle, we pick an unused register, say R,
+		// and put a copy of B there.
+		//   A ----> B
+		//   ^       |
+		//   |       |
+		//   |       v
+		//   D <---- C <---- R=copyofB
+		// When we resume the outer loop, the A->B move can now proceed,
+		// and eventually the whole cycle completes.
+
+		// Copy any cycle location to a temp register.  This duplicates
+		// one of the cycle entries, allowing the just duplicated value
+		// to be overwritten and the cycle to proceed.
+		loc := dsts[0].loc
+		vid := e.contents[loc].vid
+		c := e.contents[loc].c
+		r := e.findRegFor(c.Type)
+		if regDebug {
+			fmt.Printf("breaking cycle with v%d in %s:%s\n", vid, loc.Name(), c)
+		}
+		if _, isReg := loc.(*Register); isReg {
+			c = e.p.NewValue1(c.Line, OpCopy, c.Type, c)
+		} else {
+			c = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
+		}
+		e.set(r, vid, c, false)
+	}
+}
+
+// processDest generates code to put value vid into location loc.  Returns true
+// if progress was made.
+func (e *edgeState) processDest(loc Location, vid ID, splice **Value) bool {
+	occupant := e.contents[loc]
+	if occupant.vid == vid {
+		// Value is already in the correct place.
+		e.contents[loc] = contentRecord{vid, occupant.c, true}
+		if splice != nil {
+			*splice = occupant.c
+		}
+		// Note: if splice==nil then c will appear dead.  This is
+		// non-SSA formed code, so be careful after this pass not to run
+		// deadcode elimination.
+		return true
+	}
+
+	// Check if we're allowed to clobber the destination location.
+	if len(e.cache[occupant.vid]) == 1 && !e.s.values[occupant.vid].rematerializeable {
+		// We can't overwrite the last copy
+		// of a value that needs to survive.
+		return false
+	}
+
+	// Copy from a source of v, register preferred.
+	v := e.s.orig[vid]
+	var c *Value
+	var src Location
+	if regDebug {
+		fmt.Printf("moving v%d to %s\n", vid, loc.Name())
+		fmt.Printf("sources of v%d:", vid)
+	}
+	for _, w := range e.cache[vid] {
+		h := e.s.f.getHome(w.ID)
+		if regDebug {
+			fmt.Printf(" %s:%s", h.Name(), w)
+		}
+		_, isreg := h.(*Register)
+		if src == nil || isreg {
+			c = w
+			src = h
+		}
+	}
+	if regDebug {
+		if src != nil {
+			fmt.Printf(" [use %s]\n", src.Name())
+		} else {
+			fmt.Printf(" [no source]\n")
+		}
+	}
+	_, dstReg := loc.(*Register)
+	var x *Value
+	if c == nil {
+		if !e.s.values[vid].rematerializeable {
+			e.s.f.Fatalf("can't find source for %s->%s: v%d\n", e.p, e.b, vid)
+		}
+		if dstReg {
+			x = v.copyInto(e.p)
+		} else {
+			// Rematerialize into stack slot.  Need a free
+			// register to accomplish this.
+			e.erase(loc) // see pre-clobber comment below
+			r := e.findRegFor(v.Type)
+			x = v.copyInto(e.p)
+			e.set(r, vid, x, false)
+			x = e.p.NewValue1(x.Line, OpStoreReg, x.Type, x)
+		}
+	} else {
+		// Emit move from src to dst.
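+		// There are four cases, depending on whether the source and
+		// destination are registers or stack slots:
+		//	reg->reg     OpCopy
+		//	reg->stack   OpStoreReg
+		//	stack->reg   OpLoadReg
+		//	stack->stack OpLoadReg into a temp register, then
+		//	             OpStoreReg of the temp (the mem->mem case).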
+		_, srcReg := src.(*Register)
+		if srcReg {
+			if dstReg {
+				x = e.p.NewValue1(c.Line, OpCopy, c.Type, c)
+			} else {
+				x = e.p.NewValue1(c.Line, OpStoreReg, c.Type, c)
+			}
+		} else {
+			if dstReg {
+				x = e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
+			} else {
+				// mem->mem.  Use temp register.
+
+				// Pre-clobber destination.  This avoids the
+				// following situation:
+				//   - v is currently held in R0 and stacktmp0.
+				//   - We want to copy stacktmp1 to stacktmp0.
+				//   - We choose R0 as the temporary register.
+				// During the copy, both R0 and stacktmp0 are
+				// clobbered, losing both copies of v.  Oops!
+				// Erasing the destination early means R0 will not
+				// be chosen as the temp register, as it will then
+				// be the last copy of v.
+				e.erase(loc)
+
+				r := e.findRegFor(c.Type)
+				t := e.p.NewValue1(c.Line, OpLoadReg, c.Type, c)
+				e.set(r, vid, t, false)
+				x = e.p.NewValue1(c.Line, OpStoreReg, c.Type, t)
+			}
+		}
+	}
+	e.set(loc, vid, x, true)
+	if splice != nil {
+		*splice = x
+	}
+	return true
+}
+
+// set changes the contents of location loc to hold the given value and its cached representative.
+func (e *edgeState) set(loc Location, vid ID, c *Value, final bool) {
+	e.s.f.setHome(c, loc)
+	e.erase(loc)
+	e.contents[loc] = contentRecord{vid, c, final}
+	a := e.cache[vid]
+	a = append(a, c)
+	e.cache[vid] = a
+	if r, ok := loc.(*Register); ok {
+		e.usedRegs |= regMask(1) << uint(r.Num)
+		if final {
+			e.finalRegs |= regMask(1) << uint(r.Num)
+		}
+		if len(a) == 1 {
+			e.uniqueRegs |= regMask(1) << uint(r.Num)
+		}
+		if len(a) == 2 {
+			if t, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
+				e.uniqueRegs &^= regMask(1) << uint(t.Num)
+			}
+		}
+	}
+	if regDebug {
+		fmt.Printf("%s\n", c.LongString())
+		fmt.Printf("v%d now available in %s:%s\n", vid, loc.Name(), c)
+	}
+}
+
+// erase removes any user of loc.
+func (e *edgeState) erase(loc Location) {
+	cr := e.contents[loc]
+	if cr.c == nil {
+		return
+	}
+	vid := cr.vid
+
+	if cr.final {
+		// Add a destination to move this value back into place.
+		// Make sure it gets added to the tail of the destination queue
+		// so we make progress on other moves first.
+		e.extra = append(e.extra, dstRecord{loc, cr.vid, nil})
+	}
+
+	// Remove c from the list of cached values.
+	a := e.cache[vid]
+	for i, c := range a {
+		if e.s.f.getHome(c.ID) == loc {
+			if regDebug {
+				fmt.Printf("v%d no longer available in %s:%s\n", vid, loc.Name(), c)
+			}
+			a[i], a = a[len(a)-1], a[:len(a)-1]
+			break
+		}
+	}
+	e.cache[vid] = a
+
+	// Update register masks.
+	if r, ok := loc.(*Register); ok {
+		e.usedRegs &^= regMask(1) << uint(r.Num)
+		if cr.final {
+			e.finalRegs &^= regMask(1) << uint(r.Num)
+		}
+	}
+	if len(a) == 1 {
+		if r, ok := e.s.f.getHome(a[0].ID).(*Register); ok {
+			e.uniqueRegs |= regMask(1) << uint(r.Num)
+		}
+	}
+}
+
+// findRegFor finds a register we can use to make a temp copy of type typ.
+func (e *edgeState) findRegFor(typ Type) Location {
+	// Which registers are possibilities.
+	var m regMask
+	if typ.IsFloat() {
+		m = e.s.compatRegs(e.s.f.Config.fe.TypeFloat64())
+	} else {
+		m = e.s.compatRegs(e.s.f.Config.fe.TypeInt64())
+	}
+
+	// Pick a register.  In priority order:
+	// 1) an unused register
+	// 2) a non-unique register not holding a final value
+	// 3) a non-unique register
+	x := m &^ e.usedRegs
+	if x != 0 {
+		return &registers[pickReg(x)]
+	}
+	x = m &^ e.uniqueRegs &^ e.finalRegs
+	if x != 0 {
+		return &registers[pickReg(x)]
+	}
+	x = m &^ e.uniqueRegs
+	if x != 0 {
+		return &registers[pickReg(x)]
+	}
+
+	// No register is available.
+	// Allocate a temp location to spill a register to.
+	// The type of the slot is immaterial - it will not be live across
+	// any safepoint.  Just use a type big enough to hold any register.
+	typ = e.s.f.Config.fe.TypeInt64()
+	t := LocalSlot{e.s.f.Config.fe.Auto(typ), typ, 0}
+	// TODO: reuse these slots.
+
+	// Pick a register to spill.
+	for vid, a := range e.cache {
+		for _, c := range a {
+			if r, ok := e.s.f.getHome(c.ID).(*Register); ok && m>>uint(r.Num)&1 != 0 {
+				x := e.p.NewValue1(c.Line, OpStoreReg, c.Type, c)
+				e.set(t, vid, x, false)
+				if regDebug {
+					fmt.Printf(" SPILL %s->%s %s\n", r.Name(), t.Name(), x.LongString())
+				}
+				// r will now be overwritten by the caller.  At some point
+				// later, the newly saved value will be moved back to its
+				// final destination in processDest.
+				return r
+			}
+		}
+	}
+
+	e.s.f.Fatalf("can't find empty register on edge %s->%s", e.p, e.b)
+	return nil
+}
 
 func (v *Value) rematerializeable() bool {
 	// TODO: add a flags field to opInfo for this test?
+	regspec := opcodeTable[v.Op].reg
 	// rematerializeable ops must be able to fill any register.
-	outputs := opcodeTable[v.Op].reg.outputs
+	outputs := regspec.outputs
 	if len(outputs) == 0 || countRegs(outputs[0]) <= 1 {
 		// Note: this case handles OpAMD64LoweredGetClosurePtr
 		// which can't be moved.
 		return false
 	}
+
+	// We can't rematerialize instructions which
+	// clobber the flags register.
+	if regspec.clobbers&flagRegMask != 0 {
+		if v.Op == OpAMD64MOVQconst && v.AuxInt != 0 ||
+			v.Op == OpAMD64MOVLconst && int32(v.AuxInt) != 0 ||
+			v.Op == OpAMD64MOVWconst && int16(v.AuxInt) != 0 ||
+			v.Op == OpAMD64MOVBconst && int8(v.AuxInt) != 0 {
+			// These are marked as clobbering flags, but only
+			// the 0 versions actually do.  TODO: fix MOV->XOR rewrites
+			// to understand when they are allowed to clobber flags?
+			return true
+		}
+		return false
+	}
+
 	if len(v.Args) == 0 {
 		return true
 	}
 	if len(v.Args) == 1 && (v.Args[0].Op == OpSP || v.Args[0].Op == OpSB) {
+		// SP and SB (generated by OpSP and OpSB) are always available.
 		return true
 	}
 	return false
@@ -1084,9 +1455,6 @@
 	// out to all of them.
 	po := postorder(f)
 	for {
-		for _, b := range po {
-			f.Logf("live %s %v\n", b, s.live[b.ID])
-		}
 		changed := false
 		for _, b := range po {
@@ -1099,7 +1467,7 @@
 			}
 
 			// Mark control value as live
-			if b.Control != nil && s.needReg[b.Control.ID] {
+			if b.Control != nil && s.values[b.Control.ID].needReg {
 				live.set(b.Control.ID, int32(len(b.Values)))
 			}
 
@@ -1115,7 +1483,7 @@
 					continue
 				}
 				for _, a := range v.Args {
-					if s.needReg[a.ID] {
+					if s.values[a.ID].needReg {
 						live.set(a.ID, int32(i))
 					}
 				}
@@ -1162,7 +1530,7 @@
 				// simultaneously happening at the start of the block).
 				for _, v := range phis {
 					id := v.Args[i].ID
-					if s.needReg[id] && !t.contains(id) || delta < t.get(id) {
+					if s.values[id].needReg && !t.contains(id) || delta < t.get(id) {
 						update = true
 						t.set(id, delta)
 					}
@@ -1185,6 +1553,16 @@
 			break
 		}
 	}
+	if regDebug {
+		fmt.Println("live values at end of each block")
+		for _, b := range f.Blocks {
+			fmt.Printf(" %s:", b)
+			for _, x := range s.live[b.ID] {
+				fmt.Printf(" v%d", x.ID)
+			}
+			fmt.Println()
+		}
+	}
 }
 
 // reserved returns a mask of reserved registers.
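Note: the edge fixup in shuffle/process above is an instance of the classic
parallel-moves problem.  A self-contained sketch of the same cycle-breaking
idea, using plain ints as locations (the moveAll helper is hypothetical, not
compiler code; it assumes src and dst are permutations of distinct locations
and that scratch is unused):

	package main

	import "fmt"

	// moveAll emits moves so the value starting in src[i] ends up in
	// dst[i].  A move is deferred while its destination still holds a
	// value that has not reached its own destination; when only cycles
	// remain, one cycle member is evicted to scratch to break the tie.
	func moveAll(dst, src []int, scratch int, emit func(to, from int)) {
		cur := append([]int(nil), src...) // current home of each value
		occupied := map[int]int{}         // location -> index of value there
		for i, l := range cur {
			occupied[l] = i
		}
		for {
			progress, remaining := false, 0
			for i := range dst {
				if cur[i] == dst[i] {
					continue // already in place
				}
				remaining++
				if j, ok := occupied[dst[i]]; ok && cur[j] != dst[j] {
					continue // destination still holds a needed value
				}
				emit(dst[i], cur[i])
				delete(occupied, cur[i])
				cur[i], occupied[dst[i]] = dst[i], i
				progress = true
				remaining--
			}
			if remaining == 0 {
				return
			}
			if !progress {
				// Pure cycle: evict one member to scratch so the
				// move into its old location can proceed.
				for i := range dst {
					if cur[i] != dst[i] {
						emit(scratch, cur[i])
						delete(occupied, cur[i])
						cur[i], occupied[scratch] = scratch, i
						break
					}
				}
			}
		}
	}

	func main() {
		// Swap locations 1 and 2 using 9 as scratch:
		// emits 9 <- 1, then 1 <- 2, then 2 <- 9.
		moveAll([]int{2, 1}, []int{1, 2}, 9, func(to, from int) {
			fmt.Printf("move %d <- %d\n", to, from)
		})
	}

Unlike this sketch, processDest prefers register sources, handles values
needed in several destinations, and can rematerialize instead of moving.
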
diff --git a/src/cmd/compile/internal/ssa/stackalloc.go b/src/cmd/compile/internal/ssa/stackalloc.go
index 3eb5c3cf4a..797a6b05e6 100644
--- a/src/cmd/compile/internal/ssa/stackalloc.go
+++ b/src/cmd/compile/internal/ssa/stackalloc.go
@@ -6,55 +6,65 @@
 
 package ssa
 
+import "fmt"
+
+const stackDebug = false // TODO: compiler flag
+
+type stackAllocState struct {
+	f         *Func
+	values    []stackValState
+	live      [][]ID // live[b.id] = live values at the end of block b.
+	interfere [][]ID // interfere[v.id] = values that interfere with v.
+}
+
+type stackValState struct {
+	typ      Type
+	spill    *Value
+	needSlot bool
+}
+
 // stackalloc allocates storage in the stack frame for
 // all Values that did not get a register.
-func stackalloc(f *Func) {
-	// Cache value types by ID.
-	types := make([]Type, f.NumValues())
+// Returns a map from block ID to the stack values live at the end of that block.
+func stackalloc(f *Func, spillLive [][]ID) [][]ID {
+	if stackDebug {
+		fmt.Println("before stackalloc")
+		fmt.Println(f.String())
+	}
+	var s stackAllocState
+	s.init(f, spillLive)
+	s.stackalloc()
+	return s.live
+}
+
+func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
+	s.f = f
+
+	// Initialize value information.
+	s.values = make([]stackValState, f.NumValues())
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
-			types[v.ID] = v.Type
-		}
-	}
-
-	// Build interference graph among StoreReg and stack phi ops.
-	live := f.liveSpills()
-	interfere := make([][]ID, f.NumValues())
-	s := newSparseSet(f.NumValues())
-	for _, b := range f.Blocks {
-		// Start with known live values at the end of the block.
-		s.clear()
-		for i := 0; i < len(b.Succs); i++ {
-			s.addAll(live[b.ID][i])
-		}
-
-		// Propagate backwards to the start of the block.
-		// Remember interfering sets.
-		for i := len(b.Values) - 1; i >= 0; i-- {
-			v := b.Values[i]
-			switch {
-			case v.Op == OpStoreReg, v.isStackPhi():
-				s.remove(v.ID)
-				for _, id := range s.contents() {
-					if v.Type.Equal(types[id]) {
-						// Only need interferences between equivalent types.
-						interfere[v.ID] = append(interfere[v.ID], id)
-						interfere[id] = append(interfere[id], v.ID)
-					}
-				}
-			case v.Op == OpLoadReg:
-				s.add(v.Args[0].ID)
-			case v.Op == OpArg:
-				// This is an input argument which is pre-spilled.  It is kind of
-				// like a StoreReg, but we don't remove v.ID here because we want
-				// this value to appear live even before this point.  Being live
-				// all the way to the start of the entry block prevents other
-				// values from being allocated to the same slot and clobbering
-				// the input value before we have a chance to load it.
+			s.values[v.ID].typ = v.Type
+			s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable()
+			if stackDebug && s.values[v.ID].needSlot {
+				fmt.Printf("%s needs a stack slot\n", v)
+			}
+			if v.Op == OpStoreReg {
+				s.values[v.Args[0].ID].spill = v
 			}
 		}
 	}
+
+	// Compute liveness info for values needing a slot.
+	s.computeLive(spillLive)
+
+	// Build interference graph among values needing a slot.
+	s.buildInterferenceGraph()
+}
+
+func (s *stackAllocState) stackalloc() {
+	f := s.f
+
 	// Build map from values to their names, if any.
 	// A value may be associated with more than one name (e.g. after
 	// the assignment i=j).  This step picks one name per value arbitrarily.
@@ -67,49 +77,41 @@
 		}
 	}
 
-	// Figure out which StoreReg ops are phi args.  We don't pick slots for
-	// phi args because a stack phi and its args must all use the same stack slot.
-	phiArg := make([]bool, f.NumValues())
-	for _, b := range f.Blocks {
-		for _, v := range b.Values {
-			if !v.isStackPhi() {
-				continue
-			}
-			for _, a := range v.Args {
-				phiArg[a.ID] = true
-			}
-		}
-	}
-
 	// Allocate args to their assigned locations.
 	for _, v := range f.Entry.Values {
 		if v.Op != OpArg {
 			continue
 		}
-		f.setHome(v, LocalSlot{v.Aux.(GCNode), v.Type, v.AuxInt})
+		loc := LocalSlot{v.Aux.(GCNode), v.Type, v.AuxInt}
+		if stackDebug {
+			fmt.Printf("stackalloc %s to %s\n", v, loc.Name())
+		}
+		f.setHome(v, loc)
 	}
 
 	// For each type, we keep track of all the stack slots we
 	// have allocated for that type.
+	// TODO: share slots among equivalent types.  We would need to
+	// only share among types with the same GC signature.  See the
+	// type.Equal calls below for where this matters.
 	locations := map[Type][]LocalSlot{}
 
 	// Each time we assign a stack slot to a value v, we remember
 	// the slot we used via an index into locations[v.Type].
-	// TODO: share slots among equivalent types.
 	slots := make([]int, f.NumValues())
 	for i := f.NumValues() - 1; i >= 0; i-- {
 		slots[i] = -1
 	}
 
-	// Pick a stack slot for each non-phi-arg StoreReg and each stack phi.
+	// Pick a stack slot for each value needing one.
 	used := make([]bool, f.NumValues())
 	for _, b := range f.Blocks {
 		for _, v := range b.Values {
-			if v.Op != OpStoreReg && !v.isStackPhi() {
+			if !s.values[v.ID].needSlot {
 				continue
 			}
-			if phiArg[v.ID] {
-				continue
+			if v.Op == OpArg {
+				continue // already picked
 			}
 
 			// If this is a named value, try to use the name as
 			// the stack slot.
@@ -121,7 +123,7 @@
 				name = names[v.ID]
 			}
 			if name.N != nil && v.Type.Equal(name.Type) {
-				for _, id := range interfere[v.ID] {
+				for _, id := range s.interfere[v.ID] {
 					h := f.getHome(id)
 					if h != nil && h.(LocalSlot) == name {
 						// A variable can interfere with itself.
@@ -129,22 +131,10 @@
 						goto noname
 					}
 				}
-				if v.Op == OpPhi {
-					for _, a := range v.Args {
-						for _, id := range interfere[a.ID] {
-							h := f.getHome(id)
-							if h != nil && h.(LocalSlot) == name {
-								goto noname
-							}
-						}
-					}
+				if stackDebug {
+					fmt.Printf("stackalloc %s to %s\n", v, name.Name())
 				}
 				f.setHome(v, name)
-				if v.Op == OpPhi {
-					for _, a := range v.Args {
-						f.setHome(a, name)
-					}
-				}
 				continue
 			}
 
@@ -155,25 +145,12 @@
 			for i := 0; i < len(locs); i++ {
 				used[i] = false
 			}
-			for _, xid := range interfere[v.ID] {
+			for _, xid := range s.interfere[v.ID] {
 				slot := slots[xid]
 				if slot >= 0 {
 					used[slot] = true
 				}
 			}
-			if v.Op == OpPhi {
-				// Stack phi and args must get the same stack slot, so
-				// anything the args interfere with is something the phi
-				// interferes with.
-				for _, a := range v.Args {
-					for _, xid := range interfere[a.ID] {
-						slot := slots[xid]
-						if slot >= 0 {
-							used[slot] = true
-						}
-					}
-				}
-			}
 			// Find an unused stack slot.
 			var i int
 			for i = 0; i < len(locs); i++ {
@@ -188,83 +165,80 @@
 			}
 			// Use the stack variable at that index for v.
 			loc := locs[i]
+			if stackDebug {
+				fmt.Printf("stackalloc %s to %s\n", v, loc.Name())
+			}
 			f.setHome(v, loc)
 			slots[v.ID] = i
-			if v.Op == OpPhi {
-				for _, a := range v.Args {
-					f.setHome(a, loc)
-					slots[a.ID] = i
-				}
-			}
 		}
 	}
 }
 
-// live returns a map from block ID and successor edge index to a list
-// of StoreReg/stackphi value IDs live on that edge.
+// computeLive computes a map from block ID to a list of
+// stack-slot-needing value IDs live at the end of that block.
 // TODO: this could be quadratic if lots of variables are live across lots of
 // basic blocks.
 // Figure out a way to make this function (or, more precisely, the user
 // of this function) require only linear size & time.
-func (f *Func) liveSpills() [][][]ID {
-	live := make([][][]ID, f.NumBlocks())
-	for _, b := range f.Blocks {
-		live[b.ID] = make([][]ID, len(b.Succs))
-	}
+func (s *stackAllocState) computeLive(spillLive [][]ID) {
+	s.live = make([][]ID, s.f.NumBlocks())
 	var phis []*Value
-
-	s := newSparseSet(f.NumValues())
-	t := newSparseSet(f.NumValues())
+	live := newSparseSet(s.f.NumValues())
+	t := newSparseSet(s.f.NumValues())
 
 	// Instead of iterating over f.Blocks, iterate over their postordering.
 	// Liveness information flows backward, so starting at the end
 	// increases the probability that we will stabilize quickly.
-	po := postorder(f)
+	po := postorder(s.f)
 	for {
 		changed := false
 		for _, b := range po {
 			// Start with known live values at the end of the block
-			s.clear()
-			for i := 0; i < len(b.Succs); i++ {
-				s.addAll(live[b.ID][i])
-			}
+			live.clear()
+			live.addAll(s.live[b.ID])
 
 			// Propagate backwards to the start of the block
 			phis = phis[:0]
 			for i := len(b.Values) - 1; i >= 0; i-- {
 				v := b.Values[i]
-				switch {
-				case v.Op == OpStoreReg:
-					s.remove(v.ID)
-				case v.Op == OpLoadReg:
-					s.add(v.Args[0].ID)
-				case v.isStackPhi():
-					s.remove(v.ID)
-					// save stack phi ops for later
-					phis = append(phis, v)
+				live.remove(v.ID)
+				if v.Op == OpPhi {
+					// Save phi for later.
+					// Note: its args might need a stack slot even though
+					// the phi itself doesn't.  So don't use needSlot.
+					if !v.Type.IsMemory() && !v.Type.IsVoid() {
+						phis = append(phis, v)
+					}
+					continue
+				}
+				for _, a := range v.Args {
+					if s.values[a.ID].needSlot {
+						live.add(a.ID)
+					}
 				}
 			}
 
 			// for each predecessor of b, expand its list of live-at-end values
 			// invariant: s contains the values live at the start of b (excluding phi inputs)
 			for i, p := range b.Preds {
-				// Find index of b in p's successors.
-				var j int
-				for j = 0; j < len(p.Succs); j++ {
-					if p.Succs[j] == b {
-						break
+				t.clear()
+				t.addAll(s.live[p.ID])
+				t.addAll(live.contents())
+				t.addAll(spillLive[p.ID])
+				for _, v := range phis {
+					a := v.Args[i]
+					if s.values[a.ID].needSlot {
+						t.add(a.ID)
+					}
+					if spill := s.values[a.ID].spill; spill != nil {
+						//TODO: remove? Subsumed by SpillUse?
+						t.add(spill.ID)
 					}
 				}
-				t.clear()
-				t.addAll(live[p.ID][j])
-				t.addAll(s.contents())
-				for _, v := range phis {
-					t.add(v.Args[i].ID)
-				}
-				if t.size() == len(live[p.ID][j]) {
+				if t.size() == len(s.live[p.ID]) {
 					continue
 				}
 				// grow p's live set
-				live[p.ID][j] = append(live[p.ID][j][:0], t.contents()...)
+				s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
 				changed = true
 			}
 		}
@@ -273,7 +247,11 @@
 			break
 		}
 	}
-	return live
+	if stackDebug {
+		for _, b := range s.f.Blocks {
+			fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
+		}
+	}
 }
 
 func (f *Func) getHome(vid ID) Location {
@@ -290,16 +268,51 @@ func (f *Func) setHome(v *Value, loc Location) {
 	f.RegAlloc[v.ID] = loc
 }
 
-func (v *Value) isStackPhi() bool {
-	if v.Op != OpPhi {
-		return false
+func (s *stackAllocState) buildInterferenceGraph() {
+	f := s.f
+	s.interfere = make([][]ID, f.NumValues())
+	live := newSparseSet(f.NumValues())
+	for _, b := range f.Blocks {
+		// Propagate liveness backwards to the start of the block.
+		// Two values interfere if one is defined while the other is live.
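+		// For example, given
+		//	v1 = StoreReg x
+		//	v2 = StoreReg y
+		//	v3 = LoadReg v1
+		// v2 is defined while v1 is still live, so v1 and v2 must get
+		// distinct slots.  Interference is only recorded between values
+		// of equal type, since only those compete for the same slots.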
+		live.clear()
+		live.addAll(s.live[b.ID])
+		for i := len(b.Values) - 1; i >= 0; i-- {
+			v := b.Values[i]
+			if s.values[v.ID].needSlot {
+				live.remove(v.ID)
+				for _, id := range live.contents() {
+					if s.values[v.ID].typ.Equal(s.values[id].typ) {
+						s.interfere[v.ID] = append(s.interfere[v.ID], id)
+						s.interfere[id] = append(s.interfere[id], v.ID)
+					}
+				}
+			}
+			for _, a := range v.Args {
+				if s.values[a.ID].needSlot {
+					live.add(a.ID)
+				}
+			}
+			if v.Op == OpArg && s.values[v.ID].needSlot {
+				// OpArg is an input argument which is pre-spilled.
+				// We add back v.ID here because we want this value
+				// to appear live even before this point.  Being live
+				// all the way to the start of the entry block prevents other
+				// values from being allocated to the same slot and clobbering
+				// the input value before we have a chance to load it.
+				live.add(v.ID)
+			}
+		}
 	}
-	if v.Type == TypeMem {
-		return false
+	if stackDebug {
+		for vid, i := range s.interfere {
+			if len(i) > 0 {
+				fmt.Printf("v%d interferes with", vid)
+				for _, x := range i {
+					fmt.Printf(" v%d", x)
+				}
+				fmt.Println()
+			}
+		}
 	}
-	if int(v.ID) >= len(v.Block.Func.RegAlloc) {
-		return true
-	}
-	return v.Block.Func.RegAlloc[v.ID] == nil
-	// TODO: use a separate opcode for StackPhi?
 }
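
The slot assignment in stackalloc above is a greedy first-fit coloring over
these interference lists.  A minimal standalone sketch of that idea (the
assignSlots helper is hypothetical; the real code additionally groups slots
by type and prefers a value's named variable when possible):

	package main

	import "fmt"

	// assignSlots gives each value the lowest-numbered slot not already
	// taken by one of its interference-graph neighbors.
	func assignSlots(n int, interfere [][]int) []int {
		slot := make([]int, n)
		for i := range slot {
			slot[i] = -1 // not yet assigned
		}
		for v := 0; v < n; v++ {
			used := map[int]bool{}
			for _, w := range interfere[v] {
				if slot[w] >= 0 {
					used[slot[w]] = true
				}
			}
			s := 0
			for used[s] {
				s++
			}
			slot[v] = s
		}
		return slot
	}

	func main() {
		// v0-v1 and v1-v2 interfere; v0 and v2 may share a slot.
		fmt.Println(assignSlots(3, [][]int{{1}, {0, 2}, {1}})) // [0 1 0]
	}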