
[dev.ssa] cmd/compile: better register allocator

Reorder how register & stack allocation is done.  We used to allocate
registers, then fix up merge edges, then allocate stack slots.  This
led to lots of unnecessary copies on merge edges:

v2 = LoadReg v1
v3 = StoreReg v2

If v1 and v3 are allocated to the same stack slot, then this code is
unnecessary.  But at regalloc time we didn't know the homes of v1 and
v3.

To fix this problem, allocate all the stack slots before fixing up the
merge edges.  That way, we know what stack slots values use so we know
what copies are required.
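
Concretely, the reordering turns stack allocation from a standalone pass (see the compile.go hunk below) into a step invoked from regalloc before merge edges are fixed up. The outline below is only a conceptual sketch: assignRegisters and shuffleMergeEdges are made-up stand-in names with stub bodies, not the compiler's actual code.

    package main

    import "fmt"

    type ID int

    type Func struct{ Name string }

    // Hypothetical stand-ins for the work regalloc does around stackalloc.
    func assignRegisters(f *Func) (spillLive [][]ID)              { return nil } // choose registers, record spill liveness
    func stackalloc(f *Func, spillLive [][]ID) (stackLive [][]ID) { return nil } // choose slots for unregistered values
    func shuffleMergeEdges(f *Func, stackLive [][]ID)             {}             // emit only the copies still needed

    func regalloc(f *Func) {
        spillLive := assignRegisters(f)
        // Slots are known *before* merge edges are fixed up, so a value spilled
        // in a predecessor and reloaded in the successor can share one slot
        // instead of being copied (the v2/v3 pair above disappears).
        stackLive := stackalloc(f, spillLive)
        shuffleMergeEdges(f, stackLive)
    }

    func main() {
        regalloc(&Func{Name: "example"})
        fmt.Println("ordering sketch ran")
    }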

Use a good technique for shuffling values around on merge edges.
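
The shuffling on a merge edge amounts to a parallel move: each destination location must receive the value currently sitting in some source location, and cycles (two locations swapping contents) need a scratch location. The sketch below illustrates one standard way to sequentialize such moves; Move, sequentialize, and the scratch-location assumption are mine for illustration, and the compiler's real edge-shuffling code (in the large suppressed diff below) is more involved.

    package main

    import "fmt"

    // Move says dst must end up holding whatever src holds now.
    type Move struct{ Dst, Src string }

    // sequentialize orders a set of parallel moves so each emitted copy only
    // overwrites a location whose old value is no longer needed, using tmp
    // (assumed to be a scratch location not mentioned in moves) to break cycles.
    func sequentialize(moves []Move, tmp string) []Move {
        pending := append([]Move(nil), moves...)
        var out []Move
        for len(pending) > 0 {
            progress := false
            for i, m := range pending {
                if m.Dst == m.Src {
                    pending = append(pending[:i], pending[i+1:]...) // nothing to do
                    progress = true
                    break
                }
                blocked := false // is m.Dst still needed as a source by another move?
                for j, n := range pending {
                    if j != i && n.Src == m.Dst {
                        blocked = true
                        break
                    }
                }
                if !blocked {
                    out = append(out, m)
                    pending = append(pending[:i], pending[i+1:]...)
                    progress = true
                    break
                }
            }
            if !progress {
                // Everything left is part of a cycle: save one destination to tmp
                // and redirect its readers, which unblocks that move next round.
                m := pending[0]
                out = append(out, Move{tmp, m.Dst})
                for j := range pending {
                    if pending[j].Src == m.Dst {
                        pending[j].Src = tmp
                    }
                }
            }
        }
        return out
    }

    func main() {
        // Two values swap registers across the edge; a third moves off the stack.
        moves := []Move{{"R1", "R2"}, {"R2", "R1"}, {"R3", "slot0"}}
        for _, m := range sequentialize(moves, "tmp") {
            fmt.Printf("%s <- %s\n", m.Dst, m.Src)
        }
        // Prints: R3 <- slot0, tmp <- R1, R1 <- R2, R2 <- tmp
    }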

Improves performance of the go1 TimeParse benchmark by ~12%

Change-Id: I731f43e4ff1a7e0dc4cd4aa428fcdb97812b86fa
Reviewed-on: https://go-review.googlesource.com/17915
Reviewed-by: David Chase <drchase@google.com>
Keith Randall 2015-12-17 10:01:24 -08:00
parent 5b355a7907
commit 7d9f1067d1
4 changed files with 796 additions and 417 deletions

src/cmd/compile/internal/ssa/compile.go

@@ -102,7 +102,6 @@ var passes = [...]pass{
{"schedule", schedule}, // schedule values
{"flagalloc", flagalloc}, // allocate flags register
{"regalloc", regalloc},
{"stackalloc", stackalloc},
}
// Double-check phase ordering constraints.
@@ -138,8 +137,6 @@ var passOrder = [...]constraint{
{"critical", "regalloc"},
// regalloc requires all the values in a block to be scheduled
{"schedule", "regalloc"},
// stack allocation requires register allocation
{"regalloc", "stackalloc"},
// checkLower must run after lowering & subsequent dead code elim
{"lower", "checkLower"},
{"lowered deadcode", "checkLower"},
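
With stackalloc gone from the pass list, its ordering constraint goes away too. For context, the "double-check phase ordering constraints" comment refers to a sanity check that each constraint pair is respected by the order of the pass list; here is a minimal sketch of such a check, using assumed names rather than the actual compile.go code.

    package main

    import "fmt"

    // checkOrder verifies that every {before, after} pair in constraints
    // appears in that order in the passes list.
    func checkOrder(passes []string, constraints [][2]string) error {
        pos := map[string]int{}
        for i, name := range passes {
            pos[name] = i
        }
        for _, c := range constraints {
            a, aok := pos[c[0]]
            b, bok := pos[c[1]]
            if !aok || !bok {
                return fmt.Errorf("unknown pass in constraint %v", c)
            }
            if a >= b {
                return fmt.Errorf("%q must run before %q", c[0], c[1])
            }
        }
        return nil
    }

    func main() {
        passes := []string{"critical", "schedule", "flagalloc", "regalloc"}
        constraints := [][2]string{{"critical", "regalloc"}, {"schedule", "regalloc"}}
        fmt.Println(checkOrder(passes, constraints)) // <nil>: ordering is consistent
    }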

src/cmd/compile/internal/ssa/flagalloc.go

@@ -21,15 +21,6 @@ func flagalloc(f *Func) {
// Walk blocks backwards. Poor-man's postorder traversal.
for i := len(f.Blocks) - 1; i >= 0; i-- {
b := f.Blocks[i]
if len(b.Preds) > 1 {
// Don't use any flags register at the start
// of a merge block. This causes problems
// in regalloc because some of the rematerialization
// instructions used on incoming merge edges clobber
// the flags register.
// TODO: only for architectures where this matters?
continue
}
// Walk values backwards to figure out what flag
// value we want in the flag register at the start
// of the block.

src/cmd/compile/internal/ssa/regalloc.go (file diff suppressed because it is too large)

src/cmd/compile/internal/ssa/stackalloc.go

@@ -6,55 +6,65 @@
package ssa
import "fmt"
const stackDebug = false // TODO: compiler flag
type stackAllocState struct {
f *Func
values []stackValState
live [][]ID // live[b.id] = live values at the end of block b.
interfere [][]ID // interfere[v.id] = values that interfere with v.
}
type stackValState struct {
typ Type
spill *Value
needSlot bool
}
// stackalloc allocates storage in the stack frame for
// all Values that did not get a register.
func stackalloc(f *Func) {
// Cache value types by ID.
types := make([]Type, f.NumValues())
// Returns a map from block ID to the stack values live at the end of that block.
func stackalloc(f *Func, spillLive [][]ID) [][]ID {
if stackDebug {
fmt.Println("before stackalloc")
fmt.Println(f.String())
}
var s stackAllocState
s.init(f, spillLive)
s.stackalloc()
return s.live
}
func (s *stackAllocState) init(f *Func, spillLive [][]ID) {
s.f = f
// Initialize value information.
s.values = make([]stackValState, f.NumValues())
for _, b := range f.Blocks {
for _, v := range b.Values {
types[v.ID] = v.Type
}
}
// Build interference graph among StoreReg and stack phi ops.
live := f.liveSpills()
interfere := make([][]ID, f.NumValues())
s := newSparseSet(f.NumValues())
for _, b := range f.Blocks {
// Start with known live values at the end of the block.
s.clear()
for i := 0; i < len(b.Succs); i++ {
s.addAll(live[b.ID][i])
}
// Propagate backwards to the start of the block.
// Remember interfering sets.
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
switch {
case v.Op == OpStoreReg, v.isStackPhi():
s.remove(v.ID)
for _, id := range s.contents() {
if v.Type.Equal(types[id]) {
// Only need interferences between equivalent types.
interfere[v.ID] = append(interfere[v.ID], id)
interfere[id] = append(interfere[id], v.ID)
}
}
case v.Op == OpLoadReg:
s.add(v.Args[0].ID)
case v.Op == OpArg:
// This is an input argument which is pre-spilled. It is kind of
// like a StoreReg, but we don't remove v.ID here because we want
// this value to appear live even before this point. Being live
// all the way to the start of the entry block prevents other
// values from being allocated to the same slot and clobbering
// the input value before we have a chance to load it.
s.values[v.ID].typ = v.Type
s.values[v.ID].needSlot = !v.Type.IsMemory() && !v.Type.IsVoid() && !v.Type.IsFlags() && f.getHome(v.ID) == nil && !v.rematerializeable()
if stackDebug && s.values[v.ID].needSlot {
fmt.Printf("%s needs a stack slot\n", v)
}
if v.Op == OpStoreReg {
s.values[v.Args[0].ID].spill = v
}
}
}
// Compute liveness info for values needing a slot.
s.computeLive(spillLive)
// Build interference graph among values needing a slot.
s.buildInterferenceGraph()
}
func (s *stackAllocState) stackalloc() {
f := s.f
// Build map from values to their names, if any.
// A value may be associated with more than one name (e.g. after
// the assignment i=j). This step picks one name per value arbitrarily.
@@ -67,49 +77,41 @@ func stackalloc(f *Func) {
}
}
// Figure out which StoreReg ops are phi args. We don't pick slots for
// phi args because a stack phi and its args must all use the same stack slot.
phiArg := make([]bool, f.NumValues())
for _, b := range f.Blocks {
for _, v := range b.Values {
if !v.isStackPhi() {
continue
}
for _, a := range v.Args {
phiArg[a.ID] = true
}
}
}
// Allocate args to their assigned locations.
for _, v := range f.Entry.Values {
if v.Op != OpArg {
continue
}
f.setHome(v, LocalSlot{v.Aux.(GCNode), v.Type, v.AuxInt})
loc := LocalSlot{v.Aux.(GCNode), v.Type, v.AuxInt}
if stackDebug {
fmt.Printf("stackalloc %s to %s\n", v, loc.Name())
}
f.setHome(v, loc)
}
// For each type, we keep track of all the stack slots we
// have allocated for that type.
// TODO: share slots among equivalent types. We would need to
// only share among types with the same GC signature. See the
// type.Equal calls below for where this matters.
locations := map[Type][]LocalSlot{}
// Each time we assign a stack slot to a value v, we remember
// the slot we used via an index into locations[v.Type].
// TODO: share slots among equivalent types.
slots := make([]int, f.NumValues())
for i := f.NumValues() - 1; i >= 0; i-- {
slots[i] = -1
}
// Pick a stack slot for each non-phi-arg StoreReg and each stack phi.
// Pick a stack slot for each value needing one.
used := make([]bool, f.NumValues())
for _, b := range f.Blocks {
for _, v := range b.Values {
if v.Op != OpStoreReg && !v.isStackPhi() {
if !s.values[v.ID].needSlot {
continue
}
if phiArg[v.ID] {
continue
if v.Op == OpArg {
continue // already picked
}
// If this is a named value, try to use the name as
@@ -121,7 +123,7 @@ func stackalloc(f *Func) {
name = names[v.ID]
}
if name.N != nil && v.Type.Equal(name.Type) {
for _, id := range interfere[v.ID] {
for _, id := range s.interfere[v.ID] {
h := f.getHome(id)
if h != nil && h.(LocalSlot) == name {
// A variable can interfere with itself.
@@ -129,22 +131,10 @@ func stackalloc(f *Func) {
goto noname
}
}
if v.Op == OpPhi {
for _, a := range v.Args {
for _, id := range interfere[a.ID] {
h := f.getHome(id)
if h != nil && h.(LocalSlot) == name {
goto noname
}
}
}
if stackDebug {
fmt.Printf("stackalloc %s to %s\n", v, name.Name())
}
f.setHome(v, name)
if v.Op == OpPhi {
for _, a := range v.Args {
f.setHome(a, name)
}
}
continue
}
@@ -155,25 +145,12 @@ func stackalloc(f *Func) {
for i := 0; i < len(locs); i++ {
used[i] = false
}
for _, xid := range interfere[v.ID] {
for _, xid := range s.interfere[v.ID] {
slot := slots[xid]
if slot >= 0 {
used[slot] = true
}
}
if v.Op == OpPhi {
// Stack phi and args must get the same stack slot, so
// anything the args interfere with is something the phi
// interferes with.
for _, a := range v.Args {
for _, xid := range interfere[a.ID] {
slot := slots[xid]
if slot >= 0 {
used[slot] = true
}
}
}
}
// Find an unused stack slot.
var i int
for i = 0; i < len(locs); i++ {
@@ -188,83 +165,80 @@ func stackalloc(f *Func) {
}
// Use the stack variable at that index for v.
loc := locs[i]
if stackDebug {
fmt.Printf("stackalloc %s to %s\n", v, loc.Name())
}
f.setHome(v, loc)
slots[v.ID] = i
if v.Op == OpPhi {
for _, a := range v.Args {
f.setHome(a, loc)
slots[a.ID] = i
}
}
}
}
}
// live returns a map from block ID and successor edge index to a list
// of StoreReg/stackphi value IDs live on that edge.
// computeLive computes a map from block ID to a list of
// stack-slot-needing value IDs live at the end of that block.
// TODO: this could be quadratic if lots of variables are live across lots of
// basic blocks. Figure out a way to make this function (or, more precisely, the user
// of this function) require only linear size & time.
func (f *Func) liveSpills() [][][]ID {
live := make([][][]ID, f.NumBlocks())
for _, b := range f.Blocks {
live[b.ID] = make([][]ID, len(b.Succs))
}
func (s *stackAllocState) computeLive(spillLive [][]ID) {
s.live = make([][]ID, s.f.NumBlocks())
var phis []*Value
s := newSparseSet(f.NumValues())
t := newSparseSet(f.NumValues())
live := newSparseSet(s.f.NumValues())
t := newSparseSet(s.f.NumValues())
// Instead of iterating over f.Blocks, iterate over their postordering.
// Liveness information flows backward, so starting at the end
// increases the probability that we will stabilize quickly.
po := postorder(f)
po := postorder(s.f)
for {
changed := false
for _, b := range po {
// Start with known live values at the end of the block
s.clear()
for i := 0; i < len(b.Succs); i++ {
s.addAll(live[b.ID][i])
}
live.clear()
live.addAll(s.live[b.ID])
// Propagate backwards to the start of the block
phis = phis[:0]
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
switch {
case v.Op == OpStoreReg:
s.remove(v.ID)
case v.Op == OpLoadReg:
s.add(v.Args[0].ID)
case v.isStackPhi():
s.remove(v.ID)
// save stack phi ops for later
phis = append(phis, v)
live.remove(v.ID)
if v.Op == OpPhi {
// Save phi for later.
// Note: its args might need a stack slot even though
// the phi itself doesn't. So don't use needSlot.
if !v.Type.IsMemory() && !v.Type.IsVoid() {
phis = append(phis, v)
}
continue
}
for _, a := range v.Args {
if s.values[a.ID].needSlot {
live.add(a.ID)
}
}
}
// for each predecessor of b, expand its list of live-at-end values
// invariant: s contains the values live at the start of b (excluding phi inputs)
for i, p := range b.Preds {
// Find index of b in p's successors.
var j int
for j = 0; j < len(p.Succs); j++ {
if p.Succs[j] == b {
break
t.clear()
t.addAll(s.live[p.ID])
t.addAll(live.contents())
t.addAll(spillLive[p.ID])
for _, v := range phis {
a := v.Args[i]
if s.values[a.ID].needSlot {
t.add(a.ID)
}
if spill := s.values[a.ID].spill; spill != nil {
//TODO: remove? Subsumed by SpillUse?
t.add(spill.ID)
}
}
t.clear()
t.addAll(live[p.ID][j])
t.addAll(s.contents())
for _, v := range phis {
t.add(v.Args[i].ID)
}
if t.size() == len(live[p.ID][j]) {
if t.size() == len(s.live[p.ID]) {
continue
}
// grow p's live set
live[p.ID][j] = append(live[p.ID][j][:0], t.contents()...)
s.live[p.ID] = append(s.live[p.ID][:0], t.contents()...)
changed = true
}
}
@@ -273,7 +247,11 @@ func (f *Func) liveSpills() [][][]ID {
break
}
}
return live
if stackDebug {
for _, b := range s.f.Blocks {
fmt.Printf("stacklive %s %v\n", b, s.live[b.ID])
}
}
}
func (f *Func) getHome(vid ID) Location {
@@ -290,16 +268,51 @@ func (f *Func) setHome(v *Value, loc Location) {
f.RegAlloc[v.ID] = loc
}
func (v *Value) isStackPhi() bool {
if v.Op != OpPhi {
return false
func (s *stackAllocState) buildInterferenceGraph() {
f := s.f
s.interfere = make([][]ID, f.NumValues())
live := newSparseSet(f.NumValues())
for _, b := range f.Blocks {
// Propagate liveness backwards to the start of the block.
// Two values interfere if one is defined while the other is live.
live.clear()
live.addAll(s.live[b.ID])
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
if s.values[v.ID].needSlot {
live.remove(v.ID)
for _, id := range live.contents() {
if s.values[v.ID].typ.Equal(s.values[id].typ) {
s.interfere[v.ID] = append(s.interfere[v.ID], id)
s.interfere[id] = append(s.interfere[id], v.ID)
}
}
}
for _, a := range v.Args {
if s.values[a.ID].needSlot {
live.add(a.ID)
}
}
if v.Op == OpArg && s.values[v.ID].needSlot {
// OpArg is an input argument which is pre-spilled.
// We add back v.ID here because we want this value
// to appear live even before this point. Being live
// all the way to the start of the entry block prevents other
// values from being allocated to the same slot and clobbering
// the input value before we have a chance to load it.
live.add(v.ID)
}
}
}
if v.Type == TypeMem {
return false
if stackDebug {
for vid, i := range s.interfere {
if len(i) > 0 {
fmt.Printf("v%d interferes with", vid)
for _, x := range i {
fmt.Printf(" v%d", x)
}
fmt.Println()
}
}
}
if int(v.ID) >= len(v.Block.Func.RegAlloc) {
return true
}
return v.Block.Func.RegAlloc[v.ID] == nil
// TODO: use a separate opcode for StackPhi?
}
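
To summarize the new stackalloc flow: computeLive finds which slot-needing values are live out of each block, buildInterferenceGraph records when two values of the same type are live at the same time (one defined while the other is live), and the assignment loop then hands out slots greedily, reusing a slot of the right type whenever no interfering value holds it. The toy program below sketches only that last step, with made-up IDs, types, and interference sets; it is not the compiler's code.

    package main

    import "fmt"

    type ID int

    // assignSlots greedily gives each value a slot, reusing an existing slot of
    // the same type whenever no interfering value already occupies it.
    func assignSlots(need []ID, typ map[ID]string, interfere map[ID][]ID) map[ID]string {
        slotsByType := map[string][]string{} // all slots created so far, per type
        slotOf := map[ID]int{}               // chosen slot index, per value
        home := map[ID]string{}

        for _, v := range need {
            t := typ[v]
            used := make([]bool, len(slotsByType[t]))
            // Mark slots already taken by interfering values of the same type.
            for _, x := range interfere[v] {
                if typ[x] == t {
                    if i, ok := slotOf[x]; ok {
                        used[i] = true
                    }
                }
            }
            // Reuse the first free slot, or create a new one.
            i := 0
            for i < len(used) && used[i] {
                i++
            }
            if i == len(slotsByType[t]) {
                slotsByType[t] = append(slotsByType[t], fmt.Sprintf("%s_slot%d", t, i))
            }
            slotOf[v] = i
            home[v] = slotsByType[t][i]
        }
        return home
    }

    func main() {
        need := []ID{1, 2, 3}
        typ := map[ID]string{1: "int64", 2: "int64", 3: "int64"}
        // v1 and v2 are live at the same time, so they must not share a slot;
        // v3 overlaps neither and can reuse v1's slot.
        interfere := map[ID][]ID{1: {2}, 2: {1}}
        for v, slot := range assignSlots(need, typ, interfere) {
            fmt.Printf("v%d -> %s\n", v, slot)
        }
    }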