[dev.ssa] cmd/compile/internal/ssa: Fix scheduler

The DFS scheduler can produce an invalid order. If a Value x is used by more than one other Value, then x is put into the DFS queue when its first user (call it y) is visited. It is not removed and reinserted when the second user of x (call it z) is visited, so the dependency between x and z is not respected. There is no easy way to fix this with the DFS queue because we'd have to rip values out of the middle of the DFS queue.

The new scheduler works from the end of the block backwards, scheduling instructions whose uses have all already been scheduled. A simple priority scheme breaks ties between multiple instructions that are ready to schedule simultaneously.

Keep track of whether a function has been scheduled, and make print() use the scheduled order if it has.

Fix some shift tests that this change tickles. Add unsigned right shift tests.

Change-Id: I44164c10bb92ae8ab8f76d7a5180cbafab826ea1
Reviewed-on: https://go-review.googlesource.com/13069
Reviewed-by: Todd Neal <todd@tneal.org>
2024-10-05 16:41:21 -06:00 · 2015-08-03 12:33:03 -07:00 · 2015-08-03 12:33:03 -07:00 · a678a5c7a5
commit a678a5c7a5
parent 4dcf8ea1a4
4 changed files with 132 additions and 106 deletions
--- a/src/cmd/compile/internal/gc/testdata/arith_ssa.go
+++ b/src/cmd/compile/internal/gc/testdata/arith_ssa.go
@@ -68,7 +68,7 @@ func testBitwiseLogic() {
 		failed = true
 	}
 	if want, got := int32(832), testBitwiseLsh_ssa(13, 4, 2); want != got {
-		println("testBitwiseXor failed, wanted", want, "got", got)
+		println("testBitwiseLsh failed, wanted", want, "got", got)
 		failed = true
 	}
 	if want, got := int32(0), testBitwiseLsh_ssa(13, 25, 15); want != got {
@@ -79,16 +79,28 @@ func testBitwiseLogic() {
 		println("testBitwiseLsh failed, wanted", want, "got", got)
 		failed = true
 	}
-	if want, got := int32(0), testBitwiseRsh_ssa(-13, 25, 15); want != got {
-		println("testBitwiseLsh failed, wanted", want, "got", got)
+	if want, got := int32(-13), testBitwiseRsh_ssa(-832, 4, 2); want != got {
+		println("testBitwiseRsh failed, wanted", want, "got", got)
 		failed = true
 	}
 	if want, got := int32(0), testBitwiseRsh_ssa(13, 25, 15); want != got {
-		println("testBitwiseLsh failed, wanted", want, "got", got)
+		println("testBitwiseRsh failed, wanted", want, "got", got)
 		failed = true
 	}
 	if want, got := int32(-1), testBitwiseRsh_ssa(-13, 25, 15); want != got {
-		println("testBitwiseLsh failed, wanted", want, "got", got)
+		println("testBitwiseRsh failed, wanted", want, "got", got)
+		failed = true
+	}
+	if want, got := uint32(0x3ffffff), testBitwiseRshU_ssa(0xffffffff, 4, 2); want != got {
+		println("testBitwiseRshU failed, wanted", want, "got", got)
+		failed = true
+	}
+	if want, got := uint32(0), testBitwiseRshU_ssa(13, 25, 15); want != got {
+		println("testBitwiseRshU failed, wanted", want, "got", got)
+		failed = true
+	}
+	if want, got := uint32(0), testBitwiseRshU_ssa(0x8aaaaaaa, 25, 15); want != got {
+		println("testBitwiseRshU failed, wanted", want, "got", got)
 		failed = true
 	}
 }
@@ -123,6 +135,12 @@ func testBitwiseRsh_ssa(a int32, b, c uint32) int32 {
 	return a >> b >> c
 }

+func testBitwiseRshU_ssa(a uint32, b, c uint32) uint32 {
+	switch { // prevent inlining
+	}
+	return a >> b >> c
+}
+
 // testSubqToNegq ensures that the SUBQ -> NEGQ translation works correctly.
 func testSubqToNegq() {
 	want := int64(-318294940372190156)
--- a/src/cmd/compile/internal/ssa/func.go
+++ b/src/cmd/compile/internal/ssa/func.go
@@ -18,6 +18,8 @@ type Func struct {
 	bid        idAlloc     // block ID allocator
 	vid        idAlloc     // value ID allocator

+	scheduled bool // Values in Blocks are in final order
+
 	// when register allocation is done, maps value ids to locations
 	RegAlloc []Location
 	// when stackalloc is done, the size of the stack frame
--- a/src/cmd/compile/internal/ssa/print.go
+++ b/src/cmd/compile/internal/ssa/print.go
@@ -34,9 +34,19 @@ func fprintFunc(w io.Writer, f *Func) {
 			}
 		}
 		io.WriteString(w, "\n")
-		n := 0
+
+		if f.scheduled {
+			// Order of Values has been decided - print in that order.
+			for _, v := range b.Values {
+				fmt.Fprint(w, "    ")
+				fmt.Fprintln(w, v.LongString())
+				printed[v.ID] = true
+			}
+			continue
+		}

 		// print phis first since all value cycles contain a phi
+		n := 0
 		for _, v := range b.Values {
 			if v.Op != OpPhi {
 				continue
--- a/src/cmd/compile/internal/ssa/schedule.go
+++ b/src/cmd/compile/internal/ssa/schedule.go
@@ -6,121 +6,117 @@ package ssa

 // Schedule the Values in each Block.  After this phase returns, the
 // order of b.Values matters and is the order in which those values
-// will appear in the assembly output.  For now it generates an
-// arbitrary valid schedule using a topological sort.  TODO(khr):
+// will appear in the assembly output.  For now it generates a
+// reasonable valid schedule using a priority queue.  TODO(khr):
 // schedule smarter.
 func schedule(f *Func) {
-	const (
-		unmarked = 0
-		found    = 1
-		expanded = 2
-		done     = 3
-	)
-	state := make([]byte, f.NumValues())
-	var queue []*Value //stack-like worklist.  Contains found and expanded nodes.
+	// For each value, the number of times it is used in the block
+	// by values that have not been scheduled yet.
+	uses := make([]int, f.NumValues())
+
+	// "priority" for a value
+	score := make([]int, f.NumValues())
+
+	// scheduling order.  We queue values in this list in reverse order.
 	var order []*Value

-	nextMem := make([]*Value, f.NumValues()) // maps mem values to the next live value
-	additionalEdges := make([][]*Value, f.NumValues())
+	// priority queue of legally schedulable (0 unscheduled uses) values
+	var priq [4][]*Value
+
 	for _, b := range f.Blocks {
-		// Set the nextMem values for this block.  If the previous
-		// write is from a different block, then its nextMem entry
-		// might have already been set during processing of an earlier
-		// block.  This loop resets the nextMem entries to be correct
-		// for this block.
+		// Compute uses.
 		for _, v := range b.Values {
-			if v.Type.IsMemory() {
+			if v.Op != OpPhi {
+				// Note: if a value is used by a phi, it does not induce
+				// a scheduling edge because that use is from the
+				// previous iteration.
 				for _, w := range v.Args {
-					if w.Type.IsMemory() {
-						nextMem[w.ID] = v
+					if w.Block == b {
+						uses[w.ID]++
 					}
 				}
 			}
 		}
-		// Add a anti-dependency between each load v and the memory value n
-		// following the memory value that v loads from.
-		// This will enforce the single-live-mem restriction.
+		// Compute score.  Larger numbers are scheduled closer to the end of the block.
 		for _, v := range b.Values {
-			if v.Type.IsMemory() {
-				continue
-			}
-			for _, w := range v.Args {
-				if w.Type.IsMemory() && nextMem[w.ID] != nil {
-					// Filter for intra-block edges.
-					if n := nextMem[w.ID]; n.Block == b {
-						additionalEdges[n.ID] = append(additionalEdges[n.ID], v)
-					}
-				}
-			}
-		}
-
-		order = order[:0]
-
-		// Schedule phis first
-		for _, v := range b.Values {
-			if v.Op == OpPhi {
-				// TODO: what if a phi is also a control op?  It happens for
-				// mem ops all the time, which shouldn't matter.  But for
-				// regular ops we might be violating invariants about where
-				// control ops live.
-				if v == b.Control && !v.Type.IsMemory() {
-					f.Unimplementedf("phi is a control op %s %s", v, b)
-				}
-				order = append(order, v)
-			}
-		}
-
-		// Topologically sort the non-phi values in b.
-		for _, v := range b.Values {
-			if v.Op == OpPhi {
-				continue
-			}
-			if v == b.Control {
-				continue
-			}
-			if state[v.ID] != unmarked {
-				if state[v.ID] != done {
-					panic("bad state")
-				}
-				continue
-			}
-			state[v.ID] = found
-			queue = append(queue, v)
-			for len(queue) > 0 {
-				v = queue[len(queue)-1]
-				switch state[v.ID] {
-				case found:
-					state[v.ID] = expanded
-					// Note that v is not popped.  We leave it in place
-					// until all its children have been explored.
-					for _, w := range v.Args {
-						if w.Block == b && w.Op != OpPhi && w != b.Control && state[w.ID] == unmarked {
-							state[w.ID] = found
-							queue = append(queue, w)
-						}
-					}
-					for _, w := range additionalEdges[v.ID] {
-						if w.Block == b && w.Op != OpPhi && w != b.Control && state[w.ID] == unmarked {
-							state[w.ID] = found
-							queue = append(queue, w)
-						}
-					}
-				case expanded:
-					queue = queue[:len(queue)-1]
-					state[v.ID] = done
-					order = append(order, v)
-				default:
-					panic("bad state")
-				}
+			switch {
+			case v.Op == OpPhi:
+				// We want all the phis first.
+				score[v.ID] = 0
+			case v.Type.IsMemory():
+				// Schedule stores as late as possible.
+				// This makes sure that loads do not get scheduled
+				// after a following store (1-live-memory requirement).
+				score[v.ID] = 2
+			case v.Type.IsFlags():
+				// Schedule flag register generation as late as possible.
+				// This makes sure that we only have one live flags
+				// value at a time.
+				score[v.ID] = 2
+			default:
+				score[v.ID] = 1
 			}
 		}
 		if b.Control != nil {
-			order = append(order, b.Control)
+			// Force the control value to be scheduled at the end.
+			score[b.Control.ID] = 3
+			// TODO: some times control values are used by other values
+			// in the block.  So the control value will not appear at
+			// the very end.  Decide if this is a problem or not.
+		}
+
+		// Initialize priority queue with schedulable values.
+		for i := range priq {
+			priq[i] = priq[i][:0]
+		}
+		for _, v := range b.Values {
+			if uses[v.ID] == 0 {
+				s := score[v.ID]
+				priq[s] = append(priq[s], v)
+			}
+		}
+
+		// Schedule highest priority value, update use counts, repeat.
+		order = order[:0]
+		for {
+			// Find highest priority schedulable value.
+			var v *Value
+			for i := len(priq) - 1; i >= 0; i-- {
+				n := len(priq[i])
+				if n == 0 {
+					continue
+				}
+				v = priq[i][n-1]
+				priq[i] = priq[i][:n-1]
+				break
+			}
+			if v == nil {
+				break
+			}
+
+			// Add it to the schedule.
+			order = append(order, v)
+
+			// Update use counts of arguments.
+			for _, w := range v.Args {
+				if w.Block != b {
+					continue
+				}
+				uses[w.ID]--
+				if uses[w.ID] == 0 {
+					// All uses scheduled, w is now schedulable.
+					s := score[w.ID]
+					priq[s] = append(priq[s], w)
+				}
+			}
+		}
+		if len(order) != len(b.Values) {
+			f.Fatalf("schedule does not include all values")
+		}
+		for i := 0; i < len(b.Values); i++ {
+			b.Values[i] = order[len(b.Values)-1-i]
 		}
-		copy(b.Values, order)
 	}
-	// TODO: only allow one live flags type (x86)
-	// This restriction will force and any flag uses to appear before
-	// the next flag update.  This "anti-dependence" is not recorded
-	// explicitly in ssa form.
+
+	f.scheduled = true
 }