cmd/compile: parallelize one more bit of rulegen

'go tool trace' pointed at an obvious inefficiency; roughly the first fifth of the program's life was CPU-heavy and making use of only one CPU core at a time. This was due to genOp being run before genLower. We did make genLower use goroutines to parallelize the work between architectures, but we didn't make genOp run in parallel too. Do that. To avoid having two layers of goroutines, simply fire off all goroutines from the main function, and inline genLower, since it now becomes just two lines of code. Overall, this shaves another ~300ms from 'go run *.go' on my laptop. name old time/op new time/op delta Rulegen 2.04s ± 2% 1.76s ± 2% -13.93% (p=0.008 n=5+5) name old user-time/op new user-time/op delta Rulegen 9.04s ± 1% 9.25s ± 1% +2.37% (p=0.008 n=5+5) name old sys-time/op new sys-time/op delta Rulegen 235ms ±14% 245ms ±16% ~ (p=0.690 n=5+5) name old peak-RSS-bytes new peak-RSS-bytes delta Rulegen 179MB ± 1% 190MB ± 2% +6.21% (p=0.008 n=5+5) Change-Id: I057e074c592afe06c831b03ca447fba12005e6f6 Reviewed-on: https://go-review.googlesource.com/c/go/+/196177 Run-TryBot: Daniel Martí <mvdan@mvdan.cc> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
2024-11-19 02:24:41 -07:00 · 2019-09-18 16:33:54 +01:00 · 2019-09-18 16:33:54 +01:00 · 3c6eaa7c0d
commit 3c6eaa7c0d
parent 90f9426573
1 changed files with 32 additions and 26 deletions
--- a/src/cmd/compile/internal/ssa/gen/main.go
+++ b/src/cmd/compile/internal/ssa/gen/main.go
@ -114,8 +114,37 @@ func main() {
 		defer pprof.StopCPUProfile()
 	}
 	sort.Sort(ArchsByName(archs))
-	genOp()
-	genLower()
+
+	// The generate tasks are run concurrently, since they are CPU-intensive
+	// that can easily make use of many cores on a machine.
+	//
+	// Note that there is no limit on the concurrency at the moment. On a
+	// four-core laptop at the time of writing, peak RSS usually reaches
+	// ~200MiB, which seems doable by practically any machine nowadays. If
+	// that stops being the case, we can cap this func to a fixed number of
+	// architectures being generated at once.
+
+	tasks := []func(){
+		genOp,
+	}
+	for _, a := range archs {
+		a := a // the funcs are ran concurrently at a later time
+		tasks = append(tasks, func() {
+			genRules(a)
+			genSplitLoadRules(a)
+		})
+	}
+	var wg sync.WaitGroup
+	for _, task := range tasks {
+		task := task
+		wg.Add(1)
+		go func() {
+			task()
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+
 	if *memprofile != "" {
 		f, err := os.Create(*memprofile)
 		if err != nil {
@ -385,8 +414,7 @@ func genOp() {
 		panic(err)
 	}

-	err = ioutil.WriteFile("../opGen.go", b, 0666)
-	if err != nil {
+	if err := ioutil.WriteFile("../opGen.go", b, 0666); err != nil {
 		log.Fatalf("can't write output: %v\n", err)
 	}

@ -432,28 +460,6 @@ func (a arch) Name() string {
 	return s
 }

-// genLower generates all arch-specific rewrite Go source files. The files are
-// generated and written concurrently, since it's a CPU-intensive task that can
-// easily make use of many cores on a machine.
-//
-// Note that there is no limit on the concurrency at the moment. On a four-core
-// laptop at the time of writing, peak RSS usually reached ~230MiB, which seems
-// doable by practically any machine nowadays. If that stops being the case, we
-// can cap this func to a fixed number of architectures being generated at once.
-func genLower() {
-	var wg sync.WaitGroup
-	for _, a := range archs {
-		a := a
-		wg.Add(1)
-		go func() {
-			genRules(a)
-			genSplitLoadRules(a)
-			wg.Done()
-		}()
-	}
-	wg.Wait()
-}
-
 // countRegs returns the number of set bits in the register mask.
 func countRegs(r regMask) int {
 	n := 0