From 48a6362dff840cfe75d5f1a674157af1f5d9ee85 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Sun, 12 Nov 2023 21:48:31 +0000 Subject: [PATCH] cmd/trace/v2: add goroutine analysis pages This is a complete fork and most of a rewrite of the goroutine analysis pages for v2 traces. It fixes an issue with the old page where GC time didn't really make any sense, generalizes the page and breaks things down further, and adds clarifying text. This change also modifies the SummarizeGoroutines API to not stream the trace. This is unfortunate, but we're already reading and holding the entire trace in memory for the trace viewer. We can revisit this decision in the future. Also, we want to do this now because the GoroutineSummary holds on to pointers to events, and these events will be used by the user region and user task analyses. While tracev2 events are values and they should be equivalent no matter how many times we parse a trace, this lets us reference the event in the slice directly. For #60773. For #63960. Fixes #62443. Change-Id: I1c5ab68141869378843f4f2826686038e4533090 Reviewed-on: https://go-review.googlesource.com/c/go/+/541257 Reviewed-by: Michael Pratt Auto-Submit: Michael Knyszek LUCI-TryBot-Result: Go LUCI --- src/cmd/trace/v2/goroutines.go | 435 ++++++++++++++++++++++++ src/cmd/trace/v2/main.go | 11 +- src/internal/trace/goroutinesv2.go | 70 ++-- src/internal/trace/goroutinesv2_test.go | 18 +- src/internal/trace/traceviewer/http.go | 29 +- 5 files changed, 532 insertions(+), 31 deletions(-) create mode 100644 src/cmd/trace/v2/goroutines.go diff --git a/src/cmd/trace/v2/goroutines.go b/src/cmd/trace/v2/goroutines.go new file mode 100644 index 00000000000..8fbfc9c22d5 --- /dev/null +++ b/src/cmd/trace/v2/goroutines.go @@ -0,0 +1,435 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Goroutine-related profiles. + +package trace + +import ( + "cmp" + "fmt" + "html/template" + "internal/trace" + "internal/trace/traceviewer" + tracev2 "internal/trace/v2" + "log" + "net/http" + "slices" + "sort" + "strconv" + "strings" + "time" +) + +// GoroutinesHandlerFunc returns a HandlerFunc that serves list of goroutine groups. +func GoroutinesHandlerFunc(summaries map[tracev2.GoID]*trace.GoroutineSummary) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + // goroutineGroup describes a group of goroutines grouped by start PC. + type goroutineGroup struct { + ID uint64 // Unique identifier (PC). + Name string // Start function. + N int // Total number of goroutines in this group. + ExecTime time.Duration // Total execution time of all goroutines in this group. + } + // Accumulate groups by PC. + groupsByPC := make(map[uint64]goroutineGroup) + for _, summary := range summaries { + group := groupsByPC[summary.PC] + group.ID = summary.PC + group.Name = summary.Name + group.N++ + group.ExecTime += summary.ExecTime + groupsByPC[summary.PC] = group + } + var groups []goroutineGroup + for pc, group := range groupsByPC { + group.ID = pc + // If goroutine didn't run during the trace (no sampled PC), + // the v.ID and v.Name will be zero value. + if group.ID == 0 && group.Name == "" { + group.Name = "(Inactive, no stack trace sampled)" + } + groups = append(groups, group) + } + slices.SortFunc(groups, func(a, b goroutineGroup) int { + return cmp.Compare(b.ExecTime, a.ExecTime) + }) + w.Header().Set("Content-Type", "text/html;charset=utf-8") + if err := templGoroutines.Execute(w, groups); err != nil { + log.Printf("failed to execute template: %v", err) + return + } + } +} + +var templGoroutines = template.Must(template.New("").Parse(` + + + +

Goroutines

+Below is a table of all goroutines in the trace grouped by start location and sorted by the total execution time of the group.
+
+Click a start location to view more details about that group.
+
+ + + + + + +{{range $}} + + + + + +{{end}} +
Start locationCountTotal execution time
{{.Name}}{{.N}}{{.ExecTime}}
+ + +`)) + +// GoroutineHandler creates a handler that serves information about +// goroutines in a particular group. +func GoroutineHandler(summaries map[tracev2.GoID]*trace.GoroutineSummary) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + pc, err := strconv.ParseUint(r.FormValue("id"), 10, 64) + if err != nil { + http.Error(w, fmt.Sprintf("failed to parse id parameter '%v': %v", r.FormValue("id"), err), http.StatusInternalServerError) + return + } + + type goroutine struct { + *trace.GoroutineSummary + NonOverlappingStats map[string]time.Duration + HasRangeTime bool + } + + // Collect all the goroutines in the group. + var ( + goroutines []goroutine + name string + totalExecTime, execTime time.Duration + maxTotalTime time.Duration + ) + validNonOverlappingStats := make(map[string]struct{}) + validRangeStats := make(map[string]struct{}) + for _, summary := range summaries { + totalExecTime += summary.ExecTime + + if summary.PC != pc { + continue + } + nonOverlappingStats := summary.NonOverlappingStats() + for name := range nonOverlappingStats { + validNonOverlappingStats[name] = struct{}{} + } + var totalRangeTime time.Duration + for name, dt := range summary.RangeTime { + validRangeStats[name] = struct{}{} + totalRangeTime += dt + } + goroutines = append(goroutines, goroutine{ + GoroutineSummary: summary, + NonOverlappingStats: nonOverlappingStats, + HasRangeTime: totalRangeTime != 0, + }) + name = summary.Name + execTime += summary.ExecTime + if maxTotalTime < summary.TotalTime { + maxTotalTime = summary.TotalTime + } + } + + // Compute the percent of total execution time these goroutines represent. + execTimePercent := "" + if totalExecTime > 0 { + execTimePercent = fmt.Sprintf("%.2f%%", float64(execTime)/float64(totalExecTime)*100) + } + + // Sort. + sortBy := r.FormValue("sortby") + if _, ok := validNonOverlappingStats[sortBy]; ok { + slices.SortFunc(goroutines, func(a, b goroutine) int { + return cmp.Compare(b.NonOverlappingStats[sortBy], a.NonOverlappingStats[sortBy]) + }) + } else { + // Sort by total time by default. + slices.SortFunc(goroutines, func(a, b goroutine) int { + return cmp.Compare(b.TotalTime, a.TotalTime) + }) + } + + // Write down all the non-overlapping stats and sort them. + allNonOverlappingStats := make([]string, 0, len(validNonOverlappingStats)) + for name := range validNonOverlappingStats { + allNonOverlappingStats = append(allNonOverlappingStats, name) + } + slices.SortFunc(allNonOverlappingStats, func(a, b string) int { + if a == b { + return 0 + } + if a == "Execution time" { + return -1 + } + if b == "Execution time" { + return 1 + } + return cmp.Compare(a, b) + }) + + // Write down all the range stats and sort them. + allRangeStats := make([]string, 0, len(validRangeStats)) + for name := range validRangeStats { + allRangeStats = append(allRangeStats, name) + } + sort.Strings(allRangeStats) + + err = templGoroutine.Execute(w, struct { + Name string + PC uint64 + N int + ExecTimePercent string + MaxTotal time.Duration + Goroutines []goroutine + NonOverlappingStats []string + RangeStats []string + }{ + Name: name, + PC: pc, + N: len(goroutines), + ExecTimePercent: execTimePercent, + MaxTotal: maxTotalTime, + Goroutines: goroutines, + NonOverlappingStats: allNonOverlappingStats, + RangeStats: allRangeStats, + }) + if err != nil { + http.Error(w, fmt.Sprintf("failed to execute template: %v", err), http.StatusInternalServerError) + return + } + } +} + +func stat2Color(statName string) string { + color := "#636363" + if strings.HasPrefix(statName, "Block time") { + color = "#d01c8b" + } + switch statName { + case "Sched wait time": + color = "#2c7bb6" + case "Syscall execution time": + color = "#7b3294" + case "Execution time": + color = "#d7191c" + } + return color +} + +var templGoroutine = template.Must(template.New("").Funcs(template.FuncMap{ + "percent": func(dividend, divisor time.Duration) template.HTML { + if divisor == 0 { + return "" + } + return template.HTML(fmt.Sprintf("(%.1f%%)", float64(dividend)/float64(divisor)*100)) + }, + "headerStyle": func(statName string) template.HTMLAttr { + return template.HTMLAttr(fmt.Sprintf("style=\"background-color: %s;\"", stat2Color(statName))) + }, + "barStyle": func(statName string, dividend, divisor time.Duration) template.HTMLAttr { + width := "0" + if divisor != 0 { + width = fmt.Sprintf("%.2f%%", float64(dividend)/float64(divisor)*100) + } + return template.HTMLAttr(fmt.Sprintf("style=\"width: %s; background-color: %s;\"", width, stat2Color(statName))) + }, +}).Parse(` + +Goroutine {{.Name}} + + + + +

Goroutine breakdown

+ +Table of contents + + +

Summary

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Goroutine start location:{{.Name}}
Count:{{.N}}
Execution Time:{{.ExecTimePercent}} of total program execution time
Network wait profile: graph (download)
Sync block profile: graph (download)
Syscall block profile: graph (download)
Scheduler wait profile: graph (download)
+ +

Breakdown

+ +The table below breaks down where each goroutine is spent its time during the +traced period. +All of the columns except total time are non-overlapping. +
+
+ + + + + + +{{range $.NonOverlappingStats}} + +{{end}} + +{{range .Goroutines}} + + + + + {{$Goroutine := .}} + {{range $.NonOverlappingStats}} + {{$Time := index $Goroutine.NonOverlappingStats .}} + + {{end}} + +{{end}} +
Goroutine Total {{.}}
{{.ID}} {{ .TotalTime.String }} +
+ {{$Goroutine := .}} + {{range $.NonOverlappingStats}} + {{$Time := index $Goroutine.NonOverlappingStats .}} + {{if $Time}} +   + {{end}} + {{end}} +
+
{{$Time.String}}
+ +

Special ranges

+ +The table below describes how much of the traced period each goroutine spent in +certain special time ranges. +If a goroutine has spent no time in any special time ranges, it is excluded from +the table. +For example, how much time it spent helping the GC. Note that these times do +overlap with the times from the first table. +In general the goroutine may not be executing in these special time ranges. +For example, it may have blocked while trying to help the GC. +This must be taken into account when interpreting the data. +
+
+ + + + + +{{range $.RangeStats}} + +{{end}} + +{{range .Goroutines}} + {{if .HasRangeTime}} + + + + {{$Goroutine := .}} + {{range $.RangeStats}} + {{$Time := index $Goroutine.RangeTime .}} + + {{end}} + + {{end}} +{{end}} +
Goroutine Total {{.}}
{{.ID}} {{ .TotalTime.String }} {{$Time.String}}
+`)) diff --git a/src/cmd/trace/v2/main.go b/src/cmd/trace/v2/main.go index 93c9d89c20f..b355d81ce60 100644 --- a/src/cmd/trace/v2/main.go +++ b/src/cmd/trace/v2/main.go @@ -6,6 +6,7 @@ package trace import ( "fmt" + "internal/trace" "internal/trace/traceviewer" tracev2 "internal/trace/v2" "io" @@ -39,9 +40,9 @@ func Main(traceFile, httpAddr, pprof string, debug int) error { if err != nil { return fmt.Errorf("failed to create server socket: %w", err) } - addr := "http://" + ln.Addr().String() - log.Print("Parsing trace...") + + log.Print("Preparing trace for viewer...") parsed, err := parseTrace(tracef) if err != nil { return err @@ -50,11 +51,13 @@ func Main(traceFile, httpAddr, pprof string, debug int) error { // We might double-close, but that's fine; we ignore the error. tracef.Close() - log.Print("Splitting trace...") + log.Print("Splitting trace for viewer...") ranges, err := splitTrace(parsed) if err != nil { return err } + log.Printf("Analyzing goroutines...") + gSummaries := trace.SummarizeGoroutines(parsed.events) log.Printf("Opening browser. Trace viewer is listening on %s", addr) browser.Open(addr) @@ -64,6 +67,8 @@ func Main(traceFile, httpAddr, pprof string, debug int) error { mux.Handle("/trace", traceviewer.TraceHandler()) mux.Handle("/jsontrace", JSONTraceHandler(parsed)) mux.Handle("/static/", traceviewer.StaticHandler()) + mux.HandleFunc("/goroutines", GoroutinesHandlerFunc(gSummaries)) + mux.HandleFunc("/goroutine", GoroutineHandler(gSummaries)) err = http.Serve(ln, mux) return fmt.Errorf("failed to start http server: %w", err) diff --git a/src/internal/trace/goroutinesv2.go b/src/internal/trace/goroutinesv2.go index 7dbe0c59051..7ed5771cd33 100644 --- a/src/internal/trace/goroutinesv2.go +++ b/src/internal/trace/goroutinesv2.go @@ -58,13 +58,51 @@ type UserRegionSummary struct { // GoroutineExecStats contains statistics about a goroutine's execution // during a period of time. type GoroutineExecStats struct { + // These stats are all non-overlapping. ExecTime time.Duration SchedWaitTime time.Duration BlockTimeByReason map[string]time.Duration SyscallTime time.Duration SyscallBlockTime time.Duration - RangeTime map[string]time.Duration - TotalTime time.Duration + + // TotalTime is the duration of the goroutine's presence in the trace. + // Necessarily overlaps with other stats. + TotalTime time.Duration + + // Total time the goroutine spent in certain ranges; may overlap + // with other stats. + RangeTime map[string]time.Duration +} + +func (s GoroutineExecStats) NonOverlappingStats() map[string]time.Duration { + stats := map[string]time.Duration{ + "Execution time": s.ExecTime, + "Sched wait time": s.SchedWaitTime, + "Syscall execution time": s.SyscallTime, + "Block time (syscall)": s.SyscallBlockTime, + "Unknown time": s.UnknownTime(), + } + for reason, dt := range s.BlockTimeByReason { + stats["Block time ("+reason+")"] += dt + } + // N.B. Don't include RangeTime or TotalTime; they overlap with these other + // stats. + return stats +} + +// UnknownTime returns whatever isn't accounted for in TotalTime. +func (s GoroutineExecStats) UnknownTime() time.Duration { + sum := s.ExecTime + s.SchedWaitTime + s.SyscallTime + + s.SyscallBlockTime + for _, dt := range s.BlockTimeByReason { + sum += dt + } + // N.B. Don't include range time. Ranges overlap with + // other stats, whereas these stats are non-overlapping. + if sum < s.TotalTime { + return s.TotalTime - sum + } + return 0 } // sub returns the stats v-s. @@ -172,7 +210,7 @@ type goroutineSummary struct { } // SummarizeGoroutines generates statistics for all goroutines in the trace. -func SummarizeGoroutines(trace io.Reader) (map[tracev2.GoID]*GoroutineSummary, error) { +func SummarizeGoroutines(events []tracev2.Event) map[tracev2.GoID]*GoroutineSummary { // Create the analysis state. b := goroutineStatsBuilder{ gs: make(map[tracev2.GoID]*GoroutineSummary), @@ -182,21 +220,11 @@ func SummarizeGoroutines(trace io.Reader) (map[tracev2.GoID]*GoroutineSummary, e } // Process the trace. - r, err := tracev2.NewReader(trace) - if err != nil { - return nil, err - } - for { - ev, err := r.ReadEvent() - if err == io.EOF { - break - } - if err != nil { - return nil, err - } + for i := range events { + ev := &events[i] b.event(ev) } - return b.finalize(), nil + return b.finalize() } // goroutineStatsBuilder constructs per-goroutine time statistics for v2 traces. @@ -225,7 +253,7 @@ type rangeP struct { } // event feeds a single event into the stats builder. -func (b *goroutineStatsBuilder) event(ev tracev2.Event) { +func (b *goroutineStatsBuilder) event(ev *tracev2.Event) { if b.syncTs == 0 { b.syncTs = ev.Time() } @@ -280,7 +308,7 @@ func (b *goroutineStatsBuilder) event(ev tracev2.Event) { regions := creatorG.activeRegions s := regions[len(regions)-1] if s.TaskID != tracev2.NoTask { - g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: &ev}} + g.activeRegions = []*UserRegionSummary{{TaskID: s.TaskID, Start: ev}} } } b.gs[g.ID] = g @@ -358,7 +386,7 @@ func (b *goroutineStatsBuilder) event(ev tracev2.Event) { // "Forever" is like goroutine death. fallthrough case tracev2.GoNotExist: - g.finalize(ev.Time(), &ev) + g.finalize(ev.Time(), ev) case tracev2.GoSyscall: b.syscallingP[ev.Proc()] = id b.syscallingG[id] = ev.Proc() @@ -445,7 +473,7 @@ func (b *goroutineStatsBuilder) event(ev tracev2.Event) { g.activeRegions = append(g.activeRegions, &UserRegionSummary{ Name: r.Type, TaskID: r.Task, - Start: &ev, + Start: ev, GoroutineExecStats: g.snapshotStat(ev.Time()), }) case tracev2.EventRegionEnd: @@ -463,7 +491,7 @@ func (b *goroutineStatsBuilder) event(ev tracev2.Event) { sd = &UserRegionSummary{Name: r.Type, TaskID: r.Task} } sd.GoroutineExecStats = g.snapshotStat(ev.Time()).sub(sd.GoroutineExecStats) - sd.End = &ev + sd.End = ev g.Regions = append(g.Regions, sd) } } diff --git a/src/internal/trace/goroutinesv2_test.go b/src/internal/trace/goroutinesv2_test.go index 99ec8dd8b09..ecd7f2631d7 100644 --- a/src/internal/trace/goroutinesv2_test.go +++ b/src/internal/trace/goroutinesv2_test.go @@ -7,6 +7,7 @@ package trace import ( tracev2 "internal/trace/v2" "internal/trace/v2/testtrace" + "io" "testing" ) @@ -95,11 +96,22 @@ func summarizeTraceTest(t *testing.T, testPath string) map[tracev2.GoID]*Gorouti if err != nil { t.Fatalf("malformed test %s: bad trace file: %v", testPath, err) } - summaries, err := SummarizeGoroutines(r) + var events []tracev2.Event + tr, err := tracev2.NewReader(r) if err != nil { - t.Fatalf("failed to process trace %s: %v", testPath, err) + t.Fatalf("failed to create trace reader %s: %v", testPath, err) } - return summaries + for { + ev, err := tr.ReadEvent() + if err == io.EOF { + break + } + if err != nil { + t.Fatalf("failed to process trace %s: %v", testPath, err) + } + events = append(events, ev) + } + return SummarizeGoroutines(events) } func checkRegionEvents(t *testing.T, wantStart, wantEnd tracev2.EventKind, goid tracev2.GoID, region *UserRegionSummary) { diff --git a/src/internal/trace/traceviewer/http.go b/src/internal/trace/traceviewer/http.go index 0c58535a72e..ac2803be2d2 100644 --- a/src/internal/trace/traceviewer/http.go +++ b/src/internal/trace/traceviewer/http.go @@ -21,9 +21,7 @@ func MainHandler(ranges []Range) http.Handler { }) } -var templMain = template.Must(template.New("").Parse(` - - +code, +pre, +textarea.code { + font-family: SFMono-Regular, Consolas, 'Liberation Mono', Menlo, monospace; + font-size: 0.875rem; + line-height: 1.5em; +} + +pre, +textarea.code { + background-color: var(--color-background-accented); + border: var(--border); + border-radius: var(--border-radius); + color: var(--color-text); + overflow-x: auto; + padding: 0.625rem; + tab-size: 4; + white-space: pre; +} +` + +var templMain = template.Must(template.New("").Parse(` + +

cmd/trace: the Go trace event viewer