// Copyright 2014 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build build_coordinator // The coordinator runs on GCE and coordinates builds in Docker containers. package main // import "golang.org/x/tools/dashboard/coordinator" import ( "archive/tar" "bytes" "compress/gzip" "crypto/hmac" "crypto/md5" "crypto/rand" "encoding/json" "errors" "flag" "fmt" "html" "io" "io/ioutil" "log" "net/http" "net/url" "os" "os/exec" "path" "regexp" "sort" "strconv" "strings" "sync" "time" "golang.org/x/oauth2" "golang.org/x/oauth2/google" "golang.org/x/tools/dashboard/types" "google.golang.org/api/compute/v1" "google.golang.org/cloud/compute/metadata" ) var ( masterKeyFile = flag.String("masterkey", "", "Path to builder master key. Else fetched using GCE project attribute 'builder-master-key'.") maxLocalBuilds = flag.Int("maxbuilds", 6, "Max concurrent Docker builds (VM builds don't count)") cleanZones = flag.String("zones", "us-central1-a,us-central1-b,us-central1-f", "Comma-separated list of zones to periodically clean of stale build VMs (ones that failed to shut themselves down)") // Debug flags: addTemp = flag.Bool("temp", false, "Append -temp to all builders.") just = flag.String("just", "", "If non-empty, run single build in the foreground. Requires rev.") rev = flag.String("rev", "", "Revision to build.") ) var ( startTime = time.Now() builders = map[string]buildConfig{} // populated at startup, keys like "openbsd-amd64-56" watchers = map[string]watchConfig{} // populated at startup, keyed by repo, e.g. "https://go.googlesource.com/go" donec = make(chan builderRev) // reports of finished builders statusMu sync.Mutex // guards both status (ongoing ones) and statusDone (just finished) status = map[builderRev]*buildStatus{} statusDone []*buildStatus // finished recently, capped to maxStatusDone ) const ( maxStatusDone = 30 // vmDeleteTimeout is how long before we delete a VM. // In practice this need only be as long as the slowest // builder (plan9 currently), because on startup this program // already deletes all buildlets it doesn't know about // (i.e. ones from a previous instance of the coordinator). vmDeleteTimeout = 45 * time.Minute ) // Initialized by initGCE: var ( projectID string projectZone string computeService *compute.Service externalIP string ) func initGCE() error { if !metadata.OnGCE() { return errors.New("not running on GCE; VM support disabled") } var err error projectID, err = metadata.ProjectID() if err != nil { return fmt.Errorf("failed to get current GCE ProjectID: %v", err) } projectZone, err = metadata.Get("instance/zone") if err != nil || projectZone == "" { return fmt.Errorf("failed to get current GCE zone: %v", err) } // Convert the zone from "projects/1234/zones/us-central1-a" to "us-central1-a". projectZone = path.Base(projectZone) if !hasComputeScope() { return errors.New("The coordinator is not running with access to read and write Compute resources. VM support disabled.") } externalIP, err = metadata.ExternalIP() if err != nil { return fmt.Errorf("ExternalIP: %v", err) } ts := google.ComputeTokenSource("default") computeService, _ = compute.New(oauth2.NewClient(oauth2.NoContext, ts)) return nil } type imageInfo struct { url string // of tar file mu sync.Mutex lastMod string } var images = map[string]*imageInfo{ "go-commit-watcher": {url: "https://storage.googleapis.com/go-builder-data/docker-commit-watcher.tar.gz"}, "gobuilders/linux-x86-base": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.base.tar.gz"}, "gobuilders/linux-x86-clang": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.clang.tar.gz"}, "gobuilders/linux-x86-gccgo": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.gccgo.tar.gz"}, "gobuilders/linux-x86-nacl": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.nacl.tar.gz"}, "gobuilders/linux-x86-sid": {url: "https://storage.googleapis.com/go-builder-data/docker-linux.sid.tar.gz"}, } // A buildConfig describes how to run either a Docker-based or VM-based build. type buildConfig struct { name string // "linux-amd64-race" // VM-specific settings: (used if vmImage != "") vmImage string // e.g. "openbsd-amd64-56" machineType string // optional GCE instance type // Docker-specific settings: (used if vmImage == "") image string // Docker image to use to build cmd string // optional -cmd flag (relative to go/src/) env []string // extra environment ("key=value") pairs dashURL string // url of the build dashboard tool string // the tool this configuration is for } func (c *buildConfig) usesDocker() bool { return c.vmImage == "" } func (c *buildConfig) usesVM() bool { return c.vmImage != "" } func (c *buildConfig) MachineType() string { if v := c.machineType; v != "" { return v } return "n1-highcpu-4" } // recordResult sends build results to the dashboard func (b *buildConfig) recordResult(ok bool, hash, buildLog string, runTime time.Duration) error { req := map[string]interface{}{ "Builder": b.name, "PackagePath": "", "Hash": hash, "GoHash": "", "OK": ok, "Log": buildLog, "RunTime": runTime, } args := url.Values{"key": {builderKey(b.name)}, "builder": {b.name}} return dash("POST", "result", args, req, nil) } // pingDashboard is a goroutine that periodically POSTS to build.golang.org/building // to let it know that we're still working on a build. func pingDashboard(st *buildStatus) { u := "https://build.golang.org/building?" + url.Values{ "builder": []string{st.name}, "key": []string{builderKey(st.name)}, "hash": []string{st.rev}, "url": []string{fmt.Sprintf("http://%v/logs?name=%s&rev=%s&st=%p", externalIP, st.name, st.rev, st)}, }.Encode() for { st.mu.Lock() done := st.done st.mu.Unlock() if !done.IsZero() { return } if res, _ := http.PostForm(u, nil); res != nil { res.Body.Close() } time.Sleep(60 * time.Second) } } type watchConfig struct { repo string // "https://go.googlesource.com/go" dash string // "https://build.golang.org/" (must end in /) interval time.Duration // Polling interval } func main() { flag.Parse() if err := initGCE(); err != nil { log.Printf("VM support disabled due to error initializing GCE: %v", err) } addBuilder(buildConfig{name: "linux-386"}) addBuilder(buildConfig{name: "linux-386-387", env: []string{"GO386=387"}}) addBuilder(buildConfig{name: "linux-amd64"}) addBuilder(buildConfig{name: "linux-amd64-nocgo", env: []string{"CGO_ENABLED=0", "USER=root"}}) addBuilder(buildConfig{name: "linux-amd64-noopt", env: []string{"GO_GCFLAGS=-N -l"}}) addBuilder(buildConfig{name: "linux-amd64-race"}) addBuilder(buildConfig{name: "nacl-386"}) addBuilder(buildConfig{name: "nacl-amd64p32"}) addBuilder(buildConfig{ name: "linux-amd64-gccgo", image: "gobuilders/linux-x86-gccgo", cmd: "make RUNTESTFLAGS=\"--target_board=unix/-m64\" check-go -j16", dashURL: "https://build.golang.org/gccgo", tool: "gccgo", }) addBuilder(buildConfig{ name: "linux-386-gccgo", image: "gobuilders/linux-x86-gccgo", cmd: "make RUNTESTFLAGS=\"--target_board=unix/-m32\" check-go -j16", dashURL: "https://build.golang.org/gccgo", tool: "gccgo", }) addBuilder(buildConfig{name: "linux-386-sid", image: "gobuilders/linux-x86-sid"}) addBuilder(buildConfig{name: "linux-amd64-sid", image: "gobuilders/linux-x86-sid"}) addBuilder(buildConfig{name: "linux-386-clang", image: "gobuilders/linux-x86-clang"}) addBuilder(buildConfig{name: "linux-amd64-clang", image: "gobuilders/linux-x86-clang"}) // VMs: addBuilder(buildConfig{ name: "openbsd-amd64-gce56", vmImage: "openbsd-amd64-56", machineType: "n1-highcpu-2", }) addBuilder(buildConfig{ // It's named "partial" because the buildlet sets // GOTESTONLY=std to stop after the "go test std" // tests because it's so slow otherwise. // TODO(braditz): move that env variable to the // coordinator and into this config. name: "plan9-386-gcepartial", vmImage: "plan9-386", // We *were* using n1-standard-1 because Plan 9 can only // reliably use a single CPU. Using 2 or 4 and we see // test failures. See: // https://golang.org/issue/8393 // https://golang.org/issue/9491 // n1-standard-1 has 3.6 GB of memory which is // overkill (userspace probably only sees 2GB anyway), // but it's the cheapest option. And plenty to keep // our ~250 MB of inputs+outputs in its ramfs. // // But the docs says "For the n1 series of machine // types, a virtual CPU is implemented as a single // hyperthread on a 2.6GHz Intel Sandy Bridge Xeon or // Intel Ivy Bridge Xeon (or newer) processor. This // means that the n1-standard-2 machine type will see // a whole physical core." // // ... so we use n1-highcpu-2 (1.80 RAM, still // plenty), just so we can get 1 whole core for the // single-core Plan 9. It will see 2 virtual cores and // only use 1, but we hope that 1 will be more powerful // and we'll stop timing out on tests. machineType: "n1-highcpu-2", }) addWatcher(watchConfig{repo: "https://go.googlesource.com/go", dash: "https://build.golang.org/"}) // TODO(adg,cmang): fix gccgo watcher // addWatcher(watchConfig{repo: "https://code.google.com/p/gofrontend", dash: "https://build.golang.org/gccgo/"}) if (*just != "") != (*rev != "") { log.Fatalf("--just and --rev must be used together") } if *just != "" { conf, ok := builders[*just] if !ok { log.Fatalf("unknown builder %q", *just) } cmd := exec.Command("docker", append([]string{"run"}, conf.dockerRunArgs(*rev)...)...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { log.Fatalf("Build failed: %v", err) } return } http.HandleFunc("/", handleStatus) http.HandleFunc("/logs", handleLogs) go http.ListenAndServe(":80", nil) go cleanUpOldContainers() go cleanUpOldVMs() stopWatchers() // clean up before we start new ones for _, watcher := range watchers { if err := startWatching(watchers[watcher.repo]); err != nil { log.Printf("Error starting watcher for %s: %v", watcher.repo, err) } } workc := make(chan builderRev) go findWorkLoop(workc) // TODO(cmang): gccgo will need its own findWorkLoop ticker := time.NewTicker(1 * time.Minute) for { select { case work := <-workc: log.Printf("workc received %+v; len(status) = %v, maxLocalBuilds = %v; cur = %p", work, len(status), *maxLocalBuilds, status[work]) if mayBuildRev(work) { conf := builders[work.name] if st, err := startBuilding(conf, work.rev); err == nil { setStatus(work, st) go pingDashboard(st) } else { log.Printf("Error starting to build %v: %v", work, err) } } case done := <-donec: log.Printf("%v done", done) markDone(done) case <-ticker.C: if numCurrentBuilds() == 0 && time.Now().After(startTime.Add(10*time.Minute)) { // TODO: halt the whole machine to kill the VM or something } } } } func numCurrentBuilds() int { statusMu.Lock() defer statusMu.Unlock() return len(status) } func isBuilding(work builderRev) bool { statusMu.Lock() defer statusMu.Unlock() _, building := status[work] return building } // mayBuildRev reports whether the build type & revision should be started. // It returns true if it's not already building, and there is capacity. func mayBuildRev(work builderRev) bool { conf := builders[work.name] statusMu.Lock() _, building := status[work] statusMu.Unlock() if building { return false } if conf.usesVM() { // These don't count towards *maxLocalBuilds. return true } numDocker, err := numDockerBuilds() if err != nil { log.Printf("not starting %v due to docker ps failure: %v", work, err) return false } return numDocker < *maxLocalBuilds } func setStatus(work builderRev, st *buildStatus) { statusMu.Lock() defer statusMu.Unlock() status[work] = st } func markDone(work builderRev) { statusMu.Lock() defer statusMu.Unlock() st, ok := status[work] if !ok { return } delete(status, work) if len(statusDone) == maxStatusDone { copy(statusDone, statusDone[1:]) statusDone = statusDone[:len(statusDone)-1] } statusDone = append(statusDone, st) } func vmIsBuilding(instName string) bool { if instName == "" { log.Printf("bogus empty instance name passed to vmIsBuilding") return false } statusMu.Lock() defer statusMu.Unlock() for _, st := range status { if st.instName == instName { return true } } return false } // statusPtrStr disambiguates which status to return if there are // multiple in the history (e.g. recent failures where the build // didn't finish for reasons outside of all.bash failing) func getStatus(work builderRev, statusPtrStr string) *buildStatus { statusMu.Lock() defer statusMu.Unlock() match := func(st *buildStatus) bool { return statusPtrStr == "" || fmt.Sprintf("%p", st) == statusPtrStr } if st, ok := status[work]; ok && match(st) { return st } for _, st := range statusDone { if st.builderRev == work && match(st) { return st } } return nil } type byAge []*buildStatus func (s byAge) Len() int { return len(s) } func (s byAge) Less(i, j int) bool { return s[i].start.Before(s[j].start) } func (s byAge) Swap(i, j int) { s[i], s[j] = s[j], s[i] } func handleStatus(w http.ResponseWriter, r *http.Request) { var active []*buildStatus var recent []*buildStatus statusMu.Lock() for _, st := range status { active = append(active, st) } recent = append(recent, statusDone...) numTotal := len(status) numDocker, err := numDockerBuilds() statusMu.Unlock() sort.Sort(byAge(active)) sort.Sort(sort.Reverse(byAge(recent))) io.WriteString(w, "
%d total builds active (Docker: %d/%d; VMs: %d/∞):", numTotal, numDocker, *maxLocalBuilds, numTotal-numDocker) io.WriteString(w, "
") for _, st := range active { io.WriteString(w, st.htmlStatusLine()) } io.WriteString(w, "") io.WriteString(w, "
") for _, st := range recent { io.WriteString(w, st.htmlStatusLine()) } io.WriteString(w, "") fmt.Fprintf(w, "
%s", html.EscapeString(diskFree())) } func diskFree() string { out, _ := exec.Command("df", "-h").Output() return string(out) } func handleLogs(w http.ResponseWriter, r *http.Request) { st := getStatus(builderRev{r.FormValue("name"), r.FormValue("rev")}, r.FormValue("st")) if st == nil { http.NotFound(w, r) return } w.Header().Set("Content-Type", "text/plain; charset=utf-8") io.WriteString(w, st.logs()) // TODO: if st is still building, stream them to the user with // http.Flusher.Flush and CloseNotifier and registering interest // of new writes with the buildStatus. Will require moving the // BUILDERKEY scrubbing into the Write method. } // findWorkLoop polls http://build.golang.org/?mode=json looking for new work // for the main dashboard. It does not support gccgo. // TODO(bradfitz): it also currently does not support subrepos. func findWorkLoop(work chan<- builderRev) { ticker := time.NewTicker(15 * time.Second) for { if err := findWork(work); err != nil { log.Printf("failed to find new work: %v", err) } <-ticker.C } } func findWork(work chan<- builderRev) error { var bs types.BuildStatus res, err := http.Get("https://build.golang.org/?mode=json") if err != nil { return err } defer res.Body.Close() if err := json.NewDecoder(res.Body).Decode(&bs); err != nil { return err } if res.StatusCode != 200 { return fmt.Errorf("unexpected http status %v", res.Status) } knownToDashboard := map[string]bool{} // keys are builder for _, b := range bs.Builders { knownToDashboard[b] = true } var goRevisions []string for _, br := range bs.Revisions { if br.Repo == "go" { goRevisions = append(goRevisions, br.Revision) } else { // TODO(bradfitz): support these: golang.org/issue/9506 continue } if len(br.Results) != len(bs.Builders) { return errors.New("bogus JSON response from dashboard: results is too long.") } for i, res := range br.Results { if res != "" { // It's either "ok" or a failure URL. continue } builder := bs.Builders[i] if _, ok := builders[builder]; !ok { // Not managed by the coordinator. continue } br := builderRev{bs.Builders[i], br.Revision} if !isBuilding(br) { work <- br } } } // And to bootstrap new builders, see if we have any builders // that the dashboard doesn't know about. for b := range builders { if knownToDashboard[b] { continue } for _, rev := range goRevisions { br := builderRev{b, rev} if !isBuilding(br) { work <- br } } } return nil } // builderRev is a build configuration type and a revision. type builderRev struct { name string // e.g. "linux-amd64-race" rev string // lowercase hex git hash } // returns the part after "docker run" func (conf buildConfig) dockerRunArgs(rev string) (args []string) { if key := builderKey(conf.name); key != "" { tmpKey := "/tmp/" + conf.name + ".buildkey" if _, err := os.Stat(tmpKey); err != nil { if err := ioutil.WriteFile(tmpKey, []byte(key), 0600); err != nil { log.Fatal(err) } } // Images may look for .gobuildkey in / or /root, so provide both. // TODO(adg): fix images that look in the wrong place. args = append(args, "-v", tmpKey+":/.gobuildkey") args = append(args, "-v", tmpKey+":/root/.gobuildkey") } for _, pair := range conf.env { args = append(args, "-e", pair) } args = append(args, conf.image, "/usr/local/bin/builder", "-rev="+rev, "-dashboard="+conf.dashURL, "-tool="+conf.tool, "-buildroot=/", "-v", ) if conf.cmd != "" { args = append(args, "-cmd", conf.cmd) } args = append(args, conf.name) return } func addBuilder(c buildConfig) { if c.tool == "gccgo" { // TODO(cmang,bradfitz,adg): fix gccgo return } if c.name == "" { panic("empty name") } if *addTemp { c.name += "-temp" } if _, dup := builders[c.name]; dup { panic("dup name") } if c.dashURL == "" { c.dashURL = "https://build.golang.org" } if c.tool == "" { c.tool = "go" } if strings.HasPrefix(c.name, "nacl-") { if c.image == "" { c.image = "gobuilders/linux-x86-nacl" } if c.cmd == "" { c.cmd = "/usr/local/bin/build-command.pl" } } if strings.HasPrefix(c.name, "linux-") && c.image == "" { c.image = "gobuilders/linux-x86-base" } if c.image == "" && c.vmImage == "" { panic("empty image and vmImage") } if c.image != "" && c.vmImage != "" { panic("can't specify both image and vmImage") } builders[c.name] = c } // returns the part after "docker run" func (conf watchConfig) dockerRunArgs() (args []string) { log.Printf("Running watcher with master key %q", masterKey()) if key := masterKey(); len(key) > 0 { tmpKey := "/tmp/watcher.buildkey" if _, err := os.Stat(tmpKey); err != nil { if err := ioutil.WriteFile(tmpKey, key, 0600); err != nil { log.Fatal(err) } } // Images may look for .gobuildkey in / or /root, so provide both. // TODO(adg): fix images that look in the wrong place. args = append(args, "-v", tmpKey+":/.gobuildkey") args = append(args, "-v", tmpKey+":/root/.gobuildkey") } args = append(args, "go-commit-watcher", "/usr/local/bin/watcher", "-repo="+conf.repo, "-dash="+conf.dash, "-poll="+conf.interval.String(), ) return } func addWatcher(c watchConfig) { if c.repo == "" { c.repo = "https://go.googlesource.com/go" } if c.dash == "" { c.dash = "https://build.golang.org/" } if c.interval == 0 { c.interval = 10 * time.Second } watchers[c.repo] = c } func condUpdateImage(img string) error { ii := images[img] if ii == nil { return fmt.Errorf("image %q doesn't exist", img) } ii.mu.Lock() defer ii.mu.Unlock() res, err := http.Head(ii.url) if err != nil { return fmt.Errorf("Error checking %s: %v", ii.url, err) } if res.StatusCode != 200 { return fmt.Errorf("Error checking %s: %v", ii.url, res.Status) } if res.Header.Get("Last-Modified") == ii.lastMod { return nil } res, err = http.Get(ii.url) if err != nil || res.StatusCode != 200 { return fmt.Errorf("Get after Head failed for %s: %v, %v", ii.url, err, res) } defer res.Body.Close() log.Printf("Running: docker load of %s\n", ii.url) cmd := exec.Command("docker", "load") cmd.Stdin = res.Body var out bytes.Buffer cmd.Stdout = &out cmd.Stderr = &out if cmd.Run(); err != nil { log.Printf("Failed to pull latest %s from %s and pipe into docker load: %v, %s", img, ii.url, err, out.Bytes()) return err } ii.lastMod = res.Header.Get("Last-Modified") return nil } // numDockerBuilds finds the number of go builder instances currently running. func numDockerBuilds() (n int, err error) { out, err := exec.Command("docker", "ps").Output() if err != nil { return 0, err } for _, line := range strings.Split(string(out), "\n") { if strings.Contains(line, "gobuilders/") { n++ } } return n, nil } func startBuilding(conf buildConfig, rev string) (*buildStatus, error) { if conf.usesVM() { return startBuildingInVM(conf, rev) } else { return startBuildingInDocker(conf, rev) } } func startBuildingInDocker(conf buildConfig, rev string) (*buildStatus, error) { if err := condUpdateImage(conf.image); err != nil { log.Printf("Failed to setup container for %v %v: %v", conf.name, rev, err) return nil, err } cmd := exec.Command("docker", append([]string{"run", "-d"}, conf.dockerRunArgs(rev)...)...) all, err := cmd.CombinedOutput() log.Printf("Docker run for %v %v = err:%v, output:%s", conf.name, rev, err, all) if err != nil { return nil, err } container := strings.TrimSpace(string(all)) brev := builderRev{ name: conf.name, rev: rev, } st := &buildStatus{ builderRev: brev, container: container, start: time.Now(), } log.Printf("%v now building in Docker container %v", brev, st.container) go func() { all, err := exec.Command("docker", "wait", container).CombinedOutput() output := strings.TrimSpace(string(all)) var ok bool if err == nil { exit, err := strconv.Atoi(output) ok = (err == nil && exit == 0) } st.setDone(ok) log.Printf("docker wait %s/%s: %v, %s", container, rev, err, output) donec <- builderRev{conf.name, rev} exec.Command("docker", "rm", container).Run() }() go func() { cmd := exec.Command("docker", "logs", "-f", container) cmd.Stdout = st cmd.Stderr = st if err := cmd.Run(); err != nil { // The docker logs subcommand always returns // success, even if the underlying process // fails. log.Printf("failed to follow docker logs of %s: %v", container, err) } }() return st, nil } var osArchRx = regexp.MustCompile(`^(\w+-\w+)`) func randHex(n int) string { buf := make([]byte, n/2) _, err := rand.Read(buf) if err != nil { panic("Failed to get randomness: " + err.Error()) } return fmt.Sprintf("%x", buf) } // startBuildingInVM starts a VM on GCE running the buildlet binary to build rev. func startBuildingInVM(conf buildConfig, rev string) (*buildStatus, error) { brev := builderRev{ name: conf.name, rev: rev, } st := &buildStatus{ builderRev: brev, start: time.Now(), } // name is the project-wide unique name of the GCE instance. It can't be longer // than 61 bytes, so we only use the first 8 bytes of the rev. name := "buildlet-" + conf.name + "-" + rev[:8] + "-rn" + randHex(6) // buildletURL is the URL of the buildlet binary which the VMs // are configured to download at boot and run. This lets us // update the buildlet more easily than rebuilding the whole // VM image. We put this URL in a well-known GCE metadata attribute. // The value will be of the form: // http://storage.googleapis.com/go-builder-data/buildlet.GOOS-GOARCH m := osArchRx.FindStringSubmatch(conf.name) if m == nil { return nil, fmt.Errorf("invalid builder name %q", conf.name) } buildletURL := "http://storage.googleapis.com/go-builder-data/buildlet." + m[1] prefix := "https://www.googleapis.com/compute/v1/projects/" + projectID machType := prefix + "/zones/" + projectZone + "/machineTypes/" + conf.MachineType() instance := &compute.Instance{ Name: name, Description: fmt.Sprintf("Go Builder building %s %s", conf.name, rev), MachineType: machType, Disks: []*compute.AttachedDisk{ { AutoDelete: true, Boot: true, Type: "PERSISTENT", InitializeParams: &compute.AttachedDiskInitializeParams{ DiskName: name, SourceImage: "https://www.googleapis.com/compute/v1/projects/" + projectID + "/global/images/" + conf.vmImage, DiskType: "https://www.googleapis.com/compute/v1/projects/" + projectID + "/zones/" + projectZone + "/diskTypes/pd-ssd", }, }, }, Tags: &compute.Tags{ // Warning: do NOT list "http-server" or "allow-ssh" (our // project's custom tag to allow ssh access) here; the // buildlet provides full remote code execution. Items: []string{}, }, Metadata: &compute.Metadata{ Items: []*compute.MetadataItems{ { Key: "buildlet-binary-url", Value: buildletURL, }, // In case the VM gets away from us (generally: if the // coordinator dies while a build is running), then we // set this attribute of when it should be killed so // we can kill it later when the coordinator is // restarted. The cleanUpOldVMs goroutine loop handles // that killing. { Key: "delete-at", Value: fmt.Sprint(time.Now().Add(vmDeleteTimeout).Unix()), }, }, }, NetworkInterfaces: []*compute.NetworkInterface{ &compute.NetworkInterface{ AccessConfigs: []*compute.AccessConfig{ &compute.AccessConfig{ Type: "ONE_TO_ONE_NAT", Name: "External NAT", }, }, Network: prefix + "/global/networks/default", }, }, } op, err := computeService.Instances.Insert(projectID, projectZone, instance).Do() if err != nil { return nil, fmt.Errorf("Failed to create instance: %v", err) } st.createOp = op.Name st.instName = name log.Printf("%v now building in VM %v", brev, st.instName) // Start the goroutine to monitor the VM now that it's booting. This might // take minutes for it to come up, and then even more time to do the build. go func() { err := watchVM(st) if st.hasEvent("instance_created") { deleteVM(projectZone, st.instName) } st.setDone(err == nil) if err != nil { fmt.Fprintf(st, "\n\nError: %v\n", err) } donec <- builderRev{conf.name, rev} }() return st, nil } // watchVM monitors a VM doing a build. func watchVM(st *buildStatus) (retErr error) { goodRes := func(res *http.Response, err error, what string) bool { if err != nil { retErr = fmt.Errorf("%s: %v", what, err) return false } if res.StatusCode/100 != 2 { slurp, _ := ioutil.ReadAll(io.LimitReader(res.Body, 4<<10)) retErr = fmt.Errorf("%s: %v; body: %s", what, res.Status, slurp) res.Body.Close() return false } return true } st.logEventTime("instance_create_requested") // Wait for instance create operation to succeed. OpLoop: for { time.Sleep(2 * time.Second) op, err := computeService.ZoneOperations.Get(projectID, projectZone, st.createOp).Do() if err != nil { return fmt.Errorf("Failed to get op %s: %v", st.createOp, err) } switch op.Status { case "PENDING", "RUNNING": continue case "DONE": if op.Error != nil { for _, operr := range op.Error.Errors { return fmt.Errorf("Error creating instance: %+v", operr) } return errors.New("Failed to start.") } break OpLoop default: log.Fatalf("Unknown status %q: %+v", op.Status, op) } } st.logEventTime("instance_created") inst, err := computeService.Instances.Get(projectID, projectZone, st.instName).Do() if err != nil { return fmt.Errorf("Error getting instance %s details after creation: %v", st.instName, err) } st.logEventTime("got_instance_info") // Find its internal IP. var ip string for _, iface := range inst.NetworkInterfaces { if strings.HasPrefix(iface.NetworkIP, "10.") { ip = iface.NetworkIP } } if ip == "" { return errors.New("didn't find its internal IP address") } // Wait for it to boot and its buildlet to come up on port 80. st.logEventTime("waiting_for_buildlet") buildletURL := "http://" + ip const numTries = 60 var alive bool for i := 1; i <= numTries; i++ { res, err := http.Get(buildletURL) if err != nil { time.Sleep(1 * time.Second) continue } res.Body.Close() if res.StatusCode != 200 { return fmt.Errorf("buildlet returned HTTP status code %d on try number %d", res.StatusCode, i) } st.logEventTime("buildlet_up") alive = true break } if !alive { return fmt.Errorf("buildlet didn't come up in %d seconds", numTries) } // Write the VERSION file. st.logEventTime("start_write_version_tar") verReq, err := http.NewRequest("PUT", buildletURL+"/writetgz", versionTgz(st.rev)) if err != nil { return err } verRes, err := http.DefaultClient.Do(verReq) if !goodRes(verRes, err, "writing VERSION tgz") { return } // Feed the buildlet a tar file for it to extract. // TODO: cache these. st.logEventTime("start_fetch_gerrit_tgz") tarRes, err := http.Get("https://go.googlesource.com/go/+archive/" + st.rev + ".tar.gz") if !goodRes(tarRes, err, "fetching tarball from Gerrit") { return } st.logEventTime("start_write_tar") putReq, err := http.NewRequest("PUT", buildletURL+"/writetgz", tarRes.Body) if err != nil { tarRes.Body.Close() return err } putRes, err := http.DefaultClient.Do(putReq) st.logEventTime("end_write_tar") tarRes.Body.Close() if !goodRes(putRes, err, "writing tarball to buildlet") { return } // Run the builder cmd := "all.bash" if strings.HasPrefix(st.name, "windows-") { cmd = "all.bat" } else if strings.HasPrefix(st.name, "plan9-") { cmd = "all.rc" } execStartTime := time.Now() st.logEventTime("start_exec") res, err := http.PostForm(buildletURL+"/exec", url.Values{"cmd": {"src/" + cmd}}) if !goodRes(res, err, "running "+cmd) { return } defer res.Body.Close() st.logEventTime("running_exec") // Stream the output: if _, err := io.Copy(st, res.Body); err != nil { return fmt.Errorf("error copying response: %v", err) } st.logEventTime("done") // Don't record to the dashboard unless we heard the trailer from // the buildlet, otherwise it was probably some unrelated error // (like the VM being killed, or the buildlet crashing due to // e.g. https://golang.org/issue/9309, since we require a tip // build of the buildlet to get Trailers support) state := res.Trailer.Get("Process-State") if state == "" { return errors.New("missing Process-State trailer from HTTP response; buildlet built with old (<= 1.4) Go?") } conf := builders[st.name] var log string if state != "ok" { log = st.logs() } if err := conf.recordResult(state == "ok", st.rev, log, time.Since(execStartTime)); err != nil { return fmt.Errorf("Status was %q but failed to report it to the dashboard: %v", state, err) } if state != "ok" { return fmt.Errorf("%s failed: %v", cmd, state) } return nil } type eventAndTime struct { evt string t time.Time } // buildStatus is the status of a build. type buildStatus struct { // Immutable: builderRev start time.Time container string // container ID for docker, else it's a VM // Immutable, used by VM only: createOp string // Instances.Insert operation name instName string mu sync.Mutex // guards following done time.Time // finished running succeeded bool // set when done output bytes.Buffer // stdout and stderr events []eventAndTime } func (st *buildStatus) setDone(succeeded bool) { st.mu.Lock() defer st.mu.Unlock() st.succeeded = succeeded st.done = time.Now() } func (st *buildStatus) logEventTime(event string) { st.mu.Lock() defer st.mu.Unlock() st.events = append(st.events, eventAndTime{event, time.Now()}) } func (st *buildStatus) hasEvent(event string) bool { st.mu.Lock() defer st.mu.Unlock() for _, e := range st.events { if e.evt == event { return true } } return false } // htmlStatusLine returns the HTML to show within the
block on // the main page's list of active builds. func (st *buildStatus) htmlStatusLine() string { st.mu.Lock() defer st.mu.Unlock() urlPrefix := "https://go-review.googlesource.com/#/q/" if strings.Contains(st.name, "gccgo") { urlPrefix = "https://code.google.com/p/gofrontend/source/detail?r=" } var buf bytes.Buffer fmt.Fprintf(&buf, "%s rev %s", st.name, urlPrefix, st.rev, st.rev) if st.done.IsZero() { buf.WriteString(", running") } else if st.succeeded { buf.WriteString(", succeeded") } else { buf.WriteString(", failed") } logsURL := fmt.Sprintf("/logs?name=%s&rev=%s&st=%p", st.name, st.rev, st) if st.container != "" { fmt.Fprintf(&buf, " in container %s", logsURL, st.container) } else { fmt.Fprintf(&buf, " in VM %s", logsURL, st.instName) } t := st.done if t.IsZero() { t = st.start } fmt.Fprintf(&buf, ", %v ago\n", time.Since(t)) for i, evt := range st.events { var elapsed string if i != 0 { elapsed = fmt.Sprintf("+%0.1fs", evt.t.Sub(st.events[i-1].t).Seconds()) } msg := evt.evt if msg == "running_exec" { msg = fmt.Sprintf("%s", logsURL, msg) } fmt.Fprintf(&buf, " %7s %v %s\n", elapsed, evt.t.Format(time.RFC3339), msg) } return buf.String() } func (st *buildStatus) logs() string { st.mu.Lock() logs := st.output.String() st.mu.Unlock() key := builderKey(st.name) return strings.Replace(string(logs), key, "BUILDERKEY", -1) } func (st *buildStatus) Write(p []byte) (n int, err error) { st.mu.Lock() defer st.mu.Unlock() const maxBufferSize = 2 << 20 // 2MB of output is way more than we expect. plen := len(p) if st.output.Len()+len(p) > maxBufferSize { p = p[:maxBufferSize-st.output.Len()] } st.output.Write(p) // bytes.Buffer can't fail return plen, nil } // Stop any previous go-commit-watcher Docker tasks, so they don't // pile up upon restarts of the coordinator. func stopWatchers() { out, err := exec.Command("docker", "ps").Output() if err != nil { return } for _, line := range strings.Split(string(out), "\n") { if !strings.Contains(line, "go-commit-watcher:") { continue } f := strings.Fields(line) exec.Command("docker", "rm", "-f", "-v", f[0]).Run() } } func startWatching(conf watchConfig) (err error) { defer func() { if err != nil { restartWatcherSoon(conf) } }() log.Printf("Starting watcher for %v", conf.repo) if err := condUpdateImage("go-commit-watcher"); err != nil { log.Printf("Failed to setup container for commit watcher: %v", err) return err } cmd := exec.Command("docker", append([]string{"run", "-d"}, conf.dockerRunArgs()...)...) all, err := cmd.CombinedOutput() if err != nil { log.Printf("Docker run for commit watcher = err:%v, output: %s", err, all) return err } container := strings.TrimSpace(string(all)) // Start a goroutine to wait for the watcher to die. go func() { exec.Command("docker", "wait", container).Run() exec.Command("docker", "rm", "-v", container).Run() log.Printf("Watcher crashed. Restarting soon.") restartWatcherSoon(conf) }() return nil } func restartWatcherSoon(conf watchConfig) { time.AfterFunc(30*time.Second, func() { startWatching(conf) }) } func builderKey(builder string) string { master := masterKey() if len(master) == 0 { return "" } h := hmac.New(md5.New, master) io.WriteString(h, builder) return fmt.Sprintf("%x", h.Sum(nil)) } func masterKey() []byte { keyOnce.Do(loadKey) return masterKeyCache } var ( keyOnce sync.Once masterKeyCache []byte ) func loadKey() { if *masterKeyFile != "" { b, err := ioutil.ReadFile(*masterKeyFile) if err != nil { log.Fatal(err) } masterKeyCache = bytes.TrimSpace(b) return } req, _ := http.NewRequest("GET", "http://metadata.google.internal/computeMetadata/v1/project/attributes/builder-master-key", nil) req.Header.Set("Metadata-Flavor", "Google") res, err := http.DefaultClient.Do(req) if err != nil { log.Fatal("No builder master key available") } defer res.Body.Close() if res.StatusCode != 200 { log.Fatalf("No builder-master-key project attribute available.") } slurp, err := ioutil.ReadAll(res.Body) if err != nil { log.Fatal(err) } masterKeyCache = bytes.TrimSpace(slurp) } func cleanUpOldContainers() { for { for _, cid := range oldContainers() { log.Printf("Cleaning old container %v", cid) exec.Command("docker", "rm", "-v", cid).Run() } time.Sleep(30 * time.Second) } } func oldContainers() []string { out, _ := exec.Command("docker", "ps", "-a", "--filter=status=exited", "--no-trunc", "-q").Output() return strings.Fields(string(out)) } // cleanUpOldVMs loops forever and periodically enumerates virtual // machines and deletes those which have expired. // // A VM is considered expired if it has a "delete-at" metadata // attribute having a unix timestamp before the current time. // // This is the safety mechanism to delete VMs which stray from the // normal deleting process. VMs are created to run a single build and // should be shut down by a controlling process. Due to various types // of failures, they might get stranded. To prevent them from getting // stranded and wasting resources forever, we instead set the // "delete-at" metadata attribute on them when created to some time // that's well beyond their expected lifetime. func cleanUpOldVMs() { if computeService == nil { return } for { for _, zone := range strings.Split(*cleanZones, ",") { zone = strings.TrimSpace(zone) if err := cleanZoneVMs(zone); err != nil { log.Printf("Error cleaning VMs in zone %q: %v", zone, err) } } time.Sleep(time.Minute) } } // cleanZoneVMs is part of cleanUpOldVMs, operating on a single zone. func cleanZoneVMs(zone string) error { // Fetch the first 500 (default) running instances and clean // thoes. We expect that we'll be running many fewer than // that. Even if we have more, eventually the first 500 will // either end or be cleaned, and then the next call will get a // partially-different 500. // TODO(bradfitz): revist this code if we ever start running // thousands of VMs. list, err := computeService.Instances.List(projectID, zone).Do() if err != nil { return fmt.Errorf("listing instances: %v", err) } for _, inst := range list.Items { if !strings.HasPrefix(inst.Name, "buildlet-") { // We only delete ones we created. continue } if inst.Metadata == nil { // Defensive. Not seen in practice. continue } sawDeleteAt := false for _, it := range inst.Metadata.Items { if it.Key == "delete-at" { sawDeleteAt = true unixDeadline, err := strconv.ParseInt(it.Value, 10, 64) if err != nil { log.Printf("invalid delete-at value %q seen; ignoring", it.Value) } if err == nil && time.Now().Unix() > unixDeadline { log.Printf("Deleting expired VM %q in zone %q ...", inst.Name, zone) deleteVM(zone, inst.Name) } } } if sawDeleteAt && !vmIsBuilding(inst.Name) { log.Printf("Deleting VM %q in zone %q from an earlier coordinator generation ...", inst.Name, zone) deleteVM(zone, inst.Name) } } return nil } func deleteVM(zone, instName string) { op, err := computeService.Instances.Delete(projectID, zone, instName).Do() if err != nil { log.Printf("Failed to delete instance %q in zone %q: %v", instName, zone, err) return } log.Printf("Sent request to delete instance %q in zone %q. Operation ID == %v", instName, zone, op.Id) } func hasComputeScope() bool { if !metadata.OnGCE() { return false } scopes, err := metadata.Scopes("default") if err != nil { log.Printf("failed to query metadata default scopes: %v", err) return false } for _, v := range scopes { if v == compute.DevstorageFull_controlScope { return true } } return false } // dash is copied from the builder binary. It runs the given method and command on the dashboard. // // TODO(bradfitz,adg): unify this somewhere? // // If args is non-nil it is encoded as the URL query string. // If req is non-nil it is JSON-encoded and passed as the body of the HTTP POST. // If resp is non-nil the server's response is decoded into the value pointed // to by resp (resp must be a pointer). func dash(meth, cmd string, args url.Values, req, resp interface{}) error { const builderVersion = 1 // keep in sync with dashboard/app/build/handler.go argsCopy := url.Values{"version": {fmt.Sprint(builderVersion)}} for k, v := range args { if k == "version" { panic(`dash: reserved args key: "version"`) } argsCopy[k] = v } var r *http.Response var err error cmd = "https://build.golang.org/" + cmd + "?" + argsCopy.Encode() switch meth { case "GET": if req != nil { log.Panicf("%s to %s with req", meth, cmd) } r, err = http.Get(cmd) case "POST": var body io.Reader if req != nil { b, err := json.Marshal(req) if err != nil { return err } body = bytes.NewBuffer(b) } r, err = http.Post(cmd, "text/json", body) default: log.Panicf("%s: invalid method %q", cmd, meth) panic("invalid method: " + meth) } if err != nil { return err } defer r.Body.Close() if r.StatusCode != http.StatusOK { return fmt.Errorf("bad http response: %v", r.Status) } body := new(bytes.Buffer) if _, err := body.ReadFrom(r.Body); err != nil { return err } // Read JSON-encoded Response into provided resp // and return an error if present. var result = struct { Response interface{} Error string }{ // Put the provided resp in here as it can be a pointer to // some value we should unmarshal into. Response: resp, } if err = json.Unmarshal(body.Bytes(), &result); err != nil { log.Printf("json unmarshal %#q: %s\n", body.Bytes(), err) return err } if result.Error != "" { return errors.New(result.Error) } return nil } func versionTgz(rev string) io.Reader { var buf bytes.Buffer zw := gzip.NewWriter(&buf) tw := tar.NewWriter(zw) contents := fmt.Sprintf("devel " + rev) check(tw.WriteHeader(&tar.Header{ Name: "VERSION", Mode: 0644, Size: int64(len(contents)), })) _, err := io.WriteString(tw, contents) check(err) check(tw.Close()) check(zw.Close()) return bytes.NewReader(buf.Bytes()) } // check is only for things which should be impossible (not even rare) // to fail. func check(err error) { if err != nil { panic("previously assumed to never fail: " + err.Error()) } }