diff --git a/misc/linkcheck/linkcheck.go b/misc/linkcheck/linkcheck.go new file mode 100644 index 00000000000..01e9879a196 --- /dev/null +++ b/misc/linkcheck/linkcheck.go @@ -0,0 +1,167 @@ +// Copyright 2013 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// The linkcheck command finds missing links in the godoc website. +// It crawls a URL recursively and notes URLs and URL fragments +// that it's seen and prints a report of missing links at the end. +package main + +import ( + "flag" + "fmt" + "io/ioutil" + "log" + "net/http" + "regexp" + "strings" + "sync" +) + +var ( + root = flag.String("root", "http://localhost:6060", "Root to crawl") + verbose = flag.Bool("verbose", false, "verbose") +) + +var wg sync.WaitGroup // outstanding fetches +var urlq = make(chan string) // URLs to crawl + +// urlFrag is a URL and its optional #fragment (without the #) +type urlFrag struct { + url, frag string +} + +var ( + mu sync.Mutex + crawled = make(map[string]bool) // URL without fragment -> true + neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it +) + +var aRx = regexp.MustCompile(`]+)`) + +// Owned by crawlLoop goroutine: +var ( + linkSources = make(map[string][]string) // url no fragment -> sources + fragExists = make(map[urlFrag]bool) + problems []string +) + +func localLinks(body string) (links []string) { + seen := map[string]bool{} + mv := aRx.FindAllStringSubmatch(body, -1) + for _, m := range mv { + ref := m[1] + if strings.HasPrefix(ref, "/src/") { + continue + } + if !seen[ref] { + seen[ref] = true + links = append(links, m[1]) + } + } + return +} + +var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`) + +func pageIDs(body string) (ids []string) { + mv := idRx.FindAllStringSubmatch(body, -1) + for _, m := range mv { + ids = append(ids, m[1]) + } + return +} + +// url may contain a #fragment, and the fragment is then noted as needing to exist. +func crawl(url string, sourceURL string) { + if strings.Contains(url, "/devel/release") { + return + } + mu.Lock() + defer mu.Unlock() + var frag string + if i := strings.Index(url, "#"); i >= 0 { + frag = url[i+1:] + url = url[:i] + if frag != "" { + uf := urlFrag{url, frag} + neededFrags[uf] = append(neededFrags[uf], sourceURL) + } + } + if crawled[url] { + return + } + crawled[url] = true + + wg.Add(1) + go func() { + urlq <- url + }() +} + +func addProblem(url, errmsg string) { + msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url]) + log.Print(msg) + problems = append(problems, msg) +} + +func crawlLoop() { + for url := range urlq { + res, err := http.Get(url) + if err != nil { + addProblem(url, fmt.Sprintf("Error fetching: %v", err)) + wg.Done() + continue + } + if res.StatusCode != 200 { + addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode)) + wg.Done() + continue + } + slurp, err := ioutil.ReadAll(res.Body) + res.Body.Close() + if err != nil { + log.Fatalf("Error reading %s body: %v", url, err) + } + if *verbose { + log.Printf("Len of %s: %d", url, len(slurp)) + } + body := string(slurp) + for _, ref := range localLinks(body) { + if *verbose { + log.Printf(" links to %s", ref) + } + dest := *root + ref + linkSources[dest] = append(linkSources[dest], url) + crawl(dest, url) + } + for _, id := range pageIDs(body) { + if *verbose { + log.Printf(" url %s has #%s", url, id) + } + fragExists[urlFrag{url, id}] = true + } + + wg.Done() + } +} + +func main() { + flag.Parse() + + go crawlLoop() + crawl(*root, "") + crawl(*root+"/doc/go1.1.html", "") + + wg.Wait() + close(urlq) + for uf, needers := range neededFrags { + if !fragExists[uf] { + problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers)) + } + } + + for _, s := range problems { + fmt.Println(s) + } +}