go/misc/linkcheck/linkcheck.go

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

// Command-line flags.
//
// root is the base URL of the site: the crawl starts there, site-relative
// links found in pages are resolved by appending them to it, and redirects
// whose target does not begin with it are not followed.
//
// verbose turns on log output for each fetched page, each link and id found,
// and each problem as it is recorded.
var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

// wg and urlq coordinate the crawl. For every URL it has not seen before,
// crawl calls wg.Add(1) and sends the URL on urlq from a new goroutine (the
// channel is unbuffered). crawlLoop receives from urlq and hands each URL to
// doCrawl, which calls wg.Done when it returns. main waits on wg and then
// closes urlq.
var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
// It is the key type of the neededFrags and fragExists maps.
type urlFrag struct {
	url, frag string
}

// Crawl bookkeeping. crawl holds mu for the whole time it reads and updates
// crawled and neededFrags. main ranges over neededFrags without taking mu,
// after wg.Wait has returned.
var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

// aRx matches an anchor tag in which href directly follows "<a " and whose
// target is a site-relative path beginning with "/". The value may be
// unquoted or quoted with ' or "; submatch 1 is the path (including any
// #fragment), ending at the first whitespace, quote or '>'.
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine: doCrawl and addProblem, both called from
// crawlLoop, access these without locking. main additionally reads
// fragExists and reads and appends to problems once wg.Wait has returned.
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

// localLinks returns the site-relative link targets that aRx finds in body,
// in order of first appearance and without duplicates. Targets under /src/
// are left out. The result is nil when body contains no such links.
func localLinks(body string) []string {
	var links []string
	visited := make(map[string]bool)
	for _, match := range aRx.FindAllStringSubmatch(body, -1) {
		href := match[1]
		if strings.HasPrefix(href, "/src/") || visited[href] {
			continue
		}
		visited[href] = true
		links = append(links, href)
	}
	return links
}

// idRx matches an id attribute whose value is unquoted or quoted with ' or ";
// submatch 1 is the value, ending at the first whitespace, quote or '>'.
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the value of every id attribute that idRx finds in body,
// in document order. Duplicates are kept. The result is nil when there are
// no matches.
func pageIDs(body string) []string {
	var found []string
	for _, match := range idRx.FindAllStringSubmatch(body, -1) {
		found = append(found, match[1])
	}
	return found
}

// crawl schedules url to be fetched unless it has been scheduled before.
//
// url may contain a #fragment. A non-empty fragment is recorded in
// neededFrags, together with sourceURL, as something that must exist on the
// target page; the fragment is then dropped and only the remaining URL is
// queued. URLs containing "/devel/release" are ignored entirely.
//
// For each newly seen URL, crawl calls wg.Add(1) and sends the URL on urlq
// from a separate goroutine, so crawl itself never blocks on the channel.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}

	mu.Lock()
	defer mu.Unlock()

	page := url
	if hash := strings.Index(url, "#"); hash >= 0 {
		page = url[:hash]
		if anchor := url[hash+1:]; anchor != "" {
			key := urlFrag{url: page, frag: anchor}
			neededFrags[key] = append(neededFrags[key], sourceURL)
		}
	}

	if crawled[page] {
		return
	}
	crawled[page] = true

	wg.Add(1)
	go func(next string) {
		urlq <- next
	}(page)
}

// addProblem records a failure to fetch url. The message combines url,
// errmsg and the sources recorded for url in linkSources; it is appended to
// problems and, when -verbose is set, also logged right away.
func addProblem(url, errmsg string) {
	sources := linkSources[url]
	report := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, sources)
	problems = append(problems, report)
	if *verbose {
		log.Print(report)
	}
}

// crawlLoop receives URLs from urlq one at a time and crawls each with
// doCrawl, recording any returned error through addProblem. It returns once
// urlq has been closed.
func crawlLoop() {
	for {
		next, open := <-urlq
		if !open {
			return
		}
		err := doCrawl(next)
		if err == nil {
			continue
		}
		addProblem(next, err.Error())
	}
}

// doCrawl fetches url and processes the response. It calls wg.Done when it
// returns, matching the wg.Add made by crawl when url was queued.
//
//   - A 3xx response is treated as a redirect: the target is passed to crawl
//     if it begins with *root, and silently skipped otherwise.
//   - Any other non-200 status is returned as an error.
//   - For a 200 response the body is read; every local link in it is
//     recorded in linkSources and passed to crawl, and every id attribute is
//     recorded in fragExists.
//
// A non-nil error describes why url could not be fetched or read; the caller
// is expected to record it as a problem.
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// RoundTrip performs exactly one request, so 3xx responses arrive here
	// and are handled explicitly below.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Close the body on every return path, including the redirect and
	// non-200 ones, so the underlying connection is not leaked.
	defer res.Body.Close()

	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		// Report the failure like any other problem instead of aborting
		// the whole crawl and losing the rest of the report.
		return fmt.Errorf("reading body: %v", err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		// Key linkSources by the fragment-free URL: that is the form crawl
		// queues and therefore the form addProblem looks up.
		page := dest
		if i := strings.Index(page, "#"); i >= 0 {
			page = page[:i]
		}
		// localLinks only de-duplicates whole refs, so links to several
		// fragments of one page map to the same key. All appends for url
		// happen during this call, so checking the last entry is enough to
		// avoid listing url more than once.
		if srcs := linkSources[page]; len(srcs) == 0 || srcs[len(srcs)-1] != url {
			linkSources[page] = append(srcs, url)
		}
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

// main parses the flags, crawls the site starting at *root, and then prints
// every problem found, one per line: fetch errors recorded during the crawl
// followed by fragments that were linked to but not found on their page.
// It exits with status 1 if there was at least one problem.
func main() {
	flag.Parse()

	// Track when crawlLoop exits. doCrawl calls wg.Done before crawlLoop
	// gets to record a returned error with addProblem, so wg.Wait alone does
	// not guarantee that the last error has been appended to problems.
	loopDone := make(chan struct{})
	go func() {
		defer close(loopDone)
		crawlLoop()
	}()
	crawl(*root, "")

	wg.Wait()
	// Every queued URL has been received and processed by now, so no sender
	// remains and the channel can be closed to stop crawlLoop.
	close(urlq)
	<-loopDone

	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`// Copyright 2013 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`// The linkcheck command finds missing links in the godoc website.`
			`// It crawls a URL recursively and notes URLs and URL fragments`
			`// that it's seen and prints a report of missing links at the end.`
			`package main`

			`import (`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`"errors"`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`"flag"`
			`"fmt"`
			`"io/ioutil"`
			`"log"`
			`"net/http"`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`"os"`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`"regexp"`
			`"strings"`
			`"sync"`
			`)`

			`var (`
			`root = flag.String("root", "http://localhost:6060", "Root to crawl")`
			`verbose = flag.Bool("verbose", false, "verbose")`
			`)`

			`var wg sync.WaitGroup // outstanding fetches`
			`var urlq = make(chan string) // URLs to crawl`

			`// urlFrag is a URL and its optional #fragment (without the #)`
			`type urlFrag struct {`
			`url, frag string`
			`}`

			`var (`
			`mu sync.Mutex`
			`crawled = make(map[string]bool) // URL without fragment -> true`
			`neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it`
			`)`

			var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

			`// Owned by crawlLoop goroutine:`
			`var (`
			`linkSources = make(map[string][]string) // url no fragment -> sources`
			`fragExists = make(map[urlFrag]bool)`
			`problems []string`
			`)`

			`func localLinks(body string) (links []string) {`
			`seen := map[string]bool{}`
			`mv := aRx.FindAllStringSubmatch(body, -1)`
			`for _, m := range mv {`
			`ref := m[1]`
			`if strings.HasPrefix(ref, "/src/") {`
			`continue`
			`}`
			`if !seen[ref] {`
			`seen[ref] = true`
			`links = append(links, m[1])`
			`}`
			`}`
			`return`
			`}`

			var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

			`func pageIDs(body string) (ids []string) {`
			`mv := idRx.FindAllStringSubmatch(body, -1)`
			`for _, m := range mv {`
			`ids = append(ids, m[1])`
			`}`
			`return`
			`}`

			`// url may contain a #fragment, and the fragment is then noted as needing to exist.`
			`func crawl(url string, sourceURL string) {`
			`if strings.Contains(url, "/devel/release") {`
			`return`
			`}`
			`mu.Lock()`
			`defer mu.Unlock()`
			`var frag string`
			`if i := strings.Index(url, "#"); i >= 0 {`
			`frag = url[i+1:]`
			`url = url[:i]`
			`if frag != "" {`
			`uf := urlFrag{url, frag}`
			`neededFrags[uf] = append(neededFrags[uf], sourceURL)`
			`}`
			`}`
			`if crawled[url] {`
			`return`
			`}`
			`crawled[url] = true`

			`wg.Add(1)`
			`go func() {`
			`urlq <- url`
			`}()`
			`}`

			`func addProblem(url, errmsg string) {`
			`msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`if *verbose {`
			`log.Print(msg)`
			`}`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`problems = append(problems, msg)`
			`}`

			`func crawlLoop() {`
			`for url := range urlq {`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`if err := doCrawl(url); err != nil {`
			`addProblem(url, err.Error())`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`}`
			`}`

			`func doCrawl(url string) error {`
			`defer wg.Done()`

			`req, err := http.NewRequest("GET", url, nil)`
			`if err != nil {`
			`return err`
			`}`
			`res, err := http.DefaultTransport.RoundTrip(req)`
			`if err != nil {`
			`return err`
			`}`
			`// Handle redirects.`
			`if res.StatusCode/100 == 3 {`
			`newURL, err := res.Location()`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`if err != nil {`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`return fmt.Errorf("resolving redirect: %v", err)`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`if !strings.HasPrefix(newURL.String(), *root) {`
			`// Skip off-site redirects.`
			`return nil`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`crawl(newURL.String(), url)`
			`return nil`
			`}`
			`if res.StatusCode != 200 {`
			`return errors.New(res.Status)`
			`}`
			`slurp, err := ioutil.ReadAll(res.Body)`
			`res.Body.Close()`
			`if err != nil {`
			`log.Fatalf("Error reading %s body: %v", url, err)`
			`}`
			`if *verbose {`
			`log.Printf("Len of %s: %d", url, len(slurp))`
			`}`
			`body := string(slurp)`
			`for _, ref := range localLinks(body) {`
			`if *verbose {`
			`log.Printf(" links to %s", ref)`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`dest := *root + ref`
			`linkSources[dest] = append(linkSources[dest], url)`
			`crawl(dest, url)`
			`}`
			`for _, id := range pageIDs(body) {`
			`if *verbose {`
			`log.Printf(" url %s has #%s", url, id)`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`fragExists[urlFrag{url, id}] = true`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`return nil`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`

			`func main() {`
			`flag.Parse()`

			`go crawlLoop()`
			`crawl(*root, "")`

			`wg.Wait()`
			`close(urlq)`
			`for uf, needers := range neededFrags {`
			`if !fragExists[uf] {`
			`problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))`
			`}`
			`}`

			`for _, s := range problems {`
			`fmt.Println(s)`
			`}`
misc/linkcheck: better redirect handling, use meaningful exit code Prevent linkcheck from following redirects that lead beyond the outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049 2013-10-25 08:31:02 -06:00			`if len(problems) > 0 {`
			`os.Exit(1)`
			`}`
misc: add linkcheck tool Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044 2013-08-29 13:08:11 -06:00			`}`