1
0
mirror of https://github.com/golang/go synced 2024-11-17 20:24:46 -07:00
go/misc/linkcheck/linkcheck.go

194 lines
4.0 KiB
Go
Raw Normal View History

// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at the end.
package main
import (
"errors"
"flag"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strings"
"sync"
)
var (
root = flag.String("root", "http://localhost:6060", "Root to crawl")
verbose = flag.Bool("verbose", false, "verbose")
)
var wg sync.WaitGroup // outstanding fetches
var urlq = make(chan string) // URLs to crawl
// urlFrag is a URL and its optional #fragment (without the #)
type urlFrag struct {
url, frag string
}
var (
mu sync.Mutex
crawled = make(map[string]bool) // URL without fragment -> true
neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)
// Owned by crawlLoop goroutine:
var (
linkSources = make(map[string][]string) // url no fragment -> sources
fragExists = make(map[urlFrag]bool)
problems []string
)
func localLinks(body string) (links []string) {
seen := map[string]bool{}
mv := aRx.FindAllStringSubmatch(body, -1)
for _, m := range mv {
ref := m[1]
if strings.HasPrefix(ref, "/src/") {
continue
}
if !seen[ref] {
seen[ref] = true
links = append(links, m[1])
}
}
return
}
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)
func pageIDs(body string) (ids []string) {
mv := idRx.FindAllStringSubmatch(body, -1)
for _, m := range mv {
ids = append(ids, m[1])
}
return
}
// url may contain a #fragment, and the fragment is then noted as needing to exist.
func crawl(url string, sourceURL string) {
if strings.Contains(url, "/devel/release") {
return
}
mu.Lock()
defer mu.Unlock()
var frag string
if i := strings.Index(url, "#"); i >= 0 {
frag = url[i+1:]
url = url[:i]
if frag != "" {
uf := urlFrag{url, frag}
neededFrags[uf] = append(neededFrags[uf], sourceURL)
}
}
if crawled[url] {
return
}
crawled[url] = true
wg.Add(1)
go func() {
urlq <- url
}()
}
func addProblem(url, errmsg string) {
msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
if *verbose {
log.Print(msg)
}
problems = append(problems, msg)
}
func crawlLoop() {
for url := range urlq {
if err := doCrawl(url); err != nil {
addProblem(url, err.Error())
}
}
}
func doCrawl(url string) error {
defer wg.Done()
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return err
}
res, err := http.DefaultTransport.RoundTrip(req)
if err != nil {
return err
}
// Handle redirects.
if res.StatusCode/100 == 3 {
newURL, err := res.Location()
if err != nil {
return fmt.Errorf("resolving redirect: %v", err)
}
if !strings.HasPrefix(newURL.String(), *root) {
// Skip off-site redirects.
return nil
}
crawl(newURL.String(), url)
return nil
}
if res.StatusCode != 200 {
return errors.New(res.Status)
}
slurp, err := ioutil.ReadAll(res.Body)
res.Body.Close()
if err != nil {
log.Fatalf("Error reading %s body: %v", url, err)
}
if *verbose {
log.Printf("Len of %s: %d", url, len(slurp))
}
body := string(slurp)
for _, ref := range localLinks(body) {
if *verbose {
log.Printf(" links to %s", ref)
}
dest := *root + ref
linkSources[dest] = append(linkSources[dest], url)
crawl(dest, url)
}
for _, id := range pageIDs(body) {
if *verbose {
log.Printf(" url %s has #%s", url, id)
}
fragExists[urlFrag{url, id}] = true
}
return nil
}
func main() {
flag.Parse()
go crawlLoop()
crawl(*root, "")
wg.Wait()
close(urlq)
for uf, needers := range neededFrags {
if !fragExists[uf] {
problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
}
}
for _, s := range problems {
fmt.Println(s)
}
if len(problems) > 0 {
os.Exit(1)
}
}