mirror of
https://github.com/golang/go
synced 2024-11-17 19:54:45 -07:00
e7426010c5
Prevent linkcheck from following redirects that lead outside the root URL. Return a non-zero exit code when there are problems. Some minor refactoring for clarity. R=golang-dev, bradfitz CC=golang-dev https://golang.org/cl/14425049
194 lines
4.0 KiB
Go
194 lines
4.0 KiB
Go
// Copyright 2013 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// The linkcheck command finds missing links in the godoc website.
|
|
// It crawls a URL recursively and notes URLs and URL fragments
|
|
// that it's seen and prints a report of missing links at the end.
|
|
package main
|
|
|
|
import (
|
|
"errors"
|
|
"flag"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"log"
|
|
"net/http"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// root is the base URL of the site: the crawl starts there and every local
// link found in a page is appended to it.
var root = flag.String("root", "http://localhost:6060", "Root to crawl")

// verbose turns on per-page progress logging.
var verbose = flag.Bool("verbose", false, "verbose")
|
|
|
|
var (
	// wg tracks outstanding fetches.
	wg sync.WaitGroup

	// urlq carries the URLs to crawl; it is unbuffered.
	urlq = make(chan string)
)
|
|
|
|
// urlFrag pairs a URL with an optional fragment identifier.
// The fragment is stored without its leading '#'.
type urlFrag struct {
	url  string
	frag string
}
|
|
|
|
var (
|
|
mu sync.Mutex
|
|
crawled = make(map[string]bool) // URL without fragment -> true
|
|
neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
|
|
)
|
|
|
|
// aRx matches an <a href=...> attribute, quoted or not, whose target starts
// with "/". Submatch 1 is the target up to the first whitespace, quote or '>'.
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)
|
|
|
|
// Owned by crawlLoop goroutine:
|
|
var (
|
|
linkSources = make(map[string][]string) // url no fragment -> sources
|
|
fragExists = make(map[urlFrag]bool)
|
|
problems []string
|
|
)
|
|
|
|
func localLinks(body string) (links []string) {
|
|
seen := map[string]bool{}
|
|
mv := aRx.FindAllStringSubmatch(body, -1)
|
|
for _, m := range mv {
|
|
ref := m[1]
|
|
if strings.HasPrefix(ref, "/src/") {
|
|
continue
|
|
}
|
|
if !seen[ref] {
|
|
seen[ref] = true
|
|
links = append(links, m[1])
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// idRx matches an id=... attribute, quoted or not; submatch 1 is the value.
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns the value of every id attribute found in body, in
// document order. Duplicates are kept.
func pageIDs(body string) (ids []string) {
	for _, loc := range idRx.FindAllStringSubmatchIndex(body, -1) {
		// loc[2]:loc[3] delimits the first capture group.
		ids = append(ids, body[loc[2]:loc[3]])
	}
	return ids
}
|
|
|
|
// url may contain a #fragment, and the fragment is then noted as needing to exist.
|
|
func crawl(url string, sourceURL string) {
|
|
if strings.Contains(url, "/devel/release") {
|
|
return
|
|
}
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
var frag string
|
|
if i := strings.Index(url, "#"); i >= 0 {
|
|
frag = url[i+1:]
|
|
url = url[:i]
|
|
if frag != "" {
|
|
uf := urlFrag{url, frag}
|
|
neededFrags[uf] = append(neededFrags[uf], sourceURL)
|
|
}
|
|
}
|
|
if crawled[url] {
|
|
return
|
|
}
|
|
crawled[url] = true
|
|
|
|
wg.Add(1)
|
|
go func() {
|
|
urlq <- url
|
|
}()
|
|
}
|
|
|
|
func addProblem(url, errmsg string) {
|
|
msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
|
|
if *verbose {
|
|
log.Print(msg)
|
|
}
|
|
problems = append(problems, msg)
|
|
}
|
|
|
|
func crawlLoop() {
|
|
for url := range urlq {
|
|
if err := doCrawl(url); err != nil {
|
|
addProblem(url, err.Error())
|
|
}
|
|
}
|
|
}
|
|
|
|
func doCrawl(url string) error {
|
|
defer wg.Done()
|
|
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
res, err := http.DefaultTransport.RoundTrip(req)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Handle redirects.
|
|
if res.StatusCode/100 == 3 {
|
|
newURL, err := res.Location()
|
|
if err != nil {
|
|
return fmt.Errorf("resolving redirect: %v", err)
|
|
}
|
|
if !strings.HasPrefix(newURL.String(), *root) {
|
|
// Skip off-site redirects.
|
|
return nil
|
|
}
|
|
crawl(newURL.String(), url)
|
|
return nil
|
|
}
|
|
if res.StatusCode != 200 {
|
|
return errors.New(res.Status)
|
|
}
|
|
slurp, err := ioutil.ReadAll(res.Body)
|
|
res.Body.Close()
|
|
if err != nil {
|
|
log.Fatalf("Error reading %s body: %v", url, err)
|
|
}
|
|
if *verbose {
|
|
log.Printf("Len of %s: %d", url, len(slurp))
|
|
}
|
|
body := string(slurp)
|
|
for _, ref := range localLinks(body) {
|
|
if *verbose {
|
|
log.Printf(" links to %s", ref)
|
|
}
|
|
dest := *root + ref
|
|
linkSources[dest] = append(linkSources[dest], url)
|
|
crawl(dest, url)
|
|
}
|
|
for _, id := range pageIDs(body) {
|
|
if *verbose {
|
|
log.Printf(" url %s has #%s", url, id)
|
|
}
|
|
fragExists[urlFrag{url, id}] = true
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
|
|
go crawlLoop()
|
|
crawl(*root, "")
|
|
|
|
wg.Wait()
|
|
close(urlq)
|
|
for uf, needers := range neededFrags {
|
|
if !fragExists[uf] {
|
|
problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
|
|
}
|
|
}
|
|
|
|
for _, s := range problems {
|
|
fmt.Println(s)
|
|
}
|
|
if len(problems) > 0 {
|
|
os.Exit(1)
|
|
}
|
|
}
|