mirror of
https://github.com/golang/go
synced 2024-11-24 21:10:04 -07:00
misc: add linkcheck tool
Fixes #5378 R=golang-dev, r CC=golang-dev https://golang.org/cl/13247044
This commit is contained in:
parent
280c8b90e2
commit
2fb8022e58
167
misc/linkcheck/linkcheck.go
Normal file
167
misc/linkcheck/linkcheck.go
Normal file
@ -0,0 +1,167 @@
|
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The linkcheck command finds missing links in the godoc website.
|
||||
// It crawls a URL recursively and notes URLs and URL fragments
|
||||
// that it's seen and prints a report of missing links at the end.
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Command-line flags.
var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)
|
||||
|
||||
var wg sync.WaitGroup        // outstanding fetches; incremented in crawl, decremented in crawlLoop
var urlq = make(chan string) // URLs to crawl, consumed by crawlLoop
|
||||
|
||||
// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}
|
||||
|
||||
// Shared crawl state, guarded by mu.
var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)
|
||||
|
||||
// aRx matches anchor tags whose href is an absolute local path,
// capturing the path itself.
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// localLinks returns the local (same-host) hrefs found in body, in
// first-seen order with duplicates removed, skipping anything under /src/.
func localLinks(body string) []string {
	var links []string
	seen := make(map[string]bool)
	for _, match := range aRx.FindAllStringSubmatch(body, -1) {
		ref := match[1]
		if strings.HasPrefix(ref, "/src/") || seen[ref] {
			continue
		}
		seen[ref] = true
		links = append(links, ref)
	}
	return links
}
|
||||
|
||||
// idRx matches id= attributes in HTML tags, capturing the id value.
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

// pageIDs returns every id= attribute value found in body, in document order.
func pageIDs(body string) []string {
	var ids []string
	for _, match := range idRx.FindAllStringSubmatch(body, -1) {
		ids = append(ids, match[1])
	}
	return ids
}
|
||||
|
||||
// url may contain a #fragment, and the fragment is then noted as needing to exist.
|
||||
func crawl(url string, sourceURL string) {
|
||||
if strings.Contains(url, "/devel/release") {
|
||||
return
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
var frag string
|
||||
if i := strings.Index(url, "#"); i >= 0 {
|
||||
frag = url[i+1:]
|
||||
url = url[:i]
|
||||
if frag != "" {
|
||||
uf := urlFrag{url, frag}
|
||||
neededFrags[uf] = append(neededFrags[uf], sourceURL)
|
||||
}
|
||||
}
|
||||
if crawled[url] {
|
||||
return
|
||||
}
|
||||
crawled[url] = true
|
||||
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
urlq <- url
|
||||
}()
|
||||
}
|
||||
|
||||
func addProblem(url, errmsg string) {
|
||||
msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
|
||||
log.Print(msg)
|
||||
problems = append(problems, msg)
|
||||
}
|
||||
|
||||
func crawlLoop() {
|
||||
for url := range urlq {
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
addProblem(url, fmt.Sprintf("Error fetching: %v", err))
|
||||
wg.Done()
|
||||
continue
|
||||
}
|
||||
if res.StatusCode != 200 {
|
||||
addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode))
|
||||
wg.Done()
|
||||
continue
|
||||
}
|
||||
slurp, err := ioutil.ReadAll(res.Body)
|
||||
res.Body.Close()
|
||||
if err != nil {
|
||||
log.Fatalf("Error reading %s body: %v", url, err)
|
||||
}
|
||||
if *verbose {
|
||||
log.Printf("Len of %s: %d", url, len(slurp))
|
||||
}
|
||||
body := string(slurp)
|
||||
for _, ref := range localLinks(body) {
|
||||
if *verbose {
|
||||
log.Printf(" links to %s", ref)
|
||||
}
|
||||
dest := *root + ref
|
||||
linkSources[dest] = append(linkSources[dest], url)
|
||||
crawl(dest, url)
|
||||
}
|
||||
for _, id := range pageIDs(body) {
|
||||
if *verbose {
|
||||
log.Printf(" url %s has #%s", url, id)
|
||||
}
|
||||
fragExists[urlFrag{url, id}] = true
|
||||
}
|
||||
|
||||
wg.Done()
|
||||
}
|
||||
}
|
||||
|
||||
func main() {
|
||||
flag.Parse()
|
||||
|
||||
go crawlLoop()
|
||||
crawl(*root, "")
|
||||
crawl(*root+"/doc/go1.1.html", "")
|
||||
|
||||
wg.Wait()
|
||||
close(urlq)
|
||||
for uf, needers := range neededFrags {
|
||||
if !fragExists[uf] {
|
||||
problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
|
||||
}
|
||||
}
|
||||
|
||||
for _, s := range problems {
|
||||
fmt.Println(s)
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user