misc/linkcheck: better redirect handling, use meaningful exit code
Prevent linkcheck from following redirects that lead outside the root URL.
Return a non-zero exit code when there are problems. Some minor refactoring
for clarity.

R=golang-dev, bradfitz
CC=golang-dev
https://golang.org/cl/14425049
This commit is contained in:
parent 2d6a13997a
commit e7426010c5
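A note on the refactoring below: the body of crawlLoop moves into a new doCrawl helper that returns an error, and the WaitGroup is released in a defer so that every return path signals completion exactly once. A minimal standalone sketch of that shape (all names here are illustrative, not from linkcheck):

// Sketch: loop body extracted into an error-returning helper,
// with completion signaled in a defer.
package main

import (
	"errors"
	"log"
	"sync"
)

var (
	wg    sync.WaitGroup
	queue = make(chan string)
)

func workLoop() {
	for item := range queue {
		if err := doWork(item); err != nil {
			log.Printf("problem with %q: %v", item, err)
		}
	}
}

func doWork(item string) error {
	defer wg.Done() // covers early returns too, unlike repeated explicit calls
	if item == "" {
		return errors.New("empty item")
	}
	// ... real work would go here ...
	return nil
}

func main() {
	go workLoop()
	wg.Add(1)
	queue <- "hello"
	wg.Wait()
	close(queue)
}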
@@ -8,11 +8,13 @@
 package main
 
 import (
+	"errors"
 	"flag"
 	"fmt"
 	"io/ioutil"
 	"log"
 	"net/http"
+	"os"
 	"regexp"
 	"strings"
 	"sync"
@@ -101,57 +103,78 @@ func crawl(url string, sourceURL string) {
 
 func addProblem(url, errmsg string) {
 	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
-	log.Print(msg)
+	if *verbose {
+		log.Print(msg)
+	}
 	problems = append(problems, msg)
 }
 
 func crawlLoop() {
 	for url := range urlq {
-		res, err := http.Get(url)
-		if err != nil {
-			addProblem(url, fmt.Sprintf("Error fetching: %v", err))
-			wg.Done()
-			continue
+		if err := doCrawl(url); err != nil {
+			addProblem(url, err.Error())
 		}
-		if res.StatusCode != 200 {
-			addProblem(url, fmt.Sprintf("Status code = %d", res.StatusCode))
-			wg.Done()
-			continue
-		}
-		slurp, err := ioutil.ReadAll(res.Body)
-		res.Body.Close()
-		if err != nil {
-			log.Fatalf("Error reading %s body: %v", url, err)
-		}
-		if *verbose {
-			log.Printf("Len of %s: %d", url, len(slurp))
-		}
-		body := string(slurp)
-		for _, ref := range localLinks(body) {
-			if *verbose {
-				log.Printf("  links to %s", ref)
-			}
-			dest := *root + ref
-			linkSources[dest] = append(linkSources[dest], url)
-			crawl(dest, url)
-		}
-		for _, id := range pageIDs(body) {
-			if *verbose {
-				log.Printf(" url %s has #%s", url, id)
-			}
-			fragExists[urlFrag{url, id}] = true
-		}
-
-		wg.Done()
 	}
 }
 
+func doCrawl(url string) error {
+	defer wg.Done()
+
+	req, err := http.NewRequest("GET", url, nil)
+	if err != nil {
+		return err
+	}
+	res, err := http.DefaultTransport.RoundTrip(req)
+	if err != nil {
+		return err
+	}
+	// Handle redirects.
+	if res.StatusCode/100 == 3 {
+		newURL, err := res.Location()
+		if err != nil {
+			return fmt.Errorf("resolving redirect: %v", err)
+		}
+		if !strings.HasPrefix(newURL.String(), *root) {
+			// Skip off-site redirects.
+			return nil
+		}
+		crawl(newURL.String(), url)
+		return nil
+	}
+	if res.StatusCode != 200 {
+		return errors.New(res.Status)
+	}
+	slurp, err := ioutil.ReadAll(res.Body)
+	res.Body.Close()
+	if err != nil {
+		log.Fatalf("Error reading %s body: %v", url, err)
+	}
+	if *verbose {
+		log.Printf("Len of %s: %d", url, len(slurp))
+	}
+	body := string(slurp)
+	for _, ref := range localLinks(body) {
+		if *verbose {
+			log.Printf("  links to %s", ref)
+		}
+		dest := *root + ref
+		linkSources[dest] = append(linkSources[dest], url)
+		crawl(dest, url)
+	}
+	for _, id := range pageIDs(body) {
+		if *verbose {
+			log.Printf(" url %s has #%s", url, id)
+		}
+		fragExists[urlFrag{url, id}] = true
+	}
+	return nil
+}
+
 func main() {
 	flag.Parse()
 
 	go crawlLoop()
 	crawl(*root, "")
 	crawl(*root+"/doc/go1.1.html", "")
 
 	wg.Wait()
 	close(urlq)
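The core of the redirect handling above is fetching with http.DefaultTransport.RoundTrip rather than http.Get. A Transport performs a single HTTP transaction and does not follow redirects, so any 3xx response surfaces to doCrawl, which resolves the Location header itself and re-enqueues only targets under *root. A minimal standalone sketch of the technique, assuming a hypothetical local root URL (not part of this commit):

// Sketch: detect a redirect without following it, then apply
// the same on-site/off-site test the commit uses.
package main

import (
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	const root = "http://localhost:8080" // hypothetical root, e.g. a local godoc

	req, err := http.NewRequest("GET", root+"/old-page", nil)
	if err != nil {
		log.Fatal(err)
	}
	// RoundTrip issues exactly one request; unlike http.Get, it does
	// not follow redirects, so a 3xx status reaches this code.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		log.Fatal(err)
	}
	defer res.Body.Close()

	if res.StatusCode/100 == 3 { // 301, 302, 303, 307, ...
		dest, err := res.Location() // resolves the Location header against req.URL
		if err != nil {
			log.Fatal(err)
		}
		if strings.HasPrefix(dest.String(), root) {
			fmt.Println("on-site redirect, would crawl:", dest)
		} else {
			fmt.Println("off-site redirect, skipped:", dest)
		}
		return
	}
	fmt.Println("status:", res.Status)
}

Because res.Location() resolves the header against the request URL, relative redirects also compare correctly against the root prefix.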
@@ -164,4 +187,7 @@ func main() {
 	for _, s := range problems {
 		fmt.Println(s)
 	}
+	if len(problems) > 0 {
+		os.Exit(1)
+	}
 }
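This last hunk is what makes the exit code meaningful: any recorded problem now terminates the process with status 1, so wrapper tooling can rely on the exit status instead of scraping output. A sketch of such a caller, assuming a linkcheck binary on PATH (the binary name and invocation are assumptions):

// Sketch: run linkcheck and act on its exit status.
package main

import (
	"log"
	"os/exec"
)

func main() {
	out, err := exec.Command("linkcheck").CombinedOutput()
	if err != nil {
		// A non-zero exit (len(problems) > 0) surfaces as *exec.ExitError.
		log.Fatalf("linkcheck reported problems:\n%s(%v)", out, err)
	}
	log.Printf("no problems found:\n%s", out)
}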