// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, noting every URL and URL fragment it
// sees, and prints a report of the missing links at the end.
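//
// Usage:
//
//	linkcheck [-root URL] [-verbose]
//
// The root flag defaults to http://localhost:6060, where a locally
// running godoc server is typically reachable.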
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

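// aRx matches <a> tags whose href is a root-relative path.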
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // URL without fragment -> pages that link to it
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

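// localLinks returns the unique root-relative links found in body,
// excluding links into the /src/ tree.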
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}

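// idRx matches the id attribute of an HTML element.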
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

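// pageIDs returns the ids of all elements in body; these are the
// targets to which URL fragments can point.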
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl schedules url to be fetched if it hasn't been crawled already.
// url may contain a #fragment, in which case the fragment is noted as
// needing to exist on the target page.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		// Don't crawl the release history pages.
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		// Send asynchronously: crawl may be called from crawlLoop
		// itself, which is the only reader of urlq.
		urlq <- url
	}()
}

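// addProblem records a problem with url, along with the pages that
// link to it.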
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

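// crawlLoop fetches each URL received on urlq and records any
// errors as problems.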
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
		// Mark the fetch done only after any problem has been
		// recorded, so that main cannot read problems while this
		// goroutine is still appending to it.
		wg.Done()
	}
}

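// doCrawl fetches url, records the fragments defined on the page, and
// schedules crawls of the local links it contains.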
func doCrawl(url string) error {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// Call the transport directly so redirects are reported to us
	// rather than followed automatically.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)

	// Every fragment that some page linked to must exist on its
	// target page.
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}