// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively, notes the URLs and URL fragments it
// has seen, and prints a report of missing links at the end.
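//
// Usage:
//
//	linkcheck [-root=http://localhost:6060] [-verbose]
//
// The -root flag sets the base URL to crawl (by default a godoc server
// assumed to be running on localhost:6060); -verbose enables progress
// logging.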
package main

import (
	"errors"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl

// urlFrag is a URL and its optional #fragment (without the #).
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

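// aRx matches <a href> links to root-relative paths, capturing the path
// (which may include a #fragment).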
var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

// Owned by crawlLoop goroutine:
var (
	linkSources = make(map[string][]string) // url no fragment -> sources
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

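// localLinks returns the deduplicated root-relative links found in body,
// skipping links under /src/.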
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, m[1])
		}
	}
	return
}

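// idRx matches id attributes, capturing the id value (a potential
// #fragment target).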
var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

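// pageIDs returns the values of all id attributes found in body.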
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

// crawl registers url to be crawled if it hasn't been seen yet.
// url may contain a #fragment, in which case the fragment is noted
// as needing to exist on the target page.
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	var frag string
	if i := strings.Index(url, "#"); i >= 0 {
		frag = url[i+1:]
		url = url[:i]
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		urlq <- url
	}()
}

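// addProblem records a problem report for url, noting the pages that
// link to it.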
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

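// crawlLoop receives URLs from urlq and fetches each one, recording any
// failure as a problem. It runs in its own goroutine and owns linkSources,
// fragExists, and problems.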
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

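// doCrawl fetches url, follows same-site redirects, and records the local
// links and fragment IDs found in the response body.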
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	// Handle redirects.
	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := ioutil.ReadAll(res.Body)
	res.Body.Close()
	if err != nil {
		log.Fatalf("Error reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf("  links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}