blob: 311984a3f01886e93d2f84771170f5751f8ac68e [file] [log] [blame]
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crawl
import (
"context"
"errors"
"net/http"
"net/url"
"slices"
"strings"
"testing"
"golang.org/x/net/html"
"golang.org/x/oscar/internal/storage"
"golang.org/x/oscar/internal/testutil"
)
// TestCrawl runs a Crawler against the recorded responses in
// testdata/godev.txt and checks which URLs ended up visited, stored with
// HTML, or skipped. It then builds two more crawlers on the same database
// to check recrawling: with the default setting only the newly added
// /root2 may be fetched, while after SetRecrawl(0) pages must be fetched
// again and the run must still finish.
func TestCrawl(t *testing.T) {
	must := testutil.Checker(t)
	logger := testutil.Slogger(t)
	store := storage.MemDB()
	ctx := context.Background()

	// build returns a crawler over the shared store that fetches through
	// client, configured with the package-level allow, deny, and clean
	// fixtures and seeded with the site root.
	build := func(client *http.Client) *Crawler {
		crawler := New(logger, store, client)
		crawler.Allow(allow...)
		crawler.Deny(deny...)
		crawler.Clean(clean)
		// NOTE(review): the root is added twice; this looks like a
		// deliberate check that a repeated Add is harmless — confirm.
		crawler.Add("https://go.dev/")
		crawler.Add("https://go.dev/")
		return crawler
	}

	replay := readTestClient(t, "testdata/godev.txt")
	crawler := build(replay)

	// Adding a URL with a fragment is expected to panic, so reaching the
	// Errorf is the failure case. StopPanic presumably recovers the panic.
	testutil.StopPanic(func() {
		crawler.Add("https://go.dev/#foo")
		t.Errorf("Add with URL fragment did not panic")
	})

	must(crawler.Run(ctx))

	// Pages that must be known to the crawler but carry no HTML.
	for _, u := range needVisited {
		page, found := crawler.Get(u)
		switch {
		case !found:
			t.Errorf("Crawl %s: should have visited, did not", u)
		case len(page.HTML) > 0:
			t.Errorf("Crawl %s: should have visited but not recorded HTML; found HTML", u)
		}
	}

	// Pages that must be known to the crawler and carry HTML.
	for _, u := range needHTML {
		page, found := crawler.Get(u)
		switch {
		case !found:
			t.Errorf("Crawl %s: should have recorded HTML, did not visit", u)
		case len(page.HTML) == 0:
			t.Errorf("Crawl %s: should have recorded HTML, visited but no HTML", u)
		}
	}

	// Pages the crawler must not know about at all.
	for _, u := range needSkipped {
		if _, found := crawler.Get(u); !found {
			continue
		}
		t.Errorf("Crawl %s: should have skipped, found queued page", u)
	}

	// Check for various errors: every page whose URL contains /err/
	// must have a non-empty Error.
	for page := range crawler.PageWatcher("test1").Recent() {
		if page.Error != "" || !strings.Contains(page.URL, "/err/") {
			continue
		}
		t.Errorf("crawl %s: no error", page.URL)
	}

	// Check that default recrawl does not recrawl: any request other
	// than the one for /root2 fails the test.
	sawRoot2 := false
	onlyRoot2 := transportFunc(func(req *http.Request) (*http.Response, error) {
		if req.URL.Path != "/root2" {
			t.Fatalf("crawler recrawled too soon")
			panic("unreachable")
		}
		sawRoot2 = true
		return replay.Transport.RoundTrip(req)
	})
	crawler = build(&http.Client{Transport: onlyRoot2})
	crawler.Add("https://go.dev/root2") // not seen yet
	must(crawler.Run(ctx))
	if !sawRoot2 {
		t.Errorf("did not crawl /root2")
	}

	// Check that Recrawl(0) does recrawl, but also terminates. The
	// transport only counts requests before handing them to the replay
	// client.
	fetches := 0
	counting := transportFunc(func(req *http.Request) (*http.Response, error) {
		fetches++
		return replay.Transport.RoundTrip(req)
	})
	crawler = build(&http.Client{Transport: counting})
	crawler.SetRecrawl(0)
	must(crawler.Run(ctx))
	if fetches < 3 {
		t.Fatalf("Run after Recrawl(0) crawled %d pages, want ≥ 3", fetches)
	}
}
// URL fixtures used by TestCrawl.
var (
	// allow holds the URL prefixes handed to Crawler.Allow.
	allow = []string{
		"https://go.dev/",
	}

	// deny holds the URL prefixes handed to Crawler.Deny.
	//
	// NOTE(review): "https://go.dev/play" has no trailing slash, yet
	// needHTML expects https://go.dev/player/okay to be crawled, so Deny
	// presumably matches whole path elements — confirm against Crawler.
	deny = []string{
		"https://go.dev/api/",
		"https://go.dev/change/",
		"https://go.dev/cl/",
		"https://go.dev/design/",
		"https://go.dev/dl/",
		"https://go.dev/issue/",
		"https://go.dev/lib/",
		"https://go.dev/misc/",
		"https://go.dev/play",
		"https://go.dev/s/",
		"https://go.dev/src/",
		"https://go.dev/test/",
	}

	// needVisited holds URLs that the crawler must know about after the
	// first run but for which no HTML may have been recorded.
	needVisited = []string{
		"https://go.dev/doc/faq/",
		"https://go.dev/err/bad-status",
		"https://go.dev/err/bad-content-type",
		"https://go.dev/err/redirect-no-location",
		"https://go.dev/err/redirect-bad-url",
		"https://go.dev/err/body-too-large",
		"https://go.dev/err/body-read-error",
	}

	// needHTML holds URLs that the crawler must know about after the
	// first run with non-empty HTML.
	needHTML = []string{
		"https://go.dev/",
		"https://go.dev/doc/faq",
		"https://go.dev/pkg/math/?m=old",
		"https://go.dev/pkg/strings/?m=old",
		"https://go.dev/player/okay",
	}

	// needSkipped holds URLs that the crawler must not know about after
	// the first run.
	needSkipped = []string{
		"https://go.dev/play/p/asdf",
		"https://go.dev/s/short",
		"https://www.google.com/",
		"https://go.dev/root2",
		"https://go.dev/err/clean-error",
		"https://go.dev/err/disallow-after-clean",
	}
)
// clean is the URL-cleaning callback that TestCrawl installs with
// Crawler.Clean. It rewrites u in place:
//
//   - On host go.dev the query string is dropped, except that paths
//     starting with /pkg or /cmd get the fixed query "m=old".
//   - On any host, a path starting with /err/disallow-after-clean is
//     replaced by /s/disallow.
//   - On any host, a path starting with /err/clean-error makes clean
//     return an error.
//
// In every other case clean returns nil.
func clean(u *url.URL) error {
	if u.Host == "go.dev" {
		// Decide the replacement query first, then overwrite whatever
		// the URL carried, including a bare trailing "?".
		query := ""
		if strings.HasPrefix(u.Path, "/pkg") || strings.HasPrefix(u.Path, "/cmd") {
			query = "m=old"
		}
		u.RawQuery = query
		u.ForceQuery = false
	}

	// The two /err/ rules cannot both apply: once the path has been
	// rewritten to /s/disallow it no longer starts with /err/.
	switch {
	case strings.HasPrefix(u.Path, "/err/disallow-after-clean"):
		u.Path = "/s/disallow"
	case strings.HasPrefix(u.Path, "/err/clean-error"):
		return errors.New("clean error!")
	}
	return nil
}
// TestLinks checks links on a small parsed document containing four
// anchors. Every href, relative or absolute, is expected back as an
// absolute URL resolved against the base, in document order. A second
// pass stops ranging after two results and expects exactly the first two
// links.
func TestLinks(t *testing.T) {
	must := testutil.Checker(t)

	base, err := url.Parse("https://go.dev/")
	must(err)

	// Three relative hrefs and one absolute off-site href.
	const page = "\n" +
		`<a href="a1"></a><a href="https://www.google.com/a2"></a><a href="a3"></a><a href="a4"></a>` +
		"\n"
	root, err := html.Parse(strings.NewReader(page))
	must(err)

	all := []string{
		"https://go.dev/a1",
		"https://www.google.com/a2",
		"https://go.dev/a3",
		"https://go.dev/a4",
	}

	// Full pass: collect everything links produces.
	var got []string
	for link := range links(testutil.Slogger(t), base, root) {
		got = append(got, link.String())
	}
	if !slices.Equal(got, all) {
		t.Errorf("links:\nhave %q\nwant %q", got, all)
	}

	// Partial pass: leave the loop as soon as two links are collected.
	firstTwo := all[:2]
	got = got[:0]
	for link := range links(testutil.Slogger(t), base, root) {
		if got = append(got, link.String()); len(got) == 2 {
			break
		}
	}
	if !slices.Equal(got, firstTwo) {
		t.Errorf("links with early break:\nhave %q\nwant %q", got, firstTwo)
	}
}