|
|
|
@ -24,9 +24,11 @@ |
|
|
|
|
// defer store.Close()
|
|
|
|
|
//
|
|
|
|
|
// pinnedURLs := []string{"https://some.website.com"}
|
|
|
|
|
// patterns := []string{"website.com"}
|
|
|
|
|
// followRegexps := []string{"website.com"}
|
|
|
|
|
//
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
|
|
|
|
|
// FollowRegexps: followRegexps,
|
|
|
|
|
// })
|
|
|
|
|
//
|
|
|
|
|
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
|
|
|
|
//
|
|
|
|
@ -56,7 +58,7 @@ |
|
|
|
|
// })
|
|
|
|
|
// defer store.Close()
|
|
|
|
|
//
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
|
|
|
|
|
//
|
|
|
|
|
// # Further Customization
|
|
|
|
|
//
|
|
|
|
@ -83,6 +85,13 @@ type Opts struct { |
|
|
|
|
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
|
|
|
|
Parser Parser // Defaults to `NewParser()`
|
|
|
|
|
|
|
|
|
|
// If a URL matches any of these regexps then any links found within it will
|
|
|
|
|
// be followed and checked for liveness themselves.
|
|
|
|
|
FollowRegexps []string |
|
|
|
|
|
|
|
|
|
// If a URL matches any of these regexps then it will not be checked at all.
|
|
|
|
|
IgnoreRegexps []string |
|
|
|
|
|
|
|
|
|
// Concurrency determines the maximum number of URLs which can be checked
|
|
|
|
|
// simultaneously.
|
|
|
|
|
//
|
|
|
|
@ -131,32 +140,44 @@ func (o *Opts) withDefaults() *Opts { |
|
|
|
|
// out-of-the-box, and will traverse between them as necessary based on URL
|
|
|
|
|
// schemes. See the `NewClient` and `NewParser` functions for more details.
|
|
|
|
|
type DeadLinks struct { |
|
|
|
|
opts Opts |
|
|
|
|
store Store |
|
|
|
|
patterns []*regexp.Regexp |
|
|
|
|
clients []Client |
|
|
|
|
opts Opts |
|
|
|
|
store Store |
|
|
|
|
follows, ignores []*regexp.Regexp |
|
|
|
|
clients []Client |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// compileRegexps compiles each string in strs into a regular expression,
// returning the compiled regexps in the same order as the given strings.
//
// Compilation stops at the first string which is not a valid regexp. In that
// case a nil slice is returned, along with an error which names the offending
// string and wraps the error returned by regexp.Compile.
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
	rr := make([]*regexp.Regexp, len(strs))
	for i, str := range strs {
		r, err := regexp.Compile(str)
		if err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
		}
		rr[i] = r
	}
	return rr, nil
}
|
|
|
|
|
|
|
|
|
// New initializes and returns a DeadLinks instance which will track the
|
|
|
|
|
// liveness of the given set of pinned URLs, as well as all URLs linked to from
|
|
|
|
|
// those. If a linked URL matches one of the given regexp patterns then any
|
|
|
|
|
// URLs linked to from it will be tracked as well.
|
|
|
|
|
// liveness of the given set of pinned URLs, and potentially URLs linked to from
|
|
|
|
|
// those.
|
|
|
|
|
//
|
|
|
|
|
// If a non-empty Store is passed to New then whatever set of previously pinned
|
|
|
|
|
// URLs were present will be overwritten with the given ones.
|
|
|
|
|
// If a previously used Store is passed to New then whatever set of previously
|
|
|
|
|
// pinned URLs were present will be overwritten with the given ones.
|
|
|
|
|
func New( |
|
|
|
|
ctx context.Context, |
|
|
|
|
store Store, |
|
|
|
|
pinnedURLStrs, |
|
|
|
|
patternStrs []string, |
|
|
|
|
pinnedURLStrs []string, |
|
|
|
|
opts *Opts, |
|
|
|
|
) ( |
|
|
|
|
*DeadLinks, error, |
|
|
|
|
) { |
|
|
|
|
opts = opts.withDefaults() |
|
|
|
|
|
|
|
|
|
var ( |
|
|
|
|
err error |
|
|
|
|
pinnedURLs = make([]URL, len(pinnedURLStrs)) |
|
|
|
|
patterns = make([]*regexp.Regexp, len(patternStrs)) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
for i, u := range pinnedURLStrs { |
|
|
|
@ -165,16 +186,21 @@ func New( |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for i, p := range patternStrs { |
|
|
|
|
if patterns[i], err = regexp.Compile(p); err != nil { |
|
|
|
|
return nil, fmt.Errorf("compiling regexp %q: %w", p, err) |
|
|
|
|
} |
|
|
|
|
follows, err := compileRegexps(opts.FollowRegexps) |
|
|
|
|
if err != nil { |
|
|
|
|
return nil, fmt.Errorf("compiling follows: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ignores, err := compileRegexps(opts.IgnoreRegexps) |
|
|
|
|
if err != nil { |
|
|
|
|
return nil, fmt.Errorf("compiling ignores: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
d := &DeadLinks{ |
|
|
|
|
opts: *opts.withDefaults(), |
|
|
|
|
store: store, |
|
|
|
|
patterns: patterns, |
|
|
|
|
opts: *opts, |
|
|
|
|
store: store, |
|
|
|
|
follows: follows, |
|
|
|
|
ignores: ignores, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
d.clients = make([]Client, d.opts.Concurrency) |
|
|
|
@ -198,10 +224,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) { |
|
|
|
|
d.opts.OnError(err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (d *DeadLinks) shouldFollowURL(url URL) bool { |
|
|
|
|
func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool { |
|
|
|
|
urlStr := string(url) |
|
|
|
|
for _, pattern := range d.patterns { |
|
|
|
|
if pattern.MatchString(urlStr) { |
|
|
|
|
for _, r := range rr { |
|
|
|
|
if r.MatchString(urlStr) { |
|
|
|
|
return true |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -213,6 +239,10 @@ func (d *DeadLinks) getURL( |
|
|
|
|
) ( |
|
|
|
|
[]URL, error, |
|
|
|
|
) { |
|
|
|
|
if matchesAnyRegexp(url, d.ignores) { |
|
|
|
|
return nil, nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) |
|
|
|
|
defer cancel() |
|
|
|
|
|
|
|
|
@ -227,7 +257,7 @@ func (d *DeadLinks) getURL( |
|
|
|
|
mimeType = mimeType[:i] |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if !d.shouldFollowURL(url) { |
|
|
|
|
if !matchesAnyRegexp(url, d.follows) { |
|
|
|
|
return nil, nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|