Add ability to ignore URLs

This commit is contained in:
Brian Picciano 2024-01-04 21:40:12 +01:00
parent ff553fa8cb
commit 3620eb9d0b
2 changed files with 39 additions and 13 deletions

View File

@ -34,6 +34,7 @@ func main() {
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path") maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flagStrings("url", "URL which is always checked. Must be given at least once") urls = flagStrings("url", "URL which is always checked. Must be given at least once")
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times") follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time") concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests") httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
) )
@ -63,6 +64,7 @@ func main() {
})} })}
}, },
FollowRegexps: *follows.strs, FollowRegexps: *follows.strs,
IgnoreRegexps: *ignores.strs,
Concurrency: *concurrency, Concurrency: *concurrency,
OnError: func(err error) { OnError: func(err error) {
log.Printf("runtime error: %v", err) log.Printf("runtime error: %v", err)

View File

@ -89,6 +89,9 @@ type Opts struct {
// be followed and checked for liveness themselves. // be followed and checked for liveness themselves.
FollowRegexps []string FollowRegexps []string
// If a URL matches any of these regexps then it will not be checked at all.
IgnoreRegexps []string
// Concurrency determines the maximum number of URLs which can be checked // Concurrency determines the maximum number of URLs which can be checked
// simultaneously. // simultaneously.
// //
@ -137,10 +140,23 @@ func (o *Opts) withDefaults() *Opts {
// out-of-the-box, and will traverse between them as necessary based on URL // out-of-the-box, and will traverse between them as necessary based on URL
// schemas. See the `NewClient` and `NewParser` functions for more details. // schemas. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct { type DeadLinks struct {
opts Opts opts Opts
store Store store Store
follows []*regexp.Regexp follows, ignores []*regexp.Regexp
clients []Client clients []Client
}
// compileRegexps compiles each pattern in strs using regexp.Compile and returns
// the compiled regexps in the same order as the given patterns.
//
// If any pattern fails to compile then a nil slice is returned, along with an
// error which quotes the offending pattern and wraps the underlying compile
// error.
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
var (
rr = make([]*regexp.Regexp, len(strs))
err error
)
for i, str := range strs {
// Bail out on the first invalid pattern; no partial result is returned.
if rr[i], err = regexp.Compile(str); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
}
}
return rr, nil
} }
// New initializes and returns a DeadLinks instance which will track the // New initializes and returns a DeadLinks instance which will track the
@ -162,7 +178,6 @@ func New(
var ( var (
err error err error
pinnedURLs = make([]URL, len(pinnedURLStrs)) pinnedURLs = make([]URL, len(pinnedURLStrs))
follows = make([]*regexp.Regexp, len(opts.FollowRegexps))
) )
for i, u := range pinnedURLStrs { for i, u := range pinnedURLStrs {
@ -171,16 +186,21 @@ func New(
} }
} }
for i, p := range opts.FollowRegexps { follows, err := compileRegexps(opts.FollowRegexps)
if follows[i], err = regexp.Compile(p); err != nil { if err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", p, err) return nil, fmt.Errorf("compiling follows: %w", err)
} }
ignores, err := compileRegexps(opts.IgnoreRegexps)
if err != nil {
return nil, fmt.Errorf("compiling ignores: %w", err)
} }
d := &DeadLinks{ d := &DeadLinks{
opts: *opts, opts: *opts,
store: store, store: store,
follows: follows, follows: follows,
ignores: ignores,
} }
d.clients = make([]Client, d.opts.Concurrency) d.clients = make([]Client, d.opts.Concurrency)
@ -204,10 +224,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
d.opts.OnError(err) d.opts.OnError(err)
} }
func (d *DeadLinks) shouldFollowURL(url URL) bool { func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
urlStr := string(url) urlStr := string(url)
for _, follow := range d.follows { for _, r := range rr {
if follow.MatchString(urlStr) { if r.MatchString(urlStr) {
return true return true
} }
} }
@ -219,6 +239,10 @@ func (d *DeadLinks) getURL(
) ( ) (
[]URL, error, []URL, error,
) { ) {
if matchesAnyRegexp(url, d.ignores) {
return nil, nil
}
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel() defer cancel()
@ -233,7 +257,7 @@ func (d *DeadLinks) getURL(
mimeType = mimeType[:i] mimeType = mimeType[:i]
} }
if !d.shouldFollowURL(url) { if !matchesAnyRegexp(url, d.follows) {
return nil, nil return nil, nil
} }