diff --git a/cmd/deadlinks/main.go b/cmd/deadlinks/main.go index ecd8e68..e3914b6 100644 --- a/cmd/deadlinks/main.go +++ b/cmd/deadlinks/main.go @@ -34,6 +34,7 @@ func main() { maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path") urls = flagStrings("url", "URL which is always checked. Must be given at least once") follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times") + ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all") concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time") httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests") ) @@ -63,6 +64,7 @@ func main() { })} }, FollowRegexps: *follows.strs, + IgnoreRegexps: *ignores.strs, Concurrency: *concurrency, OnError: func(err error) { log.Printf("runtime error: %v", err) diff --git a/deadlinks.go b/deadlinks.go index 281ad3f..880ea6e 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -89,6 +89,9 @@ type Opts struct { // be followed and checked for liveness themselves. FollowRegexps []string + // If a URL matches any of these regexps then it will not be checked at all. + IgnoreRegexps []string + // Concurrency determines the maximum number of URLs which can be checked // simultaneously. // @@ -137,10 +140,23 @@ func (o *Opts) withDefaults() *Opts { // out-of-the-box, and will traverse between them as necessary based on URL // schemas. See the `NewClient` and `NewParser` functions for more details. type DeadLinks struct { - opts Opts - store Store - follows []*regexp.Regexp - clients []Client + opts Opts + store Store + follows, ignores []*regexp.Regexp + clients []Client +} + +func compileRegexps(strs []string) ([]*regexp.Regexp, error) { + var ( + rr = make([]*regexp.Regexp, len(strs)) + err error + ) + for i, str := range strs { + if rr[i], err = regexp.Compile(str); err != nil { + return nil, fmt.Errorf("compiling regexp %q: %w", str, err) + } + } + return rr, nil } // New initializes and returns a DeadLinks instance which will track the @@ -162,7 +178,6 @@ func New( var ( err error pinnedURLs = make([]URL, len(pinnedURLStrs)) - follows = make([]*regexp.Regexp, len(opts.FollowRegexps)) ) for i, u := range pinnedURLStrs { @@ -171,16 +186,21 @@ func New( } } - for i, p := range opts.FollowRegexps { - if follows[i], err = regexp.Compile(p); err != nil { - return nil, fmt.Errorf("compiling regexp %q: %w", p, err) - } + follows, err := compileRegexps(opts.FollowRegexps) + if err != nil { + return nil, fmt.Errorf("compiling follows: %w", err) + } + + ignores, err := compileRegexps(opts.IgnoreRegexps) + if err != nil { + return nil, fmt.Errorf("compiling ignores: %w", err) } d := &DeadLinks{ opts: *opts, store: store, follows: follows, + ignores: ignores, } d.clients = make([]Client, d.opts.Concurrency) @@ -204,10 +224,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) { d.opts.OnError(err) } -func (d *DeadLinks) shouldFollowURL(url URL) bool { +func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool { urlStr := string(url) - for _, follow := range d.follows { - if follow.MatchString(urlStr) { + for _, r := range rr { + if r.MatchString(urlStr) { return true } } @@ -219,6 +239,10 @@ func (d *DeadLinks) getURL( ) ( []URL, error, ) { + if matchesAnyRegexp(url, d.ignores) { + return nil, nil + } + ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) defer cancel() @@ -233,7 +257,7 @@ func (d *DeadLinks) getURL( mimeType = mimeType[:i] } - if !d.shouldFollowURL(url) { + if !matchesAnyRegexp(url, d.follows) { return nil, nil }