Add ability to ignore URLs
This commit is contained in:
parent
ff553fa8cb
commit
3620eb9d0b
@ -34,6 +34,7 @@ func main() {
|
|||||||
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
|
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
|
||||||
urls = flagStrings("url", "URL which is always checked. Must be given at least once")
|
urls = flagStrings("url", "URL which is always checked. Must be given at least once")
|
||||||
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
|
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
|
||||||
|
ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all")
|
||||||
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
|
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
|
||||||
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
|
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
|
||||||
)
|
)
|
||||||
@ -63,6 +64,7 @@ func main() {
|
|||||||
})}
|
})}
|
||||||
},
|
},
|
||||||
FollowRegexps: *follows.strs,
|
FollowRegexps: *follows.strs,
|
||||||
|
IgnoreRegexps: *ignores.strs,
|
||||||
Concurrency: *concurrency,
|
Concurrency: *concurrency,
|
||||||
OnError: func(err error) {
|
OnError: func(err error) {
|
||||||
log.Printf("runtime error: %v", err)
|
log.Printf("runtime error: %v", err)
|
||||||
|
50
deadlinks.go
50
deadlinks.go
@ -89,6 +89,9 @@ type Opts struct {
|
|||||||
// be followed and checked for liveness themselves.
|
// be followed and checked for liveness themselves.
|
||||||
FollowRegexps []string
|
FollowRegexps []string
|
||||||
|
|
||||||
|
// If a URL matches any of these regexps then it will not be checked at all.
|
||||||
|
IgnoreRegexps []string
|
||||||
|
|
||||||
// Concurrency determines the maximum number of URLs which can be checked
|
// Concurrency determines the maximum number of URLs which can be checked
|
||||||
// simultaneously.
|
// simultaneously.
|
||||||
//
|
//
|
||||||
@ -137,10 +140,23 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
// out-of-the-box, and will traverse between them as necessary based on URL
|
// out-of-the-box, and will traverse between them as necessary based on URL
|
||||||
// schemas. See the `NewClient` and `NewParser` functions for more details.
|
// schemas. See the `NewClient` and `NewParser` functions for more details.
|
||||||
type DeadLinks struct {
|
type DeadLinks struct {
|
||||||
opts Opts
|
opts Opts
|
||||||
store Store
|
store Store
|
||||||
follows []*regexp.Regexp
|
follows, ignores []*regexp.Regexp
|
||||||
clients []Client
|
clients []Client
|
||||||
|
}
|
||||||
|
|
||||||
|
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
|
||||||
|
var (
|
||||||
|
rr = make([]*regexp.Regexp, len(strs))
|
||||||
|
err error
|
||||||
|
)
|
||||||
|
for i, str := range strs {
|
||||||
|
if rr[i], err = regexp.Compile(str); err != nil {
|
||||||
|
return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rr, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// New initializes and returns a DeadLinks instance which will track the
|
// New initializes and returns a DeadLinks instance which will track the
|
||||||
@ -162,7 +178,6 @@ func New(
|
|||||||
var (
|
var (
|
||||||
err error
|
err error
|
||||||
pinnedURLs = make([]URL, len(pinnedURLStrs))
|
pinnedURLs = make([]URL, len(pinnedURLStrs))
|
||||||
follows = make([]*regexp.Regexp, len(opts.FollowRegexps))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for i, u := range pinnedURLStrs {
|
for i, u := range pinnedURLStrs {
|
||||||
@ -171,16 +186,21 @@ func New(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, p := range opts.FollowRegexps {
|
follows, err := compileRegexps(opts.FollowRegexps)
|
||||||
if follows[i], err = regexp.Compile(p); err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
|
return nil, fmt.Errorf("compiling follows: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ignores, err := compileRegexps(opts.IgnoreRegexps)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("compiling ignores: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
d := &DeadLinks{
|
d := &DeadLinks{
|
||||||
opts: *opts,
|
opts: *opts,
|
||||||
store: store,
|
store: store,
|
||||||
follows: follows,
|
follows: follows,
|
||||||
|
ignores: ignores,
|
||||||
}
|
}
|
||||||
|
|
||||||
d.clients = make([]Client, d.opts.Concurrency)
|
d.clients = make([]Client, d.opts.Concurrency)
|
||||||
@ -204,10 +224,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
|
|||||||
d.opts.OnError(err)
|
d.opts.OnError(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
|
||||||
urlStr := string(url)
|
urlStr := string(url)
|
||||||
for _, follow := range d.follows {
|
for _, r := range rr {
|
||||||
if follow.MatchString(urlStr) {
|
if r.MatchString(urlStr) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -219,6 +239,10 @@ func (d *DeadLinks) getURL(
|
|||||||
) (
|
) (
|
||||||
[]URL, error,
|
[]URL, error,
|
||||||
) {
|
) {
|
||||||
|
if matchesAnyRegexp(url, d.ignores) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
@ -233,7 +257,7 @@ func (d *DeadLinks) getURL(
|
|||||||
mimeType = mimeType[:i]
|
mimeType = mimeType[:i]
|
||||||
}
|
}
|
||||||
|
|
||||||
if !d.shouldFollowURL(url) {
|
if !matchesAnyRegexp(url, d.follows) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user