Compare commits

..

2 Commits

Author SHA1 Message Date
3620eb9d0b Add ability to ignore URLs 2024-01-04 21:40:12 +01:00
ff553fa8cb Rename 'patterns' to 'follows' 2024-01-04 21:31:32 +01:00
3 changed files with 65 additions and 33 deletions

View File

@ -30,14 +30,14 @@ Any links which are dead will be output to stdout as YAML objects, each
containing the dead URL, the error encountered, and which pages link to it.
In order to recursively crawl through links you can give one or more regex
patterns. Any URL which matches a pattern will have its links checked as well
(and if any of those link URLs match a pattern their links will be checked, and
so on):
patterns. Any URL which matches a pattern will have its links followed and
checked as well (and if any of those linked URLs match a pattern their links
will be checked, and so on):
```
deadlinks \
-url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \
-pattern='://mediocregopher.com'
-follow='://mediocregopher.com'
```
There are further options available which affect the utility's behavior, see

View File

@ -33,7 +33,8 @@ func main() {
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flagStrings("url", "URL which is always checked. Must be given at least once")
patterns = flagStrings("pattern", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
)
@ -56,14 +57,15 @@ func main() {
ctx,
store,
*urls.strs,
*patterns.strs,
&deadlinks.Opts{
NewClient: func() deadlinks.Client {
return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{
HTTPUserAgent: *httpUserAgent,
})}
},
Concurrency: *concurrency,
FollowRegexps: *follows.strs,
IgnoreRegexps: *ignores.strs,
Concurrency: *concurrency,
OnError: func(err error) {
log.Printf("runtime error: %v", err)
},

View File

@ -24,9 +24,11 @@
// defer store.Close()
//
// pinnedURLs := []string{"https://some.website.com"}
// patterns := []string{"website.com"}
// followRegexps := []string{"website.com"}
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
// dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
// FollowRegexps: followRegexps,
// })
//
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
//
@ -56,7 +58,7 @@
// })
// defer store.Close()
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
// dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
//
// # Further Customization
//
@ -83,6 +85,13 @@ type Opts struct {
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
Parser Parser // Defaults to `NewParser()`
// If a URL matches any of these regexps then any links found within it will
// be followed and checked for liveness themselves.
FollowRegexps []string
// If a URL matches any of these regexps then it will not be checked at all.
IgnoreRegexps []string
// Concurrency determines the maximum number of URLs which can be checked
// simultaneously.
//
@ -131,32 +140,44 @@ func (o *Opts) withDefaults() *Opts {
// out-of-the-box, and will traverse between them as necessary based on URL
// schemes. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
opts Opts
store Store
patterns []*regexp.Regexp
clients []Client
opts Opts
store Store
follows, ignores []*regexp.Regexp
clients []Client
}
// compileRegexps compiles every expression in strs and returns the compiled
// regexps in the same order as their source strings.
//
// Compilation stops at the first expression which fails to compile; in that
// case no regexps are returned and the error identifies the offending
// expression while wrapping the underlying compile error.
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
	compiled := make([]*regexp.Regexp, 0, len(strs))
	for _, expr := range strs {
		re, err := regexp.Compile(expr)
		if err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", expr, err)
		}
		compiled = append(compiled, re)
	}
	return compiled, nil
}
// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, as well as all URLs linked to from
// those. If a linked URL matches one of the given regexp patterns then any
// URLs linked to from it will be tracked as well.
// liveness of the given set of pinned URLs, and potentially URLs linked to from
// those.
//
// If a non-empty Store is passed to New then whatever set of previously pinned
// URLs were present will be overwritten with the given ones.
// If a previously used Store is passed to New then whatever set of previously
// pinned URLs were present will be overwritten with the given ones.
func New(
ctx context.Context,
store Store,
pinnedURLStrs,
patternStrs []string,
pinnedURLStrs []string,
opts *Opts,
) (
*DeadLinks, error,
) {
opts = opts.withDefaults()
var (
err error
pinnedURLs = make([]URL, len(pinnedURLStrs))
patterns = make([]*regexp.Regexp, len(patternStrs))
)
for i, u := range pinnedURLStrs {
@ -165,16 +186,21 @@ func New(
}
}
for i, p := range patternStrs {
if patterns[i], err = regexp.Compile(p); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
}
follows, err := compileRegexps(opts.FollowRegexps)
if err != nil {
return nil, fmt.Errorf("compiling follows: %w", err)
}
ignores, err := compileRegexps(opts.IgnoreRegexps)
if err != nil {
return nil, fmt.Errorf("compiling ignores: %w", err)
}
d := &DeadLinks{
opts: *opts.withDefaults(),
store: store,
patterns: patterns,
opts: *opts,
store: store,
follows: follows,
ignores: ignores,
}
d.clients = make([]Client, d.opts.Concurrency)
@ -198,10 +224,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
d.opts.OnError(err)
}
func (d *DeadLinks) shouldFollowURL(url URL) bool {
func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
urlStr := string(url)
for _, pattern := range d.patterns {
if pattern.MatchString(urlStr) {
for _, r := range rr {
if r.MatchString(urlStr) {
return true
}
}
@ -213,6 +239,10 @@ func (d *DeadLinks) getURL(
) (
[]URL, error,
) {
if matchesAnyRegexp(url, d.ignores) {
return nil, nil
}
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel()
@ -227,7 +257,7 @@ func (d *DeadLinks) getURL(
mimeType = mimeType[:i]
}
if !d.shouldFollowURL(url) {
if !matchesAnyRegexp(url, d.follows) {
return nil, nil
}