Rename 'patterns' to 'follows'

This commit is contained in:
Brian Picciano 2024-01-04 21:31:32 +01:00
parent f012eeebbf
commit ff553fa8cb
3 changed files with 35 additions and 29 deletions

View File

@ -30,14 +30,14 @@ Any links which are dead will be output to stdout as YAML objects, each
containing the dead URL, the error encountered, and which pages link to it. containing the dead URL, the error encountered, and which pages link to it.
In order to recursively crawl through links you can give one or more regex In order to recursively crawl through links you can give one or more regex
patterns. Any URL which matches a pattern will have its links checked as well patterns. Any URL which matches a pattern will have its links followed and
(and if any of those link URLs match a pattern their links will be checked, and checked as well (and if any of those linked URLs match a pattern their links
so on): will be checked, and so on):
``` ```
deadlinks \ deadlinks \
-url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \ -url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \
-pattern='://mediocregopher.com' -follow='://mediocregopher.com'
``` ```
There are further options available which affect the utility's behavior, see There are further options available which affect the utility's behavior, see

View File

@ -33,7 +33,7 @@ func main() {
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used") storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path") maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flagStrings("url", "URL which is always checked. Must be given at least once") urls = flagStrings("url", "URL which is always checked. Must be given at least once")
patterns = flagStrings("pattern", "URLs matching this regex will have their links checked as well. Can be specified multiple times") follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time") concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests") httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
) )
@ -56,13 +56,13 @@ func main() {
ctx, ctx,
store, store,
*urls.strs, *urls.strs,
*patterns.strs,
&deadlinks.Opts{ &deadlinks.Opts{
NewClient: func() deadlinks.Client { NewClient: func() deadlinks.Client {
return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{ return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{
HTTPUserAgent: *httpUserAgent, HTTPUserAgent: *httpUserAgent,
})} })}
}, },
FollowRegexps: *follows.strs,
Concurrency: *concurrency, Concurrency: *concurrency,
OnError: func(err error) { OnError: func(err error) {
log.Printf("runtime error: %v", err) log.Printf("runtime error: %v", err)

View File

@ -24,9 +24,11 @@
// defer store.Close() // defer store.Close()
// //
// pinnedURLs := []string{"https://some.website.com"} // pinnedURLs := []string{"https://some.website.com"}
// patterns := []string{"website.com"} // followRegexps := []string{"website.com"}
// //
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil) // dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
// FollowRegexps: followRegexps,
// })
// //
// `Update` is then used to crawl all links, starting with `pinnedURLs`: // `Update` is then used to crawl all links, starting with `pinnedURLs`:
// //
@ -56,7 +58,7 @@
// }) // })
// defer store.Close() // defer store.Close()
// //
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil) // dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
// //
// # Further Customization // # Further Customization
// //
@ -83,6 +85,10 @@ type Opts struct {
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }` NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
Parser Parser // Defaults to `NewParser()` Parser Parser // Defaults to `NewParser()`
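// If a URL matches any of these regexps then the links found within it will
// be followed and checked for liveness as well. A regexp matches if it
// matches any part of the URL string. Every entry must be a valid regular
// expression; New returns an error if one fails to compile.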
FollowRegexps []string
// Concurrency determines the maximum number of URLs which can be checked // Concurrency determines the maximum number of URLs which can be checked
// simultaneously. // simultaneously.
// //
@ -133,30 +139,30 @@ func (o *Opts) withDefaults() *Opts {
type DeadLinks struct { type DeadLinks struct {
opts Opts opts Opts
store Store store Store
patterns []*regexp.Regexp follows []*regexp.Regexp
clients []Client clients []Client
} }
// New initializes and returns a DeadLinks instance which will track the // New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, as well as all URLs linked to from // liveness of the given set of pinned URLs, and potentially URLs linked to from
// those. If a linked URL matches one of the given regexp patterns then any // those.
// URLs linked to from it will be tracked as well.
// //
// If a non-empty Store is passed to New then whatever set of previously pinned // If a previously used Store is passed to New then whatever set of previously
// URLs were present will be overwritten with the given ones. // pinned URLs were present will be overwritten with the given ones.
func New( func New(
ctx context.Context, ctx context.Context,
store Store, store Store,
pinnedURLStrs, pinnedURLStrs []string,
patternStrs []string,
opts *Opts, opts *Opts,
) ( ) (
*DeadLinks, error, *DeadLinks, error,
) { ) {
opts = opts.withDefaults()
var ( var (
err error err error
pinnedURLs = make([]URL, len(pinnedURLStrs)) pinnedURLs = make([]URL, len(pinnedURLStrs))
patterns = make([]*regexp.Regexp, len(patternStrs)) follows = make([]*regexp.Regexp, len(opts.FollowRegexps))
) )
for i, u := range pinnedURLStrs { for i, u := range pinnedURLStrs {
@ -165,16 +171,16 @@ func New(
} }
} }
for i, p := range patternStrs { for i, p := range opts.FollowRegexps {
if patterns[i], err = regexp.Compile(p); err != nil { if follows[i], err = regexp.Compile(p); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", p, err) return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
} }
} }
d := &DeadLinks{ d := &DeadLinks{
opts: *opts.withDefaults(), opts: *opts,
store: store, store: store,
patterns: patterns, follows: follows,
} }
d.clients = make([]Client, d.opts.Concurrency) d.clients = make([]Client, d.opts.Concurrency)
@ -200,8 +206,8 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
func (d *DeadLinks) shouldFollowURL(url URL) bool { func (d *DeadLinks) shouldFollowURL(url URL) bool {
urlStr := string(url) urlStr := string(url)
for _, pattern := range d.patterns { for _, follow := range d.follows {
if pattern.MatchString(urlStr) { if follow.MatchString(urlStr) {
return true return true
} }
} }