Compare commits

..

No commits in common. "3620eb9d0ba495406d46120c7d72d4e1132f99b9" and "f012eeebbffd75f7d0ea49089110936b65a42714" have entirely different histories.

3 changed files with 33 additions and 65 deletions

View File

@ -30,14 +30,14 @@ Any links which are dead will be output to stdout as YAML objects, each
containing the dead URL, the error encountered, and which pages link to it. containing the dead URL, the error encountered, and which pages link to it.
In order to recursively crawl through links you can give one or more regex In order to recursively crawl through links you can give one or more regex
patterns. Any URL which matches a pattern will have its links followed and patterns. Any URL which matches a pattern will have its links checked as well
checked as well (and if any of those linked URLs match a pattern their links (and if any of those linked URLs match a pattern their links will be checked, and
will be checked, and so on): so on):
``` ```
deadlinks \ deadlinks \
-url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \ -url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \
-follow='://mediocregopher.com' -pattern='://mediocregopher.com'
``` ```
There are further options available which affect the utility's behavior, see There are further options available which affect the utility's behavior, see

View File

@ -33,8 +33,7 @@ func main() {
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used") storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path") maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flagStrings("url", "URL which is always checked. Must be given at least once") urls = flagStrings("url", "URL which is always checked. Must be given at least once")
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times") patterns = flagStrings("pattern", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time") concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests") httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
) )
@ -57,15 +56,14 @@ func main() {
ctx, ctx,
store, store,
*urls.strs, *urls.strs,
*patterns.strs,
&deadlinks.Opts{ &deadlinks.Opts{
NewClient: func() deadlinks.Client { NewClient: func() deadlinks.Client {
return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{ return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{
HTTPUserAgent: *httpUserAgent, HTTPUserAgent: *httpUserAgent,
})} })}
}, },
FollowRegexps: *follows.strs, Concurrency: *concurrency,
IgnoreRegexps: *ignores.strs,
Concurrency: *concurrency,
OnError: func(err error) { OnError: func(err error) {
log.Printf("runtime error: %v", err) log.Printf("runtime error: %v", err)
}, },

View File

@ -24,11 +24,9 @@
// defer store.Close() // defer store.Close()
// //
// pinnedURLs := []string{"https://some.website.com"} // pinnedURLs := []string{"https://some.website.com"}
// followRegexps := []string{"website.com"} // patterns := []string{"website.com"}
// //
// dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{ // dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
// FollowRegexps: followRegexps,
// })
// //
// `Update` is then used to crawl all links, starting with `pinnedURLs`: // `Update` is then used to crawl all links, starting with `pinnedURLs`:
// //
@ -58,7 +56,7 @@
// }) // })
// defer store.Close() // defer store.Close()
// //
// dl, err := deadlinks.New(ctx, store, pinnedURLs, nil) // dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
// //
// # Further Customization // # Further Customization
// //
@ -85,13 +83,6 @@ type Opts struct {
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }` NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
Parser Parser // Defaults to `NewParser()` Parser Parser // Defaults to `NewParser()`
// If a URL matches any of these regexps then any links found within it will
// be followed and checked for liveness themselves.
FollowRegexps []string
// If a URL matches any of these regexps then it will not be checked at all.
IgnoreRegexps []string
// Concurrency determines the maximum number of URLs which can be checked // Concurrency determines the maximum number of URLs which can be checked
// simultaneously. // simultaneously.
// //
@ -140,44 +131,32 @@ func (o *Opts) withDefaults() *Opts {
// out-of-the-box, and will traverse between them as necessary based on URL // out-of-the-box, and will traverse between them as necessary based on URL
// schemas. See the `NewClient` and `NewParser` functions for more details. // schemas. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct { type DeadLinks struct {
opts Opts opts Opts
store Store store Store
follows, ignores []*regexp.Regexp patterns []*regexp.Regexp
clients []Client clients []Client
}
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
var (
rr = make([]*regexp.Regexp, len(strs))
err error
)
for i, str := range strs {
if rr[i], err = regexp.Compile(str); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
}
}
return rr, nil
} }
// New initializes and returns a DeadLinks instance which will track the // New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, and potentially URLs linked to from // liveness of the given set of pinned URLs, as well as all URLs linked to from
// those. // those. If a linked URL matches one of the given regexp patterns then any
// URLs linked to from it will be tracked as well.
// //
// If a previously used Store is passed to New then whatever set of previously // If a non-empty Store is passed to New then whatever set of previously pinned
// pinned URLs were present will be overwritten with the given ones. // URLs were present will be overwritten with the given ones.
func New( func New(
ctx context.Context, ctx context.Context,
store Store, store Store,
pinnedURLStrs []string, pinnedURLStrs,
patternStrs []string,
opts *Opts, opts *Opts,
) ( ) (
*DeadLinks, error, *DeadLinks, error,
) { ) {
opts = opts.withDefaults()
var ( var (
err error err error
pinnedURLs = make([]URL, len(pinnedURLStrs)) pinnedURLs = make([]URL, len(pinnedURLStrs))
patterns = make([]*regexp.Regexp, len(patternStrs))
) )
for i, u := range pinnedURLStrs { for i, u := range pinnedURLStrs {
@ -186,21 +165,16 @@ func New(
} }
} }
follows, err := compileRegexps(opts.FollowRegexps) for i, p := range patternStrs {
if err != nil { if patterns[i], err = regexp.Compile(p); err != nil {
return nil, fmt.Errorf("compiling follows: %w", err) return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
} }
ignores, err := compileRegexps(opts.IgnoreRegexps)
if err != nil {
return nil, fmt.Errorf("compiling ignores: %w", err)
} }
d := &DeadLinks{ d := &DeadLinks{
opts: *opts, opts: *opts.withDefaults(),
store: store, store: store,
follows: follows, patterns: patterns,
ignores: ignores,
} }
d.clients = make([]Client, d.opts.Concurrency) d.clients = make([]Client, d.opts.Concurrency)
@ -224,10 +198,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
d.opts.OnError(err) d.opts.OnError(err)
} }
func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool { func (d *DeadLinks) shouldFollowURL(url URL) bool {
urlStr := string(url) urlStr := string(url)
for _, r := range rr { for _, pattern := range d.patterns {
if r.MatchString(urlStr) { if pattern.MatchString(urlStr) {
return true return true
} }
} }
@ -239,10 +213,6 @@ func (d *DeadLinks) getURL(
) ( ) (
[]URL, error, []URL, error,
) { ) {
if matchesAnyRegexp(url, d.ignores) {
return nil, nil
}
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel() defer cancel()
@ -257,7 +227,7 @@ func (d *DeadLinks) getURL(
mimeType = mimeType[:i] mimeType = mimeType[:i]
} }
if !matchesAnyRegexp(url, d.follows) { if !d.shouldFollowURL(url) {
return nil, nil return nil, nil
} }