Compare commits

..

No commits in common. "3620eb9d0ba495406d46120c7d72d4e1132f99b9" and "f012eeebbffd75f7d0ea49089110936b65a42714" have entirely different histories.

3 changed files with 33 additions and 65 deletions

View File

@ -30,14 +30,14 @@ Any links which are dead will be output to stdout as YAML objects, each
containing the dead URL, the error encountered, and which pages link to it.
In order to recursively crawl through links you can give one or more regex
patterns. Any URL which matches a pattern will have its links followed and
checked as well (and if any of those linked URLs match a pattern their links
will be checked, and so on):
patterns. Any URL which matches a pattern will have its links checked as well
(and if any of those linked URLs match a pattern their links will be checked, and
so on):
```
deadlinks \
-url='https://mediocregopher.com' -url='gemini://mediocregopher.com' \
-follow='://mediocregopher.com'
-pattern='://mediocregopher.com'
```
There are further options available which affect the utility's behavior, see

View File

@ -33,8 +33,7 @@ func main() {
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flagStrings("url", "URL which is always checked. Must be given at least once")
follows = flagStrings("follow", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
ignores = flagStrings("ignore", "URLs matching this regex will not be checked at all")
patterns = flagStrings("pattern", "URLs matching this regex will have their links checked as well. Can be specified multiple times")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
httpUserAgent = flag.String("http-user-agent", "", "User-agent to use for http requests")
)
@ -57,14 +56,13 @@ func main() {
ctx,
store,
*urls.strs,
*patterns.strs,
&deadlinks.Opts{
NewClient: func() deadlinks.Client {
return loggingClient{deadlinks.NewClient(&deadlinks.ClientOpts{
HTTPUserAgent: *httpUserAgent,
})}
},
FollowRegexps: *follows.strs,
IgnoreRegexps: *ignores.strs,
Concurrency: *concurrency,
OnError: func(err error) {
log.Printf("runtime error: %v", err)

View File

@ -24,11 +24,9 @@
// defer store.Close()
//
// pinnedURLs := []string{"https://some.website.com"}
// followRegexps := []string{"website.com"}
// patterns := []string{"website.com"}
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
// FollowRegexps: followRegexps,
// })
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
//
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
//
@ -58,7 +56,7 @@
// })
// defer store.Close()
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
//
// # Further Customization
//
@ -85,13 +83,6 @@ type Opts struct {
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
Parser Parser // Defaults to `NewParser()`
// If a URL matches any of these regexps then any links found within it will
// be followed and checked for liveness themselves.
FollowRegexps []string
// If a URL matches any of these regexps then it will not be checked at all.
IgnoreRegexps []string
// Concurrency determines the maximum number of URLs which can be checked
// simultaneously.
//
@ -142,42 +133,30 @@ func (o *Opts) withDefaults() *Opts {
type DeadLinks struct {
opts Opts
store Store
follows, ignores []*regexp.Regexp
patterns []*regexp.Regexp
clients []Client
}
// compileRegexps compiles every pattern string in strs into a
// *regexp.Regexp. If any pattern fails to compile, an error naming the
// offending pattern is returned and no regexps are returned.
func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
	compiled := make([]*regexp.Regexp, 0, len(strs))
	for _, str := range strs {
		r, err := regexp.Compile(str)
		if err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
		}
		compiled = append(compiled, r)
	}
	return compiled, nil
}
// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, and potentially URLs linked to from
// those.
// liveness of the given set of pinned URLs, as well as all URLs linked to from
// those. If a linked URL matches one of the given regexp patterns then any
// URLs linked to from it will be tracked as well.
//
// If a previously used Store is passed to New then whatever set of previously
// pinned URLs were present will be overwritten with the given ones.
// If a non-empty Store is passed to New then whatever set of previously pinned
// URLs were present will be overwritten with the given ones.
func New(
ctx context.Context,
store Store,
pinnedURLStrs []string,
pinnedURLStrs,
patternStrs []string,
opts *Opts,
) (
*DeadLinks, error,
) {
opts = opts.withDefaults()
var (
err error
pinnedURLs = make([]URL, len(pinnedURLStrs))
patterns = make([]*regexp.Regexp, len(patternStrs))
)
for i, u := range pinnedURLStrs {
@ -186,21 +165,16 @@ func New(
}
}
follows, err := compileRegexps(opts.FollowRegexps)
if err != nil {
return nil, fmt.Errorf("compiling follows: %w", err)
for i, p := range patternStrs {
if patterns[i], err = regexp.Compile(p); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
}
ignores, err := compileRegexps(opts.IgnoreRegexps)
if err != nil {
return nil, fmt.Errorf("compiling ignores: %w", err)
}
d := &DeadLinks{
opts: *opts,
opts: *opts.withDefaults(),
store: store,
follows: follows,
ignores: ignores,
patterns: patterns,
}
d.clients = make([]Client, d.opts.Concurrency)
@ -224,10 +198,10 @@ func (d *DeadLinks) onError(ctx context.Context, err error) {
d.opts.OnError(err)
}
func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
func (d *DeadLinks) shouldFollowURL(url URL) bool {
urlStr := string(url)
for _, r := range rr {
if r.MatchString(urlStr) {
for _, pattern := range d.patterns {
if pattern.MatchString(urlStr) {
return true
}
}
@ -239,10 +213,6 @@ func (d *DeadLinks) getURL(
) (
[]URL, error,
) {
if matchesAnyRegexp(url, d.ignores) {
return nil, nil
}
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel()
@ -257,7 +227,7 @@ func (d *DeadLinks) getURL(
mimeType = mimeType[:i]
}
if !matchesAnyRegexp(url, d.follows) {
if !d.shouldFollowURL(url) {
return nil, nil
}