|
|
|
@ -15,93 +15,229 @@ |
|
|
|
|
package deadlinks |
|
|
|
|
|
|
|
|
|
import ( |
|
|
|
|
"context" |
|
|
|
|
"errors" |
|
|
|
|
"fmt" |
|
|
|
|
"net/url" |
|
|
|
|
"regexp" |
|
|
|
|
"runtime" |
|
|
|
|
"sync" |
|
|
|
|
"time" |
|
|
|
|
|
|
|
|
|
"code.betamike.com/mediocregopher/mediocre-go-lib/miter" |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
// URL is a standard uniform resource identifier, normalized particularly for
|
|
|
|
|
// this package.
|
|
|
|
|
type URL string |
|
|
|
|
// Opts are optional fields which can be provided to New. A nil Opts is
|
|
|
|
|
// equivalent to an empty one.
|
|
|
|
|
type Opts struct { |
|
|
|
|
Client Client // Defaults to `NewClient(nil)`
|
|
|
|
|
Parser Parser // Defaults to `NewParser()`
|
|
|
|
|
|
|
|
|
|
// ParseURL parses and returns a URL based on the given string, or an error.
|
|
|
|
|
func ParseURL(urlStr string) (URL, error) { |
|
|
|
|
u, err := url.Parse(urlStr) |
|
|
|
|
if err != nil { |
|
|
|
|
return "", err |
|
|
|
|
// Concurrency determines the maximum number of URLs which can be checked
|
|
|
|
|
// simultaneously.
|
|
|
|
|
//
|
|
|
|
|
// Default: `runtime.NumCPU()`
|
|
|
|
|
Concurrency int |
|
|
|
|
|
|
|
|
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
|
|
|
|
// internally that it would otherwise skip over.
|
|
|
|
|
OnError func(error) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (o *Opts) withDefaults() *Opts { |
|
|
|
|
if o == nil { |
|
|
|
|
o = new(Opts) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.Client == nil { |
|
|
|
|
o.Client = NewClient(nil) |
|
|
|
|
} |
|
|
|
|
return URL(u.String()), nil |
|
|
|
|
|
|
|
|
|
if o.Parser == nil { |
|
|
|
|
o.Parser = NewParser() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.Concurrency == 0 { |
|
|
|
|
o.Concurrency = runtime.NumCPU() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return o |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// DeadLinks crawls a configured space of URLs and keeps track of any broken
|
|
|
|
|
// links which it finds.
|
|
|
|
|
//
|
|
|
|
|
// DeadLinks supports multiple web protocols and document formats
|
|
|
|
|
// out-of-the-box, and will traverse between them as necessary based on URL
|
|
|
|
|
// schemes. See the `NewClient` and `NewParser` functions for more details.
|
|
|
|
|
type DeadLinks struct { |
|
|
|
|
// opts holds the options passed to New, with defaults filled in by
// Opts.withDefaults.
opts Opts |
|
|
|
|
// store is the Store passed to New. The pinned URLs are set on it and the
// result of every URL check is written to it.
store Store |
|
|
|
|
// patterns are the compiled forms of the regexp pattern strings passed to
// New. shouldFollowURL reports true for a URL matching any one of them.
patterns []*regexp.Regexp |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func parseURLs(urlStrs []string) ([]URL, error) { |
|
|
|
|
// New initializes and returns a DeadLinks instance which will track the
|
|
|
|
|
// liveness of the given set of pinned URLs, as well as all URLs linked to from
|
|
|
|
|
// those. If a linked URL matches one of the given regexp patterns then any
|
|
|
|
|
// URLs linked to from it will be tracked as well.
|
|
|
|
|
//
|
|
|
|
|
// If a non-empty Store is passed to New then whatever set of previously pinned
|
|
|
|
|
// URLs were present will be overwritten with the given ones.
|
|
|
|
|
func New( |
|
|
|
|
ctx context.Context, |
|
|
|
|
store Store, |
|
|
|
|
pinnedURLStrs, |
|
|
|
|
patternStrs []string, |
|
|
|
|
opts *Opts, |
|
|
|
|
) ( |
|
|
|
|
*DeadLinks, error, |
|
|
|
|
) { |
|
|
|
|
var ( |
|
|
|
|
res = make([]URL, 0, len(urlStrs)) |
|
|
|
|
errs []error |
|
|
|
|
err error |
|
|
|
|
pinnedURLs = make([]URL, len(pinnedURLStrs)) |
|
|
|
|
patterns = make([]*regexp.Regexp, len(patternStrs)) |
|
|
|
|
) |
|
|
|
|
for _, urlStr := range urlStrs { |
|
|
|
|
u, err := ParseURL(urlStr) |
|
|
|
|
if err == nil { |
|
|
|
|
res = append(res, u) |
|
|
|
|
} else { |
|
|
|
|
errs = append(errs, err) |
|
|
|
|
|
|
|
|
|
for i, u := range pinnedURLStrs { |
|
|
|
|
if pinnedURLs[i], err = ParseURL(u); err != nil { |
|
|
|
|
return nil, fmt.Errorf("parsing url %q: %w", u, err) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
return res, errors.Join(errs...) |
|
|
|
|
|
|
|
|
|
for i, p := range patternStrs { |
|
|
|
|
if patterns[i], err = regexp.Compile(p); err != nil { |
|
|
|
|
return nil, fmt.Errorf("compiling regexp %q: %w", p, err) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
d := &DeadLinks{ |
|
|
|
|
opts: *opts.withDefaults(), |
|
|
|
|
store: store, |
|
|
|
|
patterns: patterns, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil { |
|
|
|
|
return nil, fmt.Errorf("pinning URLs: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return d, nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (u URL) toStd() *url.URL { |
|
|
|
|
uu, err := url.Parse(string(u)) |
|
|
|
|
if err != nil { |
|
|
|
|
panic(fmt.Sprintf("parsing URL %q: %v", u, err)) |
|
|
|
|
func (d *DeadLinks) onError(ctx context.Context, err error) { |
|
|
|
|
if d.opts.OnError == nil || |
|
|
|
|
(ctx.Err() != nil && errors.Is(err, ctx.Err())) { |
|
|
|
|
return |
|
|
|
|
} |
|
|
|
|
return uu |
|
|
|
|
|
|
|
|
|
d.opts.OnError(err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// ResolveReference is equivalent to the method of the same name in `net/url`.
|
|
|
|
|
func (u URL) ResolveReference(u2Str string) (URL, error) { |
|
|
|
|
u2, err := url.Parse(u2Str) |
|
|
|
|
func (d *DeadLinks) shouldFollowURL(url URL) bool { |
|
|
|
|
urlStr := string(url) |
|
|
|
|
for _, pattern := range d.patterns { |
|
|
|
|
if pattern.MatchString(urlStr) { |
|
|
|
|
return true |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
return false |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { |
|
|
|
|
mimeType, body, err := d.opts.Client.Get(ctx, url) |
|
|
|
|
if err != nil { |
|
|
|
|
return "", err |
|
|
|
|
return nil, err |
|
|
|
|
} |
|
|
|
|
defer body.Close() |
|
|
|
|
|
|
|
|
|
if !d.shouldFollowURL(url) { |
|
|
|
|
return nil, nil |
|
|
|
|
} |
|
|
|
|
return URL(u.toStd().ResolveReference(u2).String()), nil |
|
|
|
|
|
|
|
|
|
return d.opts.Parser.Parse(mimeType, body) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// ResourceStatus describes what state a particular Resource is in.
|
|
|
|
|
type ResourceStatus int |
|
|
|
|
// checkURL only returns an error if storing the results of the check fails.
|
|
|
|
|
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { |
|
|
|
|
var ( |
|
|
|
|
now = time.Now() |
|
|
|
|
status = ResourceStatusOK |
|
|
|
|
errorStr string |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
// Enumeration of ResourceStatus values.
|
|
|
|
|
const ( |
|
|
|
|
// ResourceStatusUnknown is the zero value of ResourceStatus. Its String
// form is "UNKNOWN".
ResourceStatusUnknown ResourceStatus = iota |
|
|
|
|
// ResourceStatusOK is recorded when a URL was retrieved without error. Its
// String form is "OK".
ResourceStatusOK |
|
|
|
|
// ResourceStatusError is recorded when retrieving a URL returned an error;
// the error's text is stored alongside it. Its String form is "ERROR".
ResourceStatusError |
|
|
|
|
) |
|
|
|
|
outgoingURLs, err := d.getURL(ctx, url) |
|
|
|
|
if err != nil { |
|
|
|
|
status = ResourceStatusError |
|
|
|
|
errorStr = err.Error() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (ds ResourceStatus) String() string { |
|
|
|
|
switch ds { |
|
|
|
|
case ResourceStatusUnknown: |
|
|
|
|
return "UNKNOWN" |
|
|
|
|
case ResourceStatusOK: |
|
|
|
|
return "OK" |
|
|
|
|
case ResourceStatusError: |
|
|
|
|
return "ERROR" |
|
|
|
|
default: |
|
|
|
|
panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds)) |
|
|
|
|
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) |
|
|
|
|
if err != nil { |
|
|
|
|
return fmt.Errorf( |
|
|
|
|
"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w", |
|
|
|
|
status, errorStr, len(outgoingURLs), err, |
|
|
|
|
) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Resource describes the current state of a resource, with the resource being
|
|
|
|
|
// uniquely identified by a URL.
|
|
|
|
|
type Resource struct { |
|
|
|
|
URL URL |
|
|
|
|
Status ResourceStatus |
|
|
|
|
Pinned bool |
|
|
|
|
LastChecked time.Time |
|
|
|
|
// Update runs through all pinned or previously discovered URLs which were
|
|
|
|
|
// last checked prior to the given time (or which have never been checked) and
|
|
|
|
|
// updates the internal storage with any new URLs and links to dead URLs which
|
|
|
|
|
// it finds.
|
|
|
|
|
func (d *DeadLinks) Update( |
|
|
|
|
ctx context.Context, lastCheckedBefore time.Time, |
|
|
|
|
) error { |
|
|
|
|
var ( |
|
|
|
|
wg = new(sync.WaitGroup) |
|
|
|
|
ch = make(chan URL, d.opts.Concurrency) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
wg.Add(d.opts.Concurrency) |
|
|
|
|
for i := 0; i < d.opts.Concurrency; i++ { |
|
|
|
|
go func() { |
|
|
|
|
defer wg.Done() |
|
|
|
|
for url := range ch { |
|
|
|
|
if err := d.checkURL(ctx, url); err != nil { |
|
|
|
|
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
var err error |
|
|
|
|
|
|
|
|
|
// Because checking URLs can result in new URLs being inserted into the
|
|
|
|
|
// Store, we query the Store in a loop until it stops producing
|
|
|
|
|
// unvisited/stale URLs.
|
|
|
|
|
for { |
|
|
|
|
var ( |
|
|
|
|
n int |
|
|
|
|
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
err = miter.ForEach(ctx, urlsIter, func(url URL) error { |
|
|
|
|
ch <- url |
|
|
|
|
n++ |
|
|
|
|
return nil |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
if err != nil || n == 0 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
close(ch) |
|
|
|
|
wg.Wait() |
|
|
|
|
|
|
|
|
|
// only set if Status == ResourceStatusError
|
|
|
|
|
ErrorString string |
|
|
|
|
if err != nil { |
|
|
|
|
return fmt.Errorf("iterating urls needing checked: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Indicate the URLs of resources which link to/are linked from this
|
|
|
|
|
// resource.
|
|
|
|
|
IncomingLinkURLs, OutgoingLinkURLs []URL |
|
|
|
|
if err := d.store.GC(ctx); err != nil { |
|
|
|
|
return fmt.Errorf("garbage collecting: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// TODO expose GetByStatus
|
|
|
|
|