From 8a89597d7a9fc33debb96e05362b00ea4e6a748a Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Fri, 29 Dec 2023 17:09:49 +0100 Subject: [PATCH] Implement basic functionality of top-level DeadLinks type --- deadlinks.go | 262 ++++++++++++++++++++++++++++++++++++++------------- resource.go | 45 +++++++++ url.go | 53 +++++++++++ 3 files changed, 297 insertions(+), 63 deletions(-) create mode 100644 resource.go create mode 100644 url.go diff --git a/deadlinks.go b/deadlinks.go index 9e551bc..02dfcc4 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -15,93 +15,229 @@ package deadlinks import ( + "context" "errors" "fmt" - "net/url" + "regexp" + "runtime" + "sync" "time" + + "code.betamike.com/mediocregopher/mediocre-go-lib/miter" ) -// URL is a standard universal resource identifier, normalized particularly for -// this package. -type URL string +// Opts are optional fields which can be provided to New. A nil Opts is +// equivalent to an empty one. +type Opts struct { + Client Client // Defaults to `NewClient(nil)` + Parser Parser // Defaults to `NewParser()` -// ParseURL parses and returns a URL based on the given string, or an error. -func ParseURL(urlStr string) (URL, error) { - u, err := url.Parse(urlStr) - if err != nil { - return "", err - } - return URL(u.String()), nil + // Concurrency determines the maximum number of URLs which can be checked + // simultaneously. + // + // Default: `runtime.NumCPU()` + Concurrency int + + // OnError, if set, will be called whenever DeadLinks encounters an error + // internally that it would otherwise skip over. 
+ OnError func(error) } -func parseURLs(urlStrs []string) ([]URL, error) { +func (o *Opts) withDefaults() *Opts { + if o == nil { + o = new(Opts) + } + + if o.Client == nil { + o.Client = NewClient(nil) + } + + if o.Parser == nil { + o.Parser = NewParser() + } + + if o.Concurrency == 0 { + o.Concurrency = runtime.NumCPU() + } + + return o +} + +// DeadLinks crawls a configured space of URLs and keeps track of any broken +// links which it finds. +// +// DeadLinks supports multiple web protocols and document formats +// out-of-the-box, and will traverse between them as necessary based on URL +// schemas. See the `NewClient` and `NewParser` functions for more details. +type DeadLinks struct { + opts Opts + store Store + patterns []*regexp.Regexp +} + +// New initializes and returns a DeadLinks instance which will track the +// liveness of the given set of pinned URLs, as well as all URLs linked to from +// those. If a linked URL matches one of the given regexp patterns then any +// URLs linked to from it will be tracked as well. +// +// If a non-empty Store is passed to New then whatever set of previously pinned +// URLs were present will be overwritten with the given ones. +func New( + ctx context.Context, + store Store, + pinnedURLStrs, + patternStrs []string, + opts *Opts, +) ( + *DeadLinks, error, +) { var ( - res = make([]URL, 0, len(urlStrs)) - errs []error + err error + pinnedURLs = make([]URL, len(pinnedURLStrs)) + patterns = make([]*regexp.Regexp, len(patternStrs)) ) - for _, urlStr := range urlStrs { - u, err := ParseURL(urlStr) - if err == nil { - res = append(res, u) - } else { - errs = append(errs, err) + + for i, u := range pinnedURLStrs { + if pinnedURLs[i], err = ParseURL(u); err != nil { + return nil, fmt.Errorf("parsing url %q: %w", u, err) } } - return res, errors.Join(errs...) 
+ + for i, p := range patternStrs { + if patterns[i], err = regexp.Compile(p); err != nil { + return nil, fmt.Errorf("compiling regexp %q: %w", p, err) + } + } + + d := &DeadLinks{ + opts: *opts.withDefaults(), + store: store, + patterns: patterns, + } + + if err := d.store.SetPinned(ctx, pinnedURLs); err != nil { + return nil, fmt.Errorf("pinning URLs: %w", err) + } + + return d, nil } -func (u URL) toStd() *url.URL { - uu, err := url.Parse(string(u)) +func (d *DeadLinks) onError(ctx context.Context, err error) { + if d.opts.OnError == nil || + (ctx.Err() != nil && errors.Is(err, ctx.Err())) { + return + } + + d.opts.OnError(err) +} + +func (d *DeadLinks) shouldFollowURL(url URL) bool { + urlStr := string(url) + for _, pattern := range d.patterns { + if pattern.MatchString(urlStr) { + return true + } + } + return false +} + +func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { + mimeType, body, err := d.opts.Client.Get(ctx, url) if err != nil { - panic(fmt.Sprintf("parsing URL %q: %v", u, err)) + return nil, err } - return uu + defer body.Close() + + if !d.shouldFollowURL(url) { + return nil, nil + } + + return d.opts.Parser.Parse(mimeType, body) } -// ResolveReference is equivalend to the method of the same name in `net/url`. -func (u URL) ResolveReference(u2Str string) (URL, error) { - u2, err := url.Parse(u2Str) +// checkURL only returns an error if storing the results of the check fails. +func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { + var ( + now = time.Now() + status = ResourceStatusOK + errorStr string + ) + + outgoingURLs, err := d.getURL(ctx, url) if err != nil { - return "", err + status = ResourceStatusError + errorStr = err.Error() } - return URL(u.toStd().ResolveReference(u2).String()), nil -} -// ResourceStatus describes what state a particular Resource is in. -type ResourceStatus int - -// Enumeration of ResourceStatus values. 
-const ( - ResourceStatusUnknown ResourceStatus = iota - ResourceStatusOK - ResourceStatusError -) - -func (ds ResourceStatus) String() string { - switch ds { - case ResourceStatusUnknown: - return "UNKNOWN" - case ResourceStatusOK: - return "OK" - case ResourceStatusError: - return "ERROR" - default: - panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds)) + err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) + if err != nil { + return fmt.Errorf( + "failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w", + status, errorStr, len(outgoingURLs), err, + ) } + + return nil } -// Resource describes the current state of a resource, with the resource being -// uniquely identified by a URL. -type Resource struct { - URL URL - Status ResourceStatus - Pinned bool - LastChecked time.Time +// Update runs through all pinned or previously discovered URLs which were +// last checked prior to the given time (or which have never been checked) and +// updates the internal storage with any new URLs and links to dead URLs which +// it finds. +func (d *DeadLinks) Update( + ctx context.Context, lastCheckedBefore time.Time, +) error { + var ( + wg = new(sync.WaitGroup) + ch = make(chan URL, d.opts.Concurrency) + ) - // only set if Status == ResourceStatusError - ErrorString string + wg.Add(d.opts.Concurrency) + for i := 0; i < d.opts.Concurrency; i++ { + go func() { + defer wg.Done() + for url := range ch { + if err := d.checkURL(ctx, url); err != nil { + d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err)) + } + } + }() + } - // Indicate the URLs of resources which link to/are linked from this - // resource. - IncomingLinkURLs, OutgoingLinkURLs []URL + var err error + + // Because checking URLs can result in new URLs being inserted into the + // Store, we query the Store in a loop until it stops producing + // unvisited/stale URLs. 
+ for { + var ( + n int + urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) + ) + + err = miter.ForEach(ctx, urlsIter, func(url URL) error { + ch <- url + n++ + return nil + }) + + if err != nil || n == 0 { + break + } + } + + close(ch) + wg.Wait() + + if err != nil { + return fmt.Errorf("iterating urls needing checked: %w", err) + } + + if err := d.store.GC(ctx); err != nil { + return fmt.Errorf("garbage collecting: %w", err) + } + + return nil } + +// TODO expose GetByStatus diff --git a/resource.go b/resource.go new file mode 100644 index 0000000..7395bda --- /dev/null +++ b/resource.go @@ -0,0 +1,45 @@ +package deadlinks + +import ( + "fmt" + "time" +) + +// ResourceStatus describes what state a particular Resource is in. +type ResourceStatus int + +// Enumeration of ResourceStatus values. +const ( + ResourceStatusUnknown ResourceStatus = iota + ResourceStatusOK + ResourceStatusError +) + +func (ds ResourceStatus) String() string { + switch ds { + case ResourceStatusUnknown: + return "UNKNOWN" + case ResourceStatusOK: + return "OK" + case ResourceStatusError: + return "ERROR" + default: + panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds)) + } +} + +// Resource describes the current state of a resource, with the resource being +// uniquely identified by a URL. +type Resource struct { + URL URL + Status ResourceStatus + Pinned bool + LastChecked time.Time + + // only set if Status == ResourceStatusError + ErrorString string + + // Indicate the URLs of resources which link to/are linked from this + // resource. + IncomingLinkURLs, OutgoingLinkURLs []URL +} diff --git a/url.go b/url.go new file mode 100644 index 0000000..86fc379 --- /dev/null +++ b/url.go @@ -0,0 +1,53 @@ +package deadlinks + +import ( + "errors" + "fmt" + "net/url" +) + +// URL is a standard universal resource identifier, normalized particularly for +// this package. +type URL string + +// ParseURL parses and returns a URL based on the given string, or an error. 
+func ParseURL(urlStr string) (URL, error) { + u, err := url.Parse(urlStr) + if err != nil { + return "", err + } + return URL(u.String()), nil +} + +func parseURLs(urlStrs []string) ([]URL, error) { + var ( + res = make([]URL, 0, len(urlStrs)) + errs []error + ) + for _, urlStr := range urlStrs { + u, err := ParseURL(urlStr) + if err == nil { + res = append(res, u) + } else { + errs = append(errs, err) + } + } + return res, errors.Join(errs...) +} + +func (u URL) toStd() *url.URL { + uu, err := url.Parse(string(u)) + if err != nil { + panic(fmt.Sprintf("parsing URL %q: %v", u, err)) + } + return uu +} + +// ResolveReference is equivalent to the method of the same name in `net/url`. +func (u URL) ResolveReference(u2Str string) (URL, error) { + u2, err := url.Parse(u2Str) + if err != nil { + return "", err + } + return URL(u.toStd().ResolveReference(u2).String()), nil +}