Implement basic functionality of top-level DeadLinks type

This commit is contained in:
Brian Picciano 2023-12-29 17:09:49 +01:00
parent e03e4037d2
commit 8a89597d7a
3 changed files with 297 additions and 63 deletions

View File

@ -15,93 +15,229 @@
package deadlinks package deadlinks
import ( import (
"context"
"errors" "errors"
"fmt" "fmt"
"net/url" "regexp"
"runtime"
"sync"
"time" "time"
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
) )
// URL is a standard universal resource identifier, normalized particularly for // Opts are optional fields which can be provided to New. A nil Opts is
// this package. // equivalent to an empty one.
type URL string type Opts struct {
Client Client // Defaults to `NewClient(nil)`
Parser Parser // Defaults to `NewParser()`
// ParseURL parses and returns a URL based on the given string, or an error. // Concurrency determines the maximum number of URLs which can be checked
func ParseURL(urlStr string) (URL, error) { // simultaneously.
u, err := url.Parse(urlStr) //
if err != nil { // Default: `runtime.NumCPU()`
return "", err Concurrency int
}
return URL(u.String()), nil // OnError, if set, will be called whenever DeadLinks encounters an error
// internally that it would otherwise skip over.
OnError func(error)
} }
func parseURLs(urlStrs []string) ([]URL, error) { func (o *Opts) withDefaults() *Opts {
if o == nil {
o = new(Opts)
}
if o.Client == nil {
o.Client = NewClient(nil)
}
if o.Parser == nil {
o.Parser = NewParser()
}
if o.Concurrency == 0 {
o.Concurrency = runtime.NumCPU()
}
return o
}
// DeadLinks crawls a configured space of URLs and keeps track of any broken
// links which it finds.
//
// DeadLinks supports multiple web protocols and document formats
// out-of-the-box, and will traverse between them as necessary based on URL
// schemes. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
// opts holds the options this instance runs with: the Client used to fetch
// URLs, the Parser used to extract links, the worker count, and the
// optional OnError callback.
opts Opts
// store is where pinned URLs and the results of every check are kept.
store Store
// patterns are the compiled "follow" regexps; the body of a fetched URL is
// only parsed for further links if the URL matches at least one of them.
patterns []*regexp.Regexp
}
// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, as well as all URLs linked to from
// those. If a linked URL matches one of the given regexp patterns then any
// URLs linked to from it will be tracked as well.
//
// If a non-empty Store is passed to New then whatever set of previously pinned
// URLs were present will be overwritten with the given ones.
func New(
ctx context.Context,
store Store,
pinnedURLStrs,
patternStrs []string,
opts *Opts,
) (
*DeadLinks, error,
) {
var ( var (
res = make([]URL, 0, len(urlStrs)) err error
errs []error pinnedURLs = make([]URL, len(pinnedURLStrs))
) patterns = make([]*regexp.Regexp, len(patternStrs))
for _, urlStr := range urlStrs {
u, err := ParseURL(urlStr)
if err == nil {
res = append(res, u)
} else {
errs = append(errs, err)
}
}
return res, errors.Join(errs...)
}
func (u URL) toStd() *url.URL {
uu, err := url.Parse(string(u))
if err != nil {
panic(fmt.Sprintf("parsing URL %q: %v", u, err))
}
return uu
}
// ResolveReference is equivalend to the method of the same name in `net/url`.
func (u URL) ResolveReference(u2Str string) (URL, error) {
u2, err := url.Parse(u2Str)
if err != nil {
return "", err
}
return URL(u.toStd().ResolveReference(u2).String()), nil
}
// ResourceStatus describes what state a particular Resource is in.
type ResourceStatus int
// Enumeration of ResourceStatus values.
const (
ResourceStatusUnknown ResourceStatus = iota
ResourceStatusOK
ResourceStatusError
) )
func (ds ResourceStatus) String() string { for i, u := range pinnedURLStrs {
switch ds { if pinnedURLs[i], err = ParseURL(u); err != nil {
case ResourceStatusUnknown: return nil, fmt.Errorf("parsing url %q: %w", u, err)
return "UNKNOWN"
case ResourceStatusOK:
return "OK"
case ResourceStatusError:
return "ERROR"
default:
panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds))
} }
} }
// Resource describes the current state of a resource, with the resource being for i, p := range patternStrs {
// uniquely identified by a URL. if patterns[i], err = regexp.Compile(p); err != nil {
type Resource struct { return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
URL URL
Status ResourceStatus
Pinned bool
LastChecked time.Time
// only set if Status == ResourceStatusError
ErrorString string
// Indicate the URLs of resources which link to/are linked from this
// resource.
IncomingLinkURLs, OutgoingLinkURLs []URL
} }
}
d := &DeadLinks{
opts: *opts.withDefaults(),
store: store,
patterns: patterns,
}
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
return nil, fmt.Errorf("pinning URLs: %w", err)
}
return d, nil
}
// onError hands err to the user-supplied OnError callback, if one was
// configured. Errors which are just ctx's own cancellation/timeout error are
// not reported.
func (d *DeadLinks) onError(ctx context.Context, err error) {
	handler := d.opts.OnError
	if handler == nil {
		return
	}

	if ctxErr := ctx.Err(); ctxErr != nil && errors.Is(err, ctxErr) {
		return
	}

	handler(err)
}
// shouldFollowURL reports whether url matches at least one of the configured
// patterns.
func (d *DeadLinks) shouldFollowURL(url URL) bool {
	candidate := string(url)
	for i := range d.patterns {
		if d.patterns[i].MatchString(candidate) {
			return true
		}
	}
	return false
}
// getURL fetches url using the configured Client, returning any fetch error
// as-is. The fetch is always performed; the fetched body is only passed to the
// configured Parser (and its result returned) when url matches one of the
// follow patterns, otherwise no URLs are returned. The body is closed before
// getURL returns.
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
	mimeType, body, err := d.opts.Client.Get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer body.Close()

	if d.shouldFollowURL(url) {
		return d.opts.Parser.Parse(mimeType, body)
	}
	return nil, nil
}
// checkURL fetches url and records the outcome (status, error string, and any
// outgoing URLs which were found) in the Store, timestamped with the moment
// the check began.
//
// A failure to fetch or parse url is not itself returned as an error; it is
// stored as ResourceStatusError. The exception is a failure which occurs once
// ctx is already done: such a failure says nothing about whether url is alive,
// so nothing is stored and ctx's error is returned instead. Apart from that,
// an error is only returned if storing the results of the check fails.
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
	var (
		now      = time.Now()
		status   = ResourceStatusOK
		errorStr string
	)

	outgoingURLs, err := d.getURL(ctx, url)
	if err != nil {
		// Don't let a canceled or expired context be recorded as a dead link.
		// Returning ctx's own error lets onError recognize and drop it.
		if ctxErr := ctx.Err(); ctxErr != nil {
			return ctxErr
		}

		status = ResourceStatusError
		errorStr = err.Error()
	}

	err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
	if err != nil {
		return fmt.Errorf(
			"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
			status, errorStr, len(outgoingURLs), err,
		)
	}

	return nil
}
// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
//
// The work is done in rounds. Each round asks the Store for the URLs which
// still need checking, checks them with up to Concurrency workers, and waits
// for every one of those checks to finish. Rounds repeat until the Store has
// nothing left to hand out, after which the Store is garbage collected.
//
// An error is returned if iterating the Store's URLs fails or if garbage
// collection fails. Errors from individual checks are passed to the OnError
// callback rather than returned.
//
// NOTE(review): a URL whose result can never be stored would presumably be
// handed out again on every round — confirm whether a retry cap is wanted.
func (d *DeadLinks) Update(
	ctx context.Context, lastCheckedBefore time.Time,
) error {
	// Because checking URLs can result in new URLs being inserted into the
	// Store, we query the Store in a loop until it stops producing
	// unvisited/stale URLs.
	for {
		n, err := d.updateRound(ctx, lastCheckedBefore)
		if err != nil {
			return fmt.Errorf("iterating urls needing checked: %w", err)
		}
		if n == 0 {
			break
		}
	}

	if err := d.store.GC(ctx); err != nil {
		return fmt.Errorf("garbage collecting: %w", err)
	}

	return nil
}

// updateRound performs a single pass over the URLs which the Store currently
// reports as needing a check. It returns the number of URLs handed to the
// workers and any error produced while iterating.
//
// It does not return until every check it started has completed. Without that
// barrier the next query of the Store could return URLs which are still queued
// or in flight, and those would be checked a second time.
func (d *DeadLinks) updateRound(
	ctx context.Context, lastCheckedBefore time.Time,
) (
	int, error,
) {
	var (
		wg = new(sync.WaitGroup)
		ch = make(chan URL, d.opts.Concurrency)
	)

	wg.Add(d.opts.Concurrency)
	for i := 0; i < d.opts.Concurrency; i++ {
		go func() {
			defer wg.Done()
			for url := range ch {
				if err := d.checkURL(ctx, url); err != nil {
					d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
				}
			}
		}()
	}

	var (
		n        int
		urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
	)

	err := miter.ForEach(ctx, urlsIter, func(url URL) error {
		// Stop queueing more work once ctx is done, rather than blocking on
		// the channel.
		select {
		case ch <- url:
			n++
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	})

	// Closing the channel lets the workers drain what was queued and exit.
	close(ch)
	wg.Wait()

	return n, err
}
// TODO expose GetByStatus

45
resource.go Normal file
View File

@ -0,0 +1,45 @@
package deadlinks
import (
"fmt"
"time"
)
// ResourceStatus describes what state a particular Resource is in.
type ResourceStatus int

// Enumeration of ResourceStatus values.
const (
	ResourceStatusUnknown ResourceStatus = iota
	ResourceStatusOK
	ResourceStatusError
)

// String returns the upper-case name of the status. It panics if the status is
// not one of the enumerated values.
func (rs ResourceStatus) String() string {
	names := [...]string{
		ResourceStatusUnknown: "UNKNOWN",
		ResourceStatusOK:      "OK",
		ResourceStatusError:   "ERROR",
	}

	if rs < 0 || int(rs) >= len(names) {
		panic(fmt.Sprintf("unknown ResourceStatus: %#v", rs))
	}
	return names[rs]
}
// Resource describes the current state of a single resource. Every resource
// is uniquely identified by its URL.
type Resource struct {
	URL         URL
	Status      ResourceStatus
	Pinned      bool
	LastChecked time.Time

	// ErrorString is only set if Status == ResourceStatusError.
	ErrorString string

	// IncomingLinkURLs indicates the URLs of resources which link to this
	// resource.
	IncomingLinkURLs []URL

	// OutgoingLinkURLs indicates the URLs of resources which are linked from
	// this resource.
	OutgoingLinkURLs []URL
}

53
url.go Normal file
View File

@ -0,0 +1,53 @@
package deadlinks
import (
"errors"
"fmt"
"net/url"
)
// URL is a standard universal resource identifier, normalized particularly for
// this package.
type URL string

// ParseURL parses and returns a URL based on the given string, or an error.
// The returned URL is the string form of the parsed `net/url` value.
func ParseURL(urlStr string) (URL, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return "", err
	}
	return URL(u.String()), nil
}

// parseURLs calls ParseURL on every given string. The URLs which parsed
// successfully are returned in their original order, alongside the joined
// errors of those which did not. The error is nil if every string parsed.
func parseURLs(urlStrs []string) ([]URL, error) {
	var (
		res  = make([]URL, 0, len(urlStrs))
		errs []error
	)

	for _, urlStr := range urlStrs {
		u, err := ParseURL(urlStr)
		if err == nil {
			res = append(res, u)
		} else {
			errs = append(errs, err)
		}
	}

	return res, errors.Join(errs...)
}

// toStd converts u into its `net/url` representation. It panics if u cannot
// be parsed.
func (u URL) toStd() *url.URL {
	uu, err := url.Parse(string(u))
	if err != nil {
		panic(fmt.Sprintf("parsing URL %q: %v", u, err))
	}
	return uu
}

// ResolveReference is equivalent to the method of the same name in `net/url`,
// with u acting as the base URL and u2Str as the reference to resolve against
// it.
//
// An error is returned if either u or u2Str cannot be parsed.
func (u URL) ResolveReference(u2Str string) (URL, error) {
	// Parse the receiver here rather than via toStd: this method has an error
	// return, so a malformed receiver should surface as an error, not a panic.
	base, err := url.Parse(string(u))
	if err != nil {
		return "", fmt.Errorf("parsing base URL %q: %w", u, err)
	}

	ref, err := url.Parse(u2Str)
	if err != nil {
		return "", err
	}

	return URL(base.ResolveReference(ref).String()), nil
}