Implement basic functionality of top-level DeadLinks type
This commit is contained in:
parent
e03e4037d2
commit
8a89597d7a
262
deadlinks.go
262
deadlinks.go
@ -15,93 +15,229 @@
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
|
||||
)
|
||||
|
||||
// URL is a standard universal resource identifier, normalized particularly for
|
||||
// this package.
|
||||
type URL string
|
||||
// Opts are optional fields which can be provided to New. A nil Opts is
|
||||
// equivalent to an empty one.
|
||||
type Opts struct {
|
||||
Client Client // Defaults to `NewClient(nil)`
|
||||
Parser Parser // Defaults to `NewParser()`
|
||||
|
||||
// ParseURL parses and returns a URL based on the given string, or an error.
|
||||
func ParseURL(urlStr string) (URL, error) {
|
||||
u, err := url.Parse(urlStr)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return URL(u.String()), nil
|
||||
// Concurrency determines the maximum number of URLs which can be checked
|
||||
// simultaneously.
|
||||
//
|
||||
// Default: `runtime.NumCPU()`
|
||||
Concurrency int
|
||||
|
||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||
// internally that it would otherwise skip over.
|
||||
OnError func(error)
|
||||
}
|
||||
|
||||
func parseURLs(urlStrs []string) ([]URL, error) {
|
||||
func (o *Opts) withDefaults() *Opts {
|
||||
if o == nil {
|
||||
o = new(Opts)
|
||||
}
|
||||
|
||||
if o.Client == nil {
|
||||
o.Client = NewClient(nil)
|
||||
}
|
||||
|
||||
if o.Parser == nil {
|
||||
o.Parser = NewParser()
|
||||
}
|
||||
|
||||
if o.Concurrency == 0 {
|
||||
o.Concurrency = runtime.NumCPU()
|
||||
}
|
||||
|
||||
return o
|
||||
}
|
||||
|
||||
// DeadLinks crawls a configured space of URLs and keeps track of any broken
|
||||
// links which it finds.
|
||||
//
|
||||
// DeadLinks supports multiple web protocols and document formats
|
||||
// out-of-the-box, and will traverse between them as necessary based on URL
|
||||
// schemas. See the `NewClient` and `NewParser` functions for more details.
|
||||
type DeadLinks struct {
|
||||
opts Opts
|
||||
store Store
|
||||
patterns []*regexp.Regexp
|
||||
}
|
||||
|
||||
// New initializes and returns a DeadLinks instance which will track the
|
||||
// liveness of the given set of pinned URLs, as well as all URLs linked to from
|
||||
// those. If a linked URL matches one of the given regexp patterns then any
|
||||
// URLs linked to from it will be tracked as well.
|
||||
//
|
||||
// If a non-empty Store is passed to New then whatever set of previously pinned
|
||||
// URLs were present will be overwritten with the given ones.
|
||||
func New(
|
||||
ctx context.Context,
|
||||
store Store,
|
||||
pinnedURLStrs,
|
||||
patternStrs []string,
|
||||
opts *Opts,
|
||||
) (
|
||||
*DeadLinks, error,
|
||||
) {
|
||||
var (
|
||||
res = make([]URL, 0, len(urlStrs))
|
||||
errs []error
|
||||
err error
|
||||
pinnedURLs = make([]URL, len(pinnedURLStrs))
|
||||
patterns = make([]*regexp.Regexp, len(patternStrs))
|
||||
)
|
||||
for _, urlStr := range urlStrs {
|
||||
u, err := ParseURL(urlStr)
|
||||
if err == nil {
|
||||
res = append(res, u)
|
||||
} else {
|
||||
errs = append(errs, err)
|
||||
|
||||
for i, u := range pinnedURLStrs {
|
||||
if pinnedURLs[i], err = ParseURL(u); err != nil {
|
||||
return nil, fmt.Errorf("parsing url %q: %w", u, err)
|
||||
}
|
||||
}
|
||||
return res, errors.Join(errs...)
|
||||
|
||||
for i, p := range patternStrs {
|
||||
if patterns[i], err = regexp.Compile(p); err != nil {
|
||||
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
|
||||
}
|
||||
}
|
||||
|
||||
d := &DeadLinks{
|
||||
opts: *opts.withDefaults(),
|
||||
store: store,
|
||||
patterns: patterns,
|
||||
}
|
||||
|
||||
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
||||
return nil, fmt.Errorf("pinning URLs: %w", err)
|
||||
}
|
||||
|
||||
return d, nil
|
||||
}
|
||||
|
||||
func (u URL) toStd() *url.URL {
|
||||
uu, err := url.Parse(string(u))
|
||||
func (d *DeadLinks) onError(ctx context.Context, err error) {
|
||||
if d.opts.OnError == nil ||
|
||||
(ctx.Err() != nil && errors.Is(err, ctx.Err())) {
|
||||
return
|
||||
}
|
||||
|
||||
d.opts.OnError(err)
|
||||
}
|
||||
|
||||
func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
||||
urlStr := string(url)
|
||||
for _, pattern := range d.patterns {
|
||||
if pattern.MatchString(urlStr) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
||||
if err != nil {
|
||||
panic(fmt.Sprintf("parsing URL %q: %v", u, err))
|
||||
return nil, err
|
||||
}
|
||||
return uu
|
||||
defer body.Close()
|
||||
|
||||
if !d.shouldFollowURL(url) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return d.opts.Parser.Parse(mimeType, body)
|
||||
}
|
||||
|
||||
// ResolveReference is equivalent to the method of the same name in `net/url`.
|
||||
func (u URL) ResolveReference(u2Str string) (URL, error) {
|
||||
u2, err := url.Parse(u2Str)
|
||||
// checkURL only returns an error if storing the results of the check fails.
|
||||
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
||||
var (
|
||||
now = time.Now()
|
||||
status = ResourceStatusOK
|
||||
errorStr string
|
||||
)
|
||||
|
||||
outgoingURLs, err := d.getURL(ctx, url)
|
||||
if err != nil {
|
||||
return "", err
|
||||
status = ResourceStatusError
|
||||
errorStr = err.Error()
|
||||
}
|
||||
return URL(u.toStd().ResolveReference(u2).String()), nil
|
||||
}
|
||||
|
||||
// ResourceStatus describes what state a particular Resource is in.
|
||||
type ResourceStatus int
|
||||
|
||||
// Enumeration of ResourceStatus values.
|
||||
const (
|
||||
ResourceStatusUnknown ResourceStatus = iota
|
||||
ResourceStatusOK
|
||||
ResourceStatusError
|
||||
)
|
||||
|
||||
func (ds ResourceStatus) String() string {
|
||||
switch ds {
|
||||
case ResourceStatusUnknown:
|
||||
return "UNKNOWN"
|
||||
case ResourceStatusOK:
|
||||
return "OK"
|
||||
case ResourceStatusError:
|
||||
return "ERROR"
|
||||
default:
|
||||
panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds))
|
||||
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
|
||||
if err != nil {
|
||||
return fmt.Errorf(
|
||||
"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
|
||||
status, errorStr, len(outgoingURLs), err,
|
||||
)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Resource describes the current state of a resource, with the resource being
|
||||
// uniquely identified by a URL.
|
||||
type Resource struct {
|
||||
URL URL
|
||||
Status ResourceStatus
|
||||
Pinned bool
|
||||
LastChecked time.Time
|
||||
// Update runs through all pinned or previously discovered URLs which were
|
||||
// last checked prior to the given time (or which have never been checked) and
|
||||
// updates the internal storage with any new URLs and links to dead URLs which
|
||||
// it finds.
|
||||
func (d *DeadLinks) Update(
|
||||
ctx context.Context, lastCheckedBefore time.Time,
|
||||
) error {
|
||||
var (
|
||||
wg = new(sync.WaitGroup)
|
||||
ch = make(chan URL, d.opts.Concurrency)
|
||||
)
|
||||
|
||||
// only set if Status == ResourceStatusError
|
||||
ErrorString string
|
||||
wg.Add(d.opts.Concurrency)
|
||||
for i := 0; i < d.opts.Concurrency; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for url := range ch {
|
||||
if err := d.checkURL(ctx, url); err != nil {
|
||||
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// Indicate the URLs of resources which link to/are linked from this
|
||||
// resource.
|
||||
IncomingLinkURLs, OutgoingLinkURLs []URL
|
||||
var err error
|
||||
|
||||
// Because checking URLs can result in new URLs being inserted into the
|
||||
// Store, we query the Store in a loop until it stops producing
|
||||
// unvisited/stale URLs.
|
||||
for {
|
||||
var (
|
||||
n int
|
||||
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
|
||||
)
|
||||
|
||||
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
|
||||
ch <- url
|
||||
n++
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil || n == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
close(ch)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("iterating urls needing checked: %w", err)
|
||||
}
|
||||
|
||||
if err := d.store.GC(ctx); err != nil {
|
||||
return fmt.Errorf("garbage collecting: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO expose GetByStatus
|
||||
|
45
resource.go
Normal file
45
resource.go
Normal file
@ -0,0 +1,45 @@
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ResourceStatus describes the state which a particular Resource is in.
type ResourceStatus int

// Enumeration of ResourceStatus values.
const (
	// ResourceStatusUnknown is the zero value of ResourceStatus.
	ResourceStatusUnknown ResourceStatus = iota

	// ResourceStatusOK is the value following ResourceStatusUnknown.
	ResourceStatusOK

	// ResourceStatusError is the value following ResourceStatusOK.
	ResourceStatusError
)

// String returns the upper-case name of the status. It panics if called on a
// value which is not one of the enumerated ResourceStatus constants.
func (rs ResourceStatus) String() string {
	var name string

	switch rs {
	case ResourceStatusOK:
		name = "OK"
	case ResourceStatusError:
		name = "ERROR"
	case ResourceStatusUnknown:
		name = "UNKNOWN"
	default:
		panic(fmt.Sprintf("unknown ResourceStatus: %#v", rs))
	}

	return name
}
|
||||
|
||||
// Resource describes the current state of a resource, with the resource being
|
||||
// uniquely identified by a URL.
|
||||
type Resource struct {
|
||||
URL URL
|
||||
Status ResourceStatus
|
||||
Pinned bool
|
||||
LastChecked time.Time
|
||||
|
||||
// only set if Status == ResourceStatusError
|
||||
ErrorString string
|
||||
|
||||
// Indicate the URLs of resources which link to/are linked from this
|
||||
// resource.
|
||||
IncomingLinkURLs, OutgoingLinkURLs []URL
|
||||
}
|
53
url.go
Normal file
53
url.go
Normal file
@ -0,0 +1,53 @@
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/url"
|
||||
)
|
||||
|
||||
// URL is a standard universal resource identifier, normalized particularly for
// this package.
//
// Values returned by ParseURL have been round-tripped through the `net/url`
// parser and serializer. Because URL is a plain string type, values
// constructed by direct conversion carry no such guarantee.
type URL string

// ParseURL parses and returns a URL based on the given string, or an error if
// the string is not parseable by `net/url`.
func ParseURL(urlStr string) (URL, error) {
	u, err := url.Parse(urlStr)
	if err != nil {
		return "", err
	}
	return URL(u.String()), nil
}

// parseURLs calls ParseURL on each of the given strings. The successfully
// parsed URLs are returned in input order, alongside the join of all parse
// errors encountered (nil if every string parsed).
func parseURLs(urlStrs []string) ([]URL, error) {
	var (
		res  = make([]URL, 0, len(urlStrs))
		errs []error
	)

	for _, urlStr := range urlStrs {
		u, err := ParseURL(urlStr)
		if err != nil {
			errs = append(errs, err)
			continue
		}
		res = append(res, u)
	}

	return res, errors.Join(errs...)
}

// toStd converts the URL into its `net/url` representation. It panics if the
// URL cannot be parsed, so it must only be used on URLs known to be valid.
func (u URL) toStd() *url.URL {
	uu, err := url.Parse(string(u))
	if err != nil {
		panic(fmt.Sprintf("parsing URL %q: %v", u, err))
	}
	return uu
}

// ResolveReference is equivalent to the method of the same name in `net/url`:
// it resolves the reference u2Str against the base URL u.
//
// An error is returned if either u2Str or u itself cannot be parsed.
func (u URL) ResolveReference(u2Str string) (URL, error) {
	u2, err := url.Parse(u2Str)
	if err != nil {
		return "", err
	}

	// The receiver is parsed here rather than via toStd so that a malformed
	// URL value results in an error for the caller instead of a panic.
	base, err := url.Parse(string(u))
	if err != nil {
		return "", fmt.Errorf("parsing base URL %q: %w", u, err)
	}

	return URL(base.ResolveReference(u2).String()), nil
}
|
Loading…
Reference in New Issue
Block a user