Implement basic functionality of top-level DeadLinks type

main
Brian Picciano 5 months ago
parent e03e4037d2
commit 8a89597d7a
  1. 256
      deadlinks.go
  2. 45
      resource.go
  3. 53
      url.go

@ -15,93 +15,229 @@
package deadlinks
import (
"context"
"errors"
"fmt"
"net/url"
"regexp"
"runtime"
"sync"
"time"
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
)
// URL is a standard universal resource identifier, normalized particularly for
// this package.
type URL string
// Opts are optional fields which can be provided to New. A nil Opts is
// equivalent to an empty one.
type Opts struct {
Client Client // Defaults to `NewClient(nil)`
Parser Parser // Defaults to `NewParser()`
// ParseURL parses and returns a URL based on the given string, or an error.
func ParseURL(urlStr string) (URL, error) {
u, err := url.Parse(urlStr)
if err != nil {
return "", err
// Concurrency determines the maximum number of URLs which can be checked
// simultaneously.
//
// Default: `runtime.NumCPU()`
Concurrency int
// OnError, if set, will be called whenever DeadLinks encounters an error
// internally that it would otherwise skip over.
OnError func(error)
}
// withDefaults returns a copy of o in which every unset field has been
// populated with its default value. A nil receiver is treated as an empty
// Opts. The receiver itself is never modified, so an Opts value owned by the
// caller is not changed behind their back.
func (o *Opts) withDefaults() *Opts {
	var out Opts
	if o != nil {
		out = *o
	}

	if out.Client == nil {
		out.Client = NewClient(nil)
	}

	if out.Parser == nil {
		out.Parser = NewParser()
	}

	// A negative value cannot be used as a worker count or channel size, so it
	// is treated the same as an unset one.
	if out.Concurrency <= 0 {
		out.Concurrency = runtime.NumCPU()
	}

	return &out
}
// DeadLinks crawls a configured space of URLs and keeps track of any broken
// links which it finds.
//
// DeadLinks supports multiple web protocols and document formats
// out-of-the-box, and will traverse between them as necessary based on URL
// schemes. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
	// opts holds the options given to New, with defaults filled in.
	opts Opts
	// store is where pinned URLs and the results of checks are kept.
	store Store
	// patterns are the compiled regexps given to New; a URL matching any of
	// them has its outgoing links parsed and tracked.
	patterns []*regexp.Regexp
}
func parseURLs(urlStrs []string) ([]URL, error) {
// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, as well as all URLs linked to from
// those. If a linked URL matches one of the given regexp patterns then any
// URLs linked to from it will be tracked as well.
//
// If a non-empty Store is passed to New then whatever set of previously pinned
// URLs were present will be overwritten with the given ones.
func New(
ctx context.Context,
store Store,
pinnedURLStrs,
patternStrs []string,
opts *Opts,
) (
*DeadLinks, error,
) {
var (
res = make([]URL, 0, len(urlStrs))
errs []error
err error
pinnedURLs = make([]URL, len(pinnedURLStrs))
patterns = make([]*regexp.Regexp, len(patternStrs))
)
for _, urlStr := range urlStrs {
u, err := ParseURL(urlStr)
if err == nil {
res = append(res, u)
} else {
errs = append(errs, err)
for i, u := range pinnedURLStrs {
if pinnedURLs[i], err = ParseURL(u); err != nil {
return nil, fmt.Errorf("parsing url %q: %w", u, err)
}
}
return res, errors.Join(errs...)
for i, p := range patternStrs {
if patterns[i], err = regexp.Compile(p); err != nil {
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
}
}
d := &DeadLinks{
opts: *opts.withDefaults(),
store: store,
patterns: patterns,
}
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
return nil, fmt.Errorf("pinning URLs: %w", err)
}
return d, nil
}
func (u URL) toStd() *url.URL {
uu, err := url.Parse(string(u))
if err != nil {
panic(fmt.Sprintf("parsing URL %q: %v", u, err))
// onError passes err to the configured OnError callback, if there is one.
// Errors which are the direct result of ctx having been canceled or timed out
// are not reported.
func (d *DeadLinks) onError(ctx context.Context, err error) {
	if d.opts.OnError == nil {
		return
	}

	// The caller already knows that it ended the context, so an error which
	// wraps the context's own error carries no new information.
	if ctxErr := ctx.Err(); ctxErr != nil && errors.Is(err, ctxErr) {
		return
	}

	d.opts.OnError(err)
}
// ResolveReference is equivalent to the method of the same name in `net/url`.
func (u URL) ResolveReference(u2Str string) (URL, error) {
u2, err := url.Parse(u2Str)
// shouldFollowURL reports whether url matches at least one of the configured
// patterns. It returns false when no patterns are configured.
func (d *DeadLinks) shouldFollowURL(url URL) bool {
	candidate := string(url)
	for i := range d.patterns {
		if d.patterns[i].MatchString(candidate) {
			return true
		}
	}
	return false
}
// getURL fetches url using the configured Client and returns the URLs which
// its content links to, as determined by the configured Parser.
//
// The URL is always fetched, so a non-nil error indicates that it could not be
// retrieved (or that its content could not be parsed). The content is only
// parsed if url matches one of the configured patterns; otherwise no outgoing
// URLs are returned.
//
// NOTE(review): the documentation of New suggests that links from pinned URLs
// are always tracked, but pinned URLs are subject to the pattern check here
// like any other URL — confirm whether that is intended.
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
	mimeType, body, err := d.opts.Client.Get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer body.Close()

	if !d.shouldFollowURL(url) {
		return nil, nil
	}

	return d.opts.Parser.Parse(mimeType, body)
}
// ResourceStatus describes what state a particular Resource is in.
type ResourceStatus int
// checkURL only returns an error if storing the results of the check fails.
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
var (
now = time.Now()
status = ResourceStatusOK
errorStr string
)
// Enumeration of ResourceStatus values.
const (
ResourceStatusUnknown ResourceStatus = iota
ResourceStatusOK
ResourceStatusError
)
outgoingURLs, err := d.getURL(ctx, url)
if err != nil {
status = ResourceStatusError
errorStr = err.Error()
}
func (ds ResourceStatus) String() string {
switch ds {
case ResourceStatusUnknown:
return "UNKNOWN"
case ResourceStatusOK:
return "OK"
case ResourceStatusError:
return "ERROR"
default:
panic(fmt.Sprintf("unknown ResourceStatus: %#v", ds))
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
if err != nil {
return fmt.Errorf(
"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
status, errorStr, len(outgoingURLs), err,
)
}
return nil
}
// Resource describes the current state of a resource, with the resource being
// uniquely identified by a URL.
type Resource struct {
URL URL
Status ResourceStatus
Pinned bool
LastChecked time.Time
// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
ctx context.Context, lastCheckedBefore time.Time,
) error {
var (
wg = new(sync.WaitGroup)
ch = make(chan URL, d.opts.Concurrency)
)
wg.Add(d.opts.Concurrency)
for i := 0; i < d.opts.Concurrency; i++ {
go func() {
defer wg.Done()
for url := range ch {
if err := d.checkURL(ctx, url); err != nil {
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
}
}
}()
}
var err error
// Because checking URLs can result in new URLs being inserted into the
// Store, we query the Store in a loop until it stops producing
// unvisited/stale URLs.
for {
var (
n int
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
)
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
ch <- url
n++
return nil
})
if err != nil || n == 0 {
break
}
}
close(ch)
wg.Wait()
// only set if Status == ResourceStatusError
ErrorString string
if err != nil {
return fmt.Errorf("iterating urls needing checked: %w", err)
}
// Indicate the URLs of resources which link to/are linked from this
// resource.
IncomingLinkURLs, OutgoingLinkURLs []URL
if err := d.store.GC(ctx); err != nil {
return fmt.Errorf("garbage collecting: %w", err)
}
return nil
}
// TODO expose GetByStatus

@ -0,0 +1,45 @@
package deadlinks
import (
"fmt"
"time"
)
// ResourceStatus describes what state a particular Resource is in.
type ResourceStatus int

// Enumeration of ResourceStatus values.
const (
	ResourceStatusUnknown ResourceStatus = iota
	ResourceStatusOK
	ResourceStatusError
)

// String implements fmt.Stringer. Known values are rendered as "UNKNOWN",
// "OK" or "ERROR". Any other value is rendered as "ResourceStatus(N)", where N
// is its integer value, so that formatting an unexpected status (for example
// in a log or error message) cannot crash the program.
func (ds ResourceStatus) String() string {
	switch ds {
	case ResourceStatusUnknown:
		return "UNKNOWN"
	case ResourceStatusOK:
		return "OK"
	case ResourceStatusError:
		return "ERROR"
	default:
		return fmt.Sprintf("ResourceStatus(%d)", int(ds))
	}
}
// Resource is a snapshot of what is known about a single resource. Each
// resource is uniquely identified by its URL.
type Resource struct {
	URL         URL
	Status      ResourceStatus
	Pinned      bool
	LastChecked time.Time

	// ErrorString is only set if Status == ResourceStatusError.
	ErrorString string

	// IncomingLinkURLs are the URLs of the resources which link to this
	// resource.
	IncomingLinkURLs []URL

	// OutgoingLinkURLs are the URLs of the resources which are linked from
	// this resource.
	OutgoingLinkURLs []URL
}

@ -0,0 +1,53 @@
package deadlinks
import (
"errors"
"fmt"
"net/url"
)
// URL is a standard universal resource identifier, normalized particularly for
// this package.
type URL string

// ParseURL parses urlStr with `net/url` and returns its re-serialized form as
// a URL. If urlStr cannot be parsed then an empty URL and the parse error are
// returned.
func ParseURL(urlStr string) (URL, error) {
	var (
		parsed *url.URL
		err    error
	)
	if parsed, err = url.Parse(urlStr); err != nil {
		return "", err
	}

	normalized := parsed.String()
	return URL(normalized), nil
}
// parseURLs calls ParseURL on every given string. The returned slice holds
// the URLs which parsed successfully, in their original order. The returned
// error joins together the errors of all strings which failed to parse, and
// is nil if there were none.
func parseURLs(urlStrs []string) ([]URL, error) {
	parsed := make([]URL, 0, len(urlStrs))
	var failures []error

	for i := range urlStrs {
		u, err := ParseURL(urlStrs[i])
		if err != nil {
			failures = append(failures, err)
			continue
		}
		parsed = append(parsed, u)
	}

	return parsed, errors.Join(failures...)
}
// toStd converts u into its `net/url` representation. It panics if u cannot
// be parsed.
func (u URL) toStd() *url.URL {
	std, err := url.Parse(string(u))
	if err == nil {
		return std
	}
	panic(fmt.Sprintf("parsing URL %q: %v", u, err))
}
// ResolveReference behaves like the method of the same name in `net/url`:
// u2Str is parsed as a URI reference and resolved against u, which acts as the
// base. An error is returned if u2Str cannot be parsed.
func (u URL) ResolveReference(u2Str string) (URL, error) {
	ref, err := url.Parse(u2Str)
	if err != nil {
		return "", err
	}

	base := u.toStd()
	resolved := base.ResolveReference(ref)
	return URL(resolved.String()), nil
}
Loading…
Cancel
Save