2023-12-26 22:18:09 +00:00
|
|
|
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
|
|
|
// gemtext documents.
|
2023-12-28 14:40:07 +00:00
|
|
|
//
|
2023-12-30 12:43:06 +00:00
|
|
|
// # URLs
|
|
|
|
//
|
|
|
|
// DeadLinks crawls and keeps track of hyperlinks between different
|
|
|
|
// resources, such as webpages and gemtext documents. If a resource is not
|
|
|
|
// linked to from any other resources then DeadLinks forgets about it.
|
|
|
|
//
|
|
|
|
// For this reason it is required to have a starting set of URLs which DeadLinks
|
|
|
|
// will not forget about; these are the pinned URLs. Pinned URLs act as the
|
|
|
|
// starting point for crawling.
|
|
|
|
//
|
|
|
|
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
|
|
|
|
// resource, but it will not by default recur into _that_ resource's links. It
|
|
|
|
// will only do so if the URL matches one of the given regex patterns which
|
|
|
|
// DeadLinks was configured with.
|
|
|
|
//
|
|
|
|
// # Basic Usage
|
|
|
|
//
|
|
|
|
// DeadLinks can be initialized using `New`:
|
|
|
|
//
|
|
|
|
// store := deadlinks.NewSQLiteStore(nil)
|
|
|
|
// defer store.Close()
|
|
|
|
//
|
|
|
|
// pinnedURLs := []string{"https://some.website.com"}
|
|
|
|
// patterns := []string{"website.com"}
|
|
|
|
//
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
//
|
|
|
|
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
|
|
|
//
|
|
|
|
// err := dl.Update(ctx, time.Now())
|
|
|
|
//
|
|
|
|
// Finally, `GetByStatus` can be used to query all discovered resources based on
|
|
|
|
// their current status. To retrieve all resources which have some error
|
|
|
|
// (indicating a broken link):
|
|
|
|
//
|
|
|
|
// erroredResources, err := miter.ToSlice(
|
|
|
|
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
|
|
|
|
// )
|
|
|
|
//
|
|
|
|
// Note that `GetByStatus` returns a `miter.Iterator`, see its documentation for
|
|
|
|
// more options on how to use it beyond `ToSlice`:
|
|
|
|
//
|
|
|
|
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
|
|
|
|
//
|
2023-12-28 14:40:07 +00:00
|
|
|
// # Storage
|
|
|
|
//
|
|
|
|
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
|
|
|
// status of resources and the links between them. If memory usage becomes a
|
|
|
|
// problem it is also possible to use a SQLite database file:
|
|
|
|
//
|
|
|
|
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
|
|
|
// Path: "/path/to/db/file.sqlite",
|
|
|
|
// })
|
2023-12-29 19:35:02 +00:00
|
|
|
// defer store.Close()
|
2023-12-28 14:40:07 +00:00
|
|
|
//
|
2023-12-30 12:43:06 +00:00
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
//
|
|
|
|
// # Further Customization
|
|
|
|
//
|
|
|
|
// Most functionality of DeadLinks can be extended or superseded by injecting
|
|
|
|
// alternate interface implementations via the various Opts structs.
|
2023-12-26 22:18:09 +00:00
|
|
|
package deadlinks
|
|
|
|
|
|
|
|
import (
|
2023-12-29 16:09:49 +00:00
|
|
|
"context"
|
2023-12-26 22:18:09 +00:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
2023-12-29 16:09:49 +00:00
|
|
|
"regexp"
|
|
|
|
"runtime"
|
2023-12-29 19:35:02 +00:00
|
|
|
"strings"
|
2023-12-29 16:09:49 +00:00
|
|
|
"sync"
|
2023-12-26 22:18:09 +00:00
|
|
|
"time"
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
|
2023-12-26 22:18:09 +00:00
|
|
|
)
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
// Opts are optional fields which can be provided to New. A nil Opts is
|
|
|
|
// equivalent to an empty one.
|
|
|
|
type Opts struct {
|
2023-12-30 11:41:08 +00:00
|
|
|
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
|
|
|
Parser Parser // Defaults to `NewParser()`
|
2023-12-26 22:18:09 +00:00
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
// Concurrency determines the maximum number of URLs which can be checked
|
|
|
|
// simultaneously.
|
|
|
|
//
|
2023-12-30 11:41:08 +00:00
|
|
|
// Default: `runtime.NumCPU() / 2`
|
2023-12-29 16:09:49 +00:00
|
|
|
Concurrency int
|
|
|
|
|
|
|
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
|
|
|
// internally that it would otherwise skip over.
|
|
|
|
OnError func(error)
|
2023-12-29 19:35:02 +00:00
|
|
|
|
|
|
|
// RequestTimeout determines how long a request for a resource can run
|
|
|
|
// before the resource is considered unavailable.
|
|
|
|
//
|
2023-12-30 12:43:06 +00:00
|
|
|
// Default: 1 * time.Minute
|
2023-12-29 19:35:02 +00:00
|
|
|
RequestTimeout time.Duration
|
2023-12-29 16:09:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (o *Opts) withDefaults() *Opts {
|
|
|
|
if o == nil {
|
|
|
|
o = new(Opts)
|
|
|
|
}
|
|
|
|
|
2023-12-30 11:41:08 +00:00
|
|
|
if o.NewClient == nil {
|
|
|
|
o.NewClient = func() Client { return NewClient(nil) }
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
if o.Parser == nil {
|
|
|
|
o.Parser = NewParser()
|
|
|
|
}
|
|
|
|
|
|
|
|
if o.Concurrency == 0 {
|
2023-12-30 11:41:08 +00:00
|
|
|
o.Concurrency = runtime.NumCPU() / 2
|
2023-12-29 16:09:49 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
if o.RequestTimeout == 0 {
|
2023-12-30 12:43:06 +00:00
|
|
|
o.RequestTimeout = 1 * time.Minute
|
2023-12-29 19:35:02 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
return o
|
|
|
|
}
|
|
|
|
|
|
|
|
// DeadLinks crawls a configured space of URLs and keeps track of any broken
|
|
|
|
// links which it finds.
|
|
|
|
//
|
|
|
|
// DeadLinks supports multiple web protocols and document formats
|
|
|
|
// out-of-the-box, and will traverse between them as necessary based on URL
|
|
|
|
// schemas. See the `NewClient` and `NewParser` functions for more details.
|
|
|
|
type DeadLinks struct {
|
|
|
|
opts Opts
|
|
|
|
store Store
|
|
|
|
patterns []*regexp.Regexp
|
2023-12-30 11:41:08 +00:00
|
|
|
clients []Client
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
// New initializes and returns a DeadLinks instance which will track the
|
|
|
|
// liveness of the given set of pinned URLs, as well as all URLs linked to from
|
|
|
|
// those. If a linked URL matches one of the given regexp patterns then any
|
|
|
|
// URLs linked to from it will be tracked as well.
|
|
|
|
//
|
|
|
|
// If a non-empty Store is passed to New then whatever set of previously pinned
|
|
|
|
// URLs were present will be overwritten with the given ones.
|
|
|
|
func New(
|
|
|
|
ctx context.Context,
|
|
|
|
store Store,
|
|
|
|
pinnedURLStrs,
|
|
|
|
patternStrs []string,
|
|
|
|
opts *Opts,
|
|
|
|
) (
|
|
|
|
*DeadLinks, error,
|
|
|
|
) {
|
2023-12-26 22:18:09 +00:00
|
|
|
var (
|
2023-12-29 16:09:49 +00:00
|
|
|
err error
|
|
|
|
pinnedURLs = make([]URL, len(pinnedURLStrs))
|
|
|
|
patterns = make([]*regexp.Regexp, len(patternStrs))
|
2023-12-26 22:18:09 +00:00
|
|
|
)
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
for i, u := range pinnedURLStrs {
|
|
|
|
if pinnedURLs[i], err = ParseURL(u); err != nil {
|
|
|
|
return nil, fmt.Errorf("parsing url %q: %w", u, err)
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
for i, p := range patternStrs {
|
|
|
|
if patterns[i], err = regexp.Compile(p); err != nil {
|
|
|
|
return nil, fmt.Errorf("compiling regexp %q: %w", p, err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
d := &DeadLinks{
|
|
|
|
opts: *opts.withDefaults(),
|
|
|
|
store: store,
|
|
|
|
patterns: patterns,
|
|
|
|
}
|
|
|
|
|
2023-12-30 11:41:08 +00:00
|
|
|
d.clients = make([]Client, d.opts.Concurrency)
|
|
|
|
for i := range d.clients {
|
|
|
|
d.clients[i] = d.opts.NewClient()
|
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
|
|
|
return nil, fmt.Errorf("pinning URLs: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return d, nil
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
func (d *DeadLinks) onError(ctx context.Context, err error) {
|
|
|
|
if d.opts.OnError == nil ||
|
|
|
|
(ctx.Err() != nil && errors.Is(err, ctx.Err())) {
|
|
|
|
return
|
2023-12-28 16:11:42 +00:00
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
d.opts.OnError(err)
|
2023-12-28 16:11:42 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
|
|
|
urlStr := string(url)
|
|
|
|
for _, pattern := range d.patterns {
|
|
|
|
if pattern.MatchString(urlStr) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2023-12-30 11:41:08 +00:00
|
|
|
func (d *DeadLinks) getURL(
|
|
|
|
ctx context.Context, client Client, url URL,
|
|
|
|
) (
|
|
|
|
[]URL, error,
|
|
|
|
) {
|
2023-12-29 19:35:02 +00:00
|
|
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
|
|
|
defer cancel()
|
|
|
|
|
2023-12-30 11:41:08 +00:00
|
|
|
mimeType, body, err := client.Get(ctx, url)
|
2023-12-28 16:11:42 +00:00
|
|
|
if err != nil {
|
2023-12-29 16:09:49 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
defer body.Close()
|
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
// strip off mimeType extensions
|
|
|
|
if i := strings.Index(mimeType, ";"); i > 0 {
|
|
|
|
mimeType = mimeType[:i]
|
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
if !d.shouldFollowURL(url) {
|
|
|
|
return nil, nil
|
2023-12-28 16:11:42 +00:00
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
return d.opts.Parser.Parse(mimeType, body)
|
2023-12-28 16:11:42 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
// checkURL only returns an error if storing the results of the check fails.
|
2023-12-30 11:41:08 +00:00
|
|
|
func (d *DeadLinks) checkURL(
|
|
|
|
ctx context.Context, client Client, url URL,
|
|
|
|
) error {
|
2023-12-29 16:09:49 +00:00
|
|
|
var (
|
|
|
|
now = time.Now()
|
|
|
|
status = ResourceStatusOK
|
|
|
|
errorStr string
|
|
|
|
)
|
2023-12-26 22:18:09 +00:00
|
|
|
|
2023-12-30 11:41:08 +00:00
|
|
|
outgoingURLs, err := d.getURL(ctx, client, url)
|
2023-12-29 16:09:49 +00:00
|
|
|
if err != nil {
|
|
|
|
status = ResourceStatusError
|
|
|
|
errorStr = err.Error()
|
|
|
|
}
|
2023-12-26 22:18:09 +00:00
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
for i := range outgoingURLs {
|
|
|
|
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
|
|
|
|
}
|
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf(
|
|
|
|
"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
|
|
|
|
status, errorStr, len(outgoingURLs), err,
|
|
|
|
)
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
return nil
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
// returns the number of URLs checked.
|
|
|
|
func (d *DeadLinks) update(
|
2023-12-29 16:09:49 +00:00
|
|
|
ctx context.Context, lastCheckedBefore time.Time,
|
2023-12-29 19:35:02 +00:00
|
|
|
) (
|
|
|
|
int, error,
|
|
|
|
) {
|
2023-12-29 16:09:49 +00:00
|
|
|
var (
|
|
|
|
wg = new(sync.WaitGroup)
|
|
|
|
ch = make(chan URL, d.opts.Concurrency)
|
|
|
|
)
|
|
|
|
|
|
|
|
wg.Add(d.opts.Concurrency)
|
|
|
|
for i := 0; i < d.opts.Concurrency; i++ {
|
2023-12-30 11:41:08 +00:00
|
|
|
go func(client Client) {
|
2023-12-29 16:09:49 +00:00
|
|
|
defer wg.Done()
|
|
|
|
for url := range ch {
|
2023-12-30 11:41:08 +00:00
|
|
|
if err := d.checkURL(ctx, client, url); err != nil {
|
2023-12-29 16:09:49 +00:00
|
|
|
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
|
|
|
}
|
|
|
|
}
|
2023-12-30 11:41:08 +00:00
|
|
|
}(d.clients[i])
|
2023-12-29 16:09:49 +00:00
|
|
|
}
|
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
var (
|
|
|
|
n int
|
|
|
|
err error
|
|
|
|
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
|
|
|
|
)
|
2023-12-29 16:09:49 +00:00
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
|
|
|
|
ch <- url
|
|
|
|
n++
|
|
|
|
return nil
|
|
|
|
})
|
2023-12-29 16:09:49 +00:00
|
|
|
|
|
|
|
close(ch)
|
|
|
|
wg.Wait()
|
2023-12-26 22:18:09 +00:00
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
if err != nil {
|
2023-12-29 19:35:02 +00:00
|
|
|
return 0, fmt.Errorf("iterating urls needing checked: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return n, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update runs through all pinned or previously discovered URLs which were
|
|
|
|
// last checked prior to the given time (or which have never been checked) and
|
|
|
|
// updates the internal storage with any new URLs and links to dead URLs which
|
|
|
|
// it finds.
|
|
|
|
func (d *DeadLinks) Update(
|
|
|
|
ctx context.Context, lastCheckedBefore time.Time,
|
|
|
|
) error {
|
|
|
|
|
|
|
|
// Because we are iterating over the same dataset which is being updated it
|
|
|
|
// is required that we re-attempt the update multiple times, until all
|
|
|
|
// possible updates have been done.
|
|
|
|
for {
|
|
|
|
n, err := d.update(ctx, lastCheckedBefore)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
} else if n == 0 {
|
|
|
|
break
|
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
}
|
2023-12-26 22:18:09 +00:00
|
|
|
|
2023-12-29 16:09:49 +00:00
|
|
|
if err := d.store.GC(ctx); err != nil {
|
|
|
|
return fmt.Errorf("garbage collecting: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
2023-12-26 22:18:09 +00:00
|
|
|
}
|
2023-12-29 16:09:49 +00:00
|
|
|
|
2023-12-29 19:35:02 +00:00
|
|
|
// GetByStatus returns an iterator which will return all Resources with the
|
|
|
|
// given status.
|
|
|
|
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
|
|
|
|
return d.store.GetByStatus(status)
|
|
|
|
}
|