A tool for crawling and finding links to URLs which no longer exist
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
deadlinks/deadlinks.go

285 lines
6.5 KiB

// Package deadlinks implements a liveness checker for hyperlinks in HTML and
// gemtext documents.
//
// # Storage
//
// By default DeadLinks uses an in-memory SQLite database for tracking the
// status of resources and the links between them. If memory usage becomes a
// problem it is also possible to use a SQLite database file:
//
//	store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
//		Path: "/path/to/db/file.sqlite",
//	})
//	defer store.Close()
//
//	dl, err := deadlinks.New(
//		ctx, store, pinnedURLs, patterns, nil,
//	)
package deadlinks
import (
"context"
"errors"
"fmt"
"regexp"
"runtime"
"strings"
"sync"
"time"
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
)
// Opts are optional fields which can be provided to New. A nil Opts is
// equivalent to an empty one.
//
// Any field left at its zero value is replaced with the default documented on
// that field before the DeadLinks instance uses it.
type Opts struct {
// Client is used to fetch every resource whose liveness is being checked.
Client Client // Defaults to `NewClient(nil)`
// Parser is used to extract the outgoing links from the body of a fetched
// resource.
Parser Parser // Defaults to `NewParser()`
// Concurrency determines the maximum number of URLs which can be checked
// simultaneously. It is used as the number of worker goroutines started for
// each update pass.
//
// Default: `runtime.NumCPU()`
Concurrency int
// OnError, if set, will be called whenever DeadLinks encounters an error
// internally that it would otherwise skip over. Errors which are caused by
// the cancellation of the context passed into DeadLinks are not reported.
OnError func(error)
// RequestTimeout determines how long a request for a resource can run
// before the resource is considered unavailable. It is applied as a context
// timeout around each individual fetch.
//
// Default: 10 * time.Second
RequestTimeout time.Duration
}
// withDefaults returns a copy of o in which every unset field has been
// replaced with its documented default. A nil receiver is treated as an empty
// Opts.
//
// The receiver itself is never modified, so an Opts owned by the caller can be
// inspected or reused after it has been handed to New.
//
// Non-positive values of Concurrency and RequestTimeout are treated as unset,
// because neither can ever be meaningful: a negative Concurrency would panic
// when the worker channel is allocated, and a negative RequestTimeout would
// produce a context which has already expired before the request starts.
func (o *Opts) withDefaults() *Opts {
	var out Opts
	if o != nil {
		out = *o
	}

	if out.Client == nil {
		out.Client = NewClient(nil)
	}

	if out.Parser == nil {
		out.Parser = NewParser()
	}

	if out.Concurrency <= 0 {
		out.Concurrency = runtime.NumCPU()
	}

	if out.RequestTimeout <= 0 {
		out.RequestTimeout = 10 * time.Second
	}

	return &out
}
// DeadLinks crawls a configured space of URLs and keeps track of any broken
// links which it finds.
//
// DeadLinks supports multiple web protocols and document formats
// out-of-the-box, and will traverse between them as necessary based on URL
// schemes. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
// opts holds the options which were given to New, with defaults applied.
opts Opts
// store records the status of every tracked resource along with its
// outgoing links.
store Store
// patterns are the compiled regexps which were given to New. Only a URL
// matching at least one of them has its outgoing links parsed and tracked.
patterns []*regexp.Regexp
}
// New constructs a DeadLinks instance which tracks the liveness of the given
// pinned URLs and of every URL which they link to. Links found on a non-pinned
// URL are only tracked in turn if that URL matches at least one of the given
// regexp patterns.
//
// Every pinned URL string is parsed and every pattern string is compiled up
// front; the first one which is invalid causes New to return an error without
// touching the Store.
//
// The set of pinned URLs already present in the Store, if any, is overwritten
// with the given set. A nil opts is equivalent to an empty Opts.
func New(
	ctx context.Context,
	store Store,
	pinnedURLStrs,
	patternStrs []string,
	opts *Opts,
) (
	*DeadLinks, error,
) {
	pinned := make([]URL, 0, len(pinnedURLStrs))
	for _, raw := range pinnedURLStrs {
		parsed, err := ParseURL(raw)
		if err != nil {
			return nil, fmt.Errorf("parsing url %q: %w", raw, err)
		}
		pinned = append(pinned, parsed)
	}

	compiled := make([]*regexp.Regexp, 0, len(patternStrs))
	for _, expr := range patternStrs {
		re, err := regexp.Compile(expr)
		if err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", expr, err)
		}
		compiled = append(compiled, re)
	}

	dl := &DeadLinks{
		opts:     *opts.withDefaults(),
		store:    store,
		patterns: compiled,
	}

	// Replace whatever was pinned in the store previously.
	err := dl.store.SetPinned(ctx, pinned)
	if err != nil {
		return nil, fmt.Errorf("pinning URLs: %w", err)
	}

	return dl, nil
}
// onError forwards err to the OnError callback from the options, if one was
// configured. Errors which merely reflect ctx having been canceled or timed
// out are dropped rather than reported.
func (d *DeadLinks) onError(ctx context.Context, err error) {
	handler := d.opts.OnError
	if handler == nil {
		return
	}

	// An error which is just the context's own error is expected noise during
	// shutdown, so it isn't worth reporting.
	if ctxErr := ctx.Err(); ctxErr != nil && errors.Is(err, ctxErr) {
		return
	}

	handler(err)
}
// shouldFollowURL reports whether the string form of url matches at least one
// of the configured patterns. It returns false when there are no patterns.
func (d *DeadLinks) shouldFollowURL(url URL) bool {
	target := string(url)

	for i := range d.patterns {
		if d.patterns[i].MatchString(target) {
			return true
		}
	}

	return false
}
// getURL fetches url using the configured Client and returns the links found
// in its body.
//
// The fetch, and any parsing of the body, is bounded by the configured
// RequestTimeout. A non-nil error means the resource could not be retrieved
// or its body could not be parsed.
//
// The resource is always fetched, so that its liveness is known, but its body
// is only parsed if url matches one of the configured patterns. When it does
// not match, getURL returns a nil slice and a nil error.
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
	ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
	defer cancel()

	mimeType, body, err := d.opts.Client.Get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer body.Close()

	if !d.shouldFollowURL(url) {
		return nil, nil
	}

	// Reduce the mime type to the bare media type by dropping any parameters
	// (e.g. "; charset=utf-8"). Whitespace is permitted on either side of the
	// ';' separator, so it must be trimmed as well or else a value such as
	// "text/html ; charset=utf-8" would be passed to the Parser as
	// "text/html ".
	mimeType, _, _ = strings.Cut(mimeType, ";")
	mimeType = strings.TrimSpace(mimeType)

	return d.opts.Parser.Parse(mimeType, body)
}
// checkURL fetches url and records the outcome in the store: the time at
// which the check began, whether the fetch succeeded, the error message if it
// did not, and every outgoing link resolved against url.
//
// A failure to fetch url is not an error from the point of view of checkURL;
// it is recorded as the resource's status. checkURL only returns an error if
// storing the results of the check fails.
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
	// Captured before the fetch so that the stored timestamp reflects when
	// the check began.
	checkedAt := time.Now()

	links, getErr := d.getURL(ctx, url)

	status, errMsg := ResourceStatusOK, ""
	if getErr != nil {
		status, errMsg = ResourceStatusError, getErr.Error()
	}

	// Links may be relative to the document which they were found in.
	for i, link := range links {
		links[i] = url.ResolveReference(link)
	}

	if err := d.store.Update(
		ctx, checkedAt, url, status, errMsg, links,
	); err != nil {
		return fmt.Errorf(
			"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
			status, errMsg, len(links), err,
		)
	}

	return nil
}
// update performs a single pass over every URL in the store which was last
// checked before lastCheckedBefore, distributing the URLs across
// opts.Concurrency worker goroutines which each call checkURL.
//
// Errors from checking individual URLs are passed to onError and do not fail
// the pass. update always waits for all workers to finish before returning.
//
// It returns the number of URLs which were handed to the workers, or zero and
// an error if iterating over the URLs failed.
func (d *DeadLinks) update(
	ctx context.Context, lastCheckedBefore time.Time,
) (
	int, error,
) {
	var (
		workers = d.opts.Concurrency
		pending = make(chan URL, workers)
		wg      sync.WaitGroup
	)

	// Each worker drains the channel until it is closed.
	worker := func() {
		defer wg.Done()
		for url := range pending {
			err := d.checkURL(ctx, url)
			if err == nil {
				continue
			}
			d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
		}
	}

	wg.Add(workers)
	for i := 0; i < workers; i++ {
		go worker()
	}

	checked := 0
	iterErr := miter.ForEach(
		ctx,
		d.store.GetURLsByLastChecked(lastCheckedBefore),
		func(url URL) error {
			pending <- url
			checked++
			return nil
		},
	)

	// No more URLs will be sent; let the workers finish what is queued before
	// looking at the result of the iteration.
	close(pending)
	wg.Wait()

	if iterErr != nil {
		return 0, fmt.Errorf("iterating urls needing checked: %w", iterErr)
	}

	return checked, nil
}
// Update checks every pinned or previously discovered URL which was last
// checked prior to the given time (or which has never been checked), and
// records in the internal storage any new URLs and any links to dead URLs
// which are found along the way.
//
// Once no URLs remain to be checked the store is garbage collected.
func (d *DeadLinks) Update(
	ctx context.Context, lastCheckedBefore time.Time,
) error {
	// A pass over the store can itself add new URLs to the very dataset being
	// iterated, so passes are repeated until one of them finds nothing left
	// to check.
	for more := true; more; {
		checked, err := d.update(ctx, lastCheckedBefore)
		if err != nil {
			return err
		}
		more = checked != 0
	}

	err := d.store.GC(ctx)
	if err != nil {
		return fmt.Errorf("garbage collecting: %w", err)
	}

	return nil
}
// GetByStatus returns an iterator which will return all Resources with the
// given status.
//
// The call is delegated directly to the underlying Store; the iterator
// reflects whatever the Store has recorded from previous calls to Update.
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
return d.store.GetByStatus(status)
}