A tool for crawling and finding links to URLs which no longer exist
deadlinks/deadlinks.go

// Package deadlinks implements a liveness checker for hyperlinks in HTML and
// gemtext documents.
//
// # URLs
//
// DeadLinks crawls and keeps track of hyperlinks between different
// resources, such as webpages and gemtext documents. If a resource is not
// linked to from any other resources then DeadLinks forgets about it.
//
// For this reason it is required to have a starting set of URLs which DeadLinks
// will not forget about; these are the pinned URLs. Pinned URLs act as the
// starting point for crawling.
//
// When DeadLinks traverses a URL link it will check the liveness of that
// URL's resource, but by default it will not recur into _that_ resource's
// links. It will only do so if the URL matches one of the regex patterns
// which DeadLinks was configured with (see Opts.FollowRegexps).
//
// # Basic Usage
//
// DeadLinks can be initialized using `New`:
//
//	store := deadlinks.NewSQLiteStore(nil)
//	defer store.Close()
//
//	pinnedURLs := []string{"https://some.website.com"}
//	followRegexps := []string{"website.com"}
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
//		FollowRegexps: followRegexps,
//	})
//
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
//
//	err = dl.Update(ctx, time.Now())
//
// Finally, `GetByStatus` can be used to query all discovered resources based on
// their current status. To retrieve all resources which have some error
// (indicating a broken link):
//
//	erroredResources, err := miter.ToSlice(
//		ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
//	)
//
// Note that `GetByStatus` returns a `miter.Iterator`; see its documentation
// for more options on how to use it beyond `ToSlice`:
//
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
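//
// For example, to stream errored resources without collecting them into a
// slice (a sketch, which assumes a Resource value carrying `URL` and `Error`
// fields):
//
//	err = miter.ForEach(
//		ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
//		func(r deadlinks.Resource) error {
//			fmt.Printf("%s: %s\n", r.URL, r.Error)
//			return nil
//		},
//	)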
//
// # Storage
//
// By default DeadLinks uses an in-memory SQLite database for tracking the
// status of resources and the links between them. If memory usage becomes a
// problem it is also possible to use a SQLite database file:
//
//	store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
//		Path: "/path/to/db/file.sqlite",
//	})
//	defer store.Close()
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
//
// # Further Customization
//
// Most functionality of DeadLinks can be extended or superseded by injecting
// alternate interface implementations via the various Opts structs.
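//
// For example, a Client which logs each URL before delegating to the default
// client might look like the following. This is only a sketch; it assumes
// that Client's `Get` signature matches how it is used within this package:
//
//	type loggingClient struct {
//		inner deadlinks.Client
//	}
//
//	func (c loggingClient) Get(
//		ctx context.Context, url deadlinks.URL,
//	) (string, io.ReadCloser, error) {
//		log.Printf("fetching %q", url)
//		return c.inner.Get(ctx, url)
//	}
//
// It would then be injected via Opts:
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
//		NewClient: func() deadlinks.Client {
//			return loggingClient{inner: deadlinks.NewClient(nil)}
//		},
//	})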
package deadlinks

import (
	"context"
	"errors"
	"fmt"
	"regexp"
	"runtime"
	"strings"
	"sync"
	"time"

	"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
)

// Opts are optional fields which can be provided to New. A nil Opts is
// equivalent to an empty one.
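//
// For example (a sketch; all fields are optional, and the regexp shown here
// is hypothetical):
//
//	opts := &deadlinks.Opts{
//		IgnoreRegexps:  []string{`^mailto:`},
//		Concurrency:    4,
//		OnError:        func(err error) { log.Printf("deadlinks: %v", err) },
//		RequestTimeout: 15 * time.Second,
//	}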
type Opts struct {
	NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
	Parser    Parser        // Defaults to `NewParser()`

	// If a URL matches any of these regexps then any links found within it will
	// be followed and checked for liveness themselves.
	FollowRegexps []string

	// If a URL matches any of these regexps then it will not be checked at all.
	IgnoreRegexps []string

	// Concurrency determines the maximum number of URLs which can be checked
	// simultaneously.
	//
	// Default: `runtime.NumCPU() / 2`
	Concurrency int

	// OnError, if set, will be called whenever DeadLinks encounters an error
	// internally that it would otherwise skip over.
	OnError func(error)

	// RequestTimeout determines how long a request for a resource can run
	// before the resource is considered unavailable.
	//
	// Default: 1 * time.Minute
	RequestTimeout time.Duration
}

func (o *Opts) withDefaults() *Opts {
	if o == nil {
		o = new(Opts)
	}
	if o.NewClient == nil {
		o.NewClient = func() Client { return NewClient(nil) }
	}
	if o.Parser == nil {
		o.Parser = NewParser()
	}
	if o.Concurrency == 0 {
		o.Concurrency = runtime.NumCPU() / 2
	}
	if o.RequestTimeout == 0 {
		o.RequestTimeout = 1 * time.Minute
	}
	return o
}

// DeadLinks crawls a configured space of URLs and keeps track of any broken
// links which it finds.
//
// DeadLinks supports multiple web protocols and document formats
// out-of-the-box, and will traverse between them as necessary based on URL
// schemas. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
	opts             Opts
	store            Store
	follows, ignores []*regexp.Regexp
	clients          []Client
}

func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
	var (
		rr  = make([]*regexp.Regexp, len(strs))
		err error
	)
	for i, str := range strs {
		if rr[i], err = regexp.Compile(str); err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
		}
	}
	return rr, nil
}

// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, and potentially URLs linked to from
// those.
//
// If a previously used Store is passed to New then whatever set of previously
// pinned URLs were present will be overwritten with the given ones.
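//
// For example (a sketch), re-initializing with the same store but a different
// set of pinned URLs will replace whatever set was pinned before:
//
//	dl, err := deadlinks.New(ctx, store, []string{"https://new.website.com"}, nil)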
func New(
	ctx context.Context,
	store Store,
	pinnedURLStrs []string,
	opts *Opts,
) (
	*DeadLinks, error,
) {
	opts = opts.withDefaults()

	var (
		err        error
		pinnedURLs = make([]URL, len(pinnedURLStrs))
	)
	for i, u := range pinnedURLStrs {
		if pinnedURLs[i], err = ParseURL(u); err != nil {
			return nil, fmt.Errorf("parsing url %q: %w", u, err)
		}
	}

	follows, err := compileRegexps(opts.FollowRegexps)
	if err != nil {
		return nil, fmt.Errorf("compiling follows: %w", err)
	}

	ignores, err := compileRegexps(opts.IgnoreRegexps)
	if err != nil {
		return nil, fmt.Errorf("compiling ignores: %w", err)
	}

	d := &DeadLinks{
		opts:    *opts,
		store:   store,
		follows: follows,
		ignores: ignores,
	}

	d.clients = make([]Client, d.opts.Concurrency)
	for i := range d.clients {
		d.clients[i] = d.opts.NewClient()
	}

	if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
		return nil, fmt.Errorf("pinning URLs: %w", err)
	}

	return d, nil
}

// onError passes the error to the configured OnError callback, unless the
// callback is unset or the error resulted from the context being canceled.
func (d *DeadLinks) onError(ctx context.Context, err error) {
	if d.opts.OnError == nil ||
		(ctx.Err() != nil && errors.Is(err, ctx.Err())) {
		return
	}
	d.opts.OnError(err)
}

func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
	urlStr := string(url)
	for _, r := range rr {
		if r.MatchString(urlStr) {
			return true
		}
	}
	return false
}

// getURL fetches the given URL and, if the URL is configured to be followed,
// parses and returns all outgoing links found in its body.
func (d *DeadLinks) getURL(
	ctx context.Context, client Client, url URL,
) (
	[]URL, error,
) {
	if matchesAnyRegexp(url, d.ignores) {
		return nil, nil
	}

	ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
	defer cancel()

	mimeType, body, err := client.Get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer body.Close()

	// Strip any parameters from the mimeType, e.g. "; charset=utf-8".
	if i := strings.Index(mimeType, ";"); i > 0 {
		mimeType = mimeType[:i]
	}

	if !matchesAnyRegexp(url, d.follows) {
		return nil, nil
	}

	return d.opts.Parser.Parse(mimeType, body)
}

// checkURL fetches the URL and records the result; it only returns an error
// if storing the result of the check fails.
func (d *DeadLinks) checkURL(
	ctx context.Context, client Client, url URL,
) error {
	var (
		now      = time.Now()
		status   = ResourceStatusOK
		errorStr string
	)

	outgoingURLs, err := d.getURL(ctx, client, url)
	if err != nil {
		status = ResourceStatusError
		errorStr = err.Error()
	}

	// Resolve relative links against the URL of the document they were found
	// in, so that they are stored as absolute URLs.
	for i := range outgoingURLs {
		outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
	}

	err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
	if err != nil {
		return fmt.Errorf(
			"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
			status, errorStr, len(outgoingURLs), err,
		)
	}

	return nil
}

// update checks all URLs which were last checked before the given time,
// returning the number of URLs checked.
func (d *DeadLinks) update(
	ctx context.Context, lastCheckedBefore time.Time,
) (
	int, error,
) {
	var (
		wg = new(sync.WaitGroup)
		ch = make(chan URL, d.opts.Concurrency)
	)

	// Spawn Concurrency workers, each of which consumes URLs off of ch and
	// checks them using its own Client.
	wg.Add(d.opts.Concurrency)
	for i := 0; i < d.opts.Concurrency; i++ {
		go func(client Client) {
			defer wg.Done()
			for url := range ch {
				if err := d.checkURL(ctx, client, url); err != nil {
					d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
				}
			}
		}(d.clients[i])
	}

	var (
		n        int
		err      error
		urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
	)
	err = miter.ForEach(ctx, urlsIter, func(url URL) error {
		ch <- url
		n++
		return nil
	})

	close(ch)
	wg.Wait()

	if err != nil {
		return 0, fmt.Errorf("iterating urls needing to be checked: %w", err)
	}

	return n, nil
}

// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
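//
// For example (a sketch), a job which runs periodically, but which wants each
// resource to be re-checked at most once per day, might call:
//
//	err := dl.Update(ctx, time.Now().Add(-24*time.Hour))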
func (d *DeadLinks) Update(
	ctx context.Context, lastCheckedBefore time.Time,
) error {
	// Because we are iterating over the same dataset which is being updated,
	// we must re-attempt the update until there is nothing left to do.
	for {
		n, err := d.update(ctx, lastCheckedBefore)
		if err != nil {
			return err
		} else if n == 0 {
			break
		}
	}

	if err := d.store.GC(ctx); err != nil {
		return fmt.Errorf("garbage collecting: %w", err)
	}

	return nil
}

// GetByStatus returns an iterator which will return all Resources with the
// given status.
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
	return d.store.GetByStatus(status)
}