// Package deadlinks implements a liveness checker for hyperlinks in HTML and // gemtext documents. // // # Storage // // By default DeadLinks uses an in-memory SQLite database for tracking the // status of resources and the links between them. If memory usage becomes a // problem it is also possible to use a SQLite database file: // // store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{ // Path: "/path/to/db/file.sqlite", // }) // // // TODO initialize DeadLinks package deadlinks import ( "context" "errors" "fmt" "regexp" "runtime" "sync" "time" "code.betamike.com/mediocregopher/mediocre-go-lib/miter" ) // Opts are optional fields which can be provided to New. A nil Opts is // equivalent to an empty one. type Opts struct { Client Client // Defaults to `NewClient(nil)` Parser Parser // Defaults to `NewParser()` // Concurrency determines the maximum number of URLs which can be checked // simultaneously. // // Default: `runtime.NumCPU()` Concurrency int // OnError, if set, will be called whenever DeadLinks encounters an error // internally that it would otherwise skip over. OnError func(error) } func (o *Opts) withDefaults() *Opts { if o == nil { o = new(Opts) } if o.Client == nil { o.Client = NewClient(nil) } if o.Parser == nil { o.Parser = NewParser() } if o.Concurrency == 0 { o.Concurrency = runtime.NumCPU() } return o } // DeadLinks crawls a configured space of URLs and keeps track of any broken // links which it finds. // // DeadLinks supports multiple web protocols and document formats // out-of-the-box, and will traverse between them as necessary based on URL // schemas. See the `NewClient` and `NewParser` functions for more details. type DeadLinks struct { opts Opts store Store patterns []*regexp.Regexp } // New initializes and returns a DeadLinks instance which will track the // liveness of the given set of pinned URLs, as well as all URLs linked to from // those. If a linked URL matches one of the given regexp patterns then any // URLs linked to from it will be tracked as well. // // If a non-empty Store is passed to New then whatever set of previously pinned // URLs were present will be overwritten with the given ones. func New( ctx context.Context, store Store, pinnedURLStrs, patternStrs []string, opts *Opts, ) ( *DeadLinks, error, ) { var ( err error pinnedURLs = make([]URL, len(pinnedURLStrs)) patterns = make([]*regexp.Regexp, len(patternStrs)) ) for i, u := range pinnedURLStrs { if pinnedURLs[i], err = ParseURL(u); err != nil { return nil, fmt.Errorf("parsing url %q: %w", u, err) } } for i, p := range patternStrs { if patterns[i], err = regexp.Compile(p); err != nil { return nil, fmt.Errorf("compiling regexp %q: %w", p, err) } } d := &DeadLinks{ opts: *opts.withDefaults(), store: store, patterns: patterns, } if err := d.store.SetPinned(ctx, pinnedURLs); err != nil { return nil, fmt.Errorf("pinning URLs: %w", err) } return d, nil } func (d *DeadLinks) onError(ctx context.Context, err error) { if d.opts.OnError == nil || (ctx.Err() != nil && errors.Is(err, ctx.Err())) { return } d.opts.OnError(err) } func (d *DeadLinks) shouldFollowURL(url URL) bool { urlStr := string(url) for _, pattern := range d.patterns { if pattern.MatchString(urlStr) { return true } } return false } func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { mimeType, body, err := d.opts.Client.Get(ctx, url) if err != nil { return nil, err } defer body.Close() if !d.shouldFollowURL(url) { return nil, nil } return d.opts.Parser.Parse(mimeType, body) } // checkURL only returns an error if storing the results of the check fails. func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { var ( now = time.Now() status = ResourceStatusOK errorStr string ) outgoingURLs, err := d.getURL(ctx, url) if err != nil { status = ResourceStatusError errorStr = err.Error() } err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) if err != nil { return fmt.Errorf( "failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w", status, errorStr, len(outgoingURLs), err, ) } return nil } // Update runs through all pinned or previously discovered URLs which were // last checked prior to the given time (or which have never been checked) and // updates the internal storage with any new URLs and links to dead URLs which // it finds. func (d *DeadLinks) Update( ctx context.Context, lastCheckedBefore time.Time, ) error { var ( wg = new(sync.WaitGroup) ch = make(chan URL, d.opts.Concurrency) ) wg.Add(d.opts.Concurrency) for i := 0; i < d.opts.Concurrency; i++ { go func() { defer wg.Done() for url := range ch { if err := d.checkURL(ctx, url); err != nil { d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err)) } } }() } var err error // Because checking URLs can result in new URLs being inserted into the // Store, we query the Store in a loop until it stops producing // unvisited/stale URLs. for { var ( n int urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) ) err = miter.ForEach(ctx, urlsIter, func(url URL) error { ch <- url n++ return nil }) if err != nil || n == 0 { break } } close(ch) wg.Wait() if err != nil { return fmt.Errorf("iterating urls needing checked: %w", err) } if err := d.store.GC(ctx); err != nil { return fmt.Errorf("garbage collecting: %w", err) } return nil } // TODO expose GetByStatus