// Package deadlinks implements a liveness checker for hyperlinks in HTML and
// gemtext documents.
//
// # URLs
//
// DeadLinks crawls and keeps track of hyperlinks between different
// resources, such as webpages and gemtext documents. If a resource is not
// linked to from any other resource then DeadLinks forgets about it.
//
// For this reason it is required to have a starting set of URLs which
// DeadLinks will not forget about; these are the pinned URLs. Pinned URLs act
// as the starting point for crawling.
//
// When DeadLinks traverses a URL link, it will check the liveness of that
// URL's resource, but it will not by default recurse into _that_ resource's
// links. It will only do so if the URL matches one of the regex patterns
// which DeadLinks was configured with (see Opts.FollowRegexps).
//
// # Basic Usage
//
// DeadLinks can be initialized using `New`:
//
//	store := deadlinks.NewSQLiteStore(nil)
//	defer store.Close()
//
//	pinnedURLs := []string{"https://some.website.com"}
//	followRegexps := []string{"website.com"}
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, &deadlinks.Opts{
//		FollowRegexps: followRegexps,
//	})
//
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
//
//	err := dl.Update(ctx, time.Now())
//
// Finally, `GetByStatus` can be used to query all discovered resources based
// on their current status. To retrieve all resources which have some error
// (indicating a broken link):
//
//	erroredResources, err := miter.ToSlice(
//		ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
//	)
//
// Note that `GetByStatus` returns a `miter.Iterator`; see its documentation
// for more options on how to use it beyond `ToSlice`:
//
//	https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
//
// # Storage
//
// By default DeadLinks uses an in-memory SQLite database for tracking the
// status of resources and the links between them. If memory usage becomes a
// problem it is also possible to use a SQLite database file:
//
//	store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
//		Path: "/path/to/db/file.sqlite",
//	})
//	defer store.Close()
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, nil)
//
// # Further Customization
//
// Most functionality of DeadLinks can be extended or superseded by injecting
// alternate interface implementations via the various Opts structs.
package deadlinks

import (
	"context"
	"errors"
	"fmt"
	"regexp"
	"runtime"
	"strings"
	"sync"
	"time"

	"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
)

// Opts are optional fields which can be provided to New. A nil Opts is
// equivalent to an empty one.
type Opts struct {
	NewClient func() Client // Defaults to `func() Client { return NewClient(nil) }`
	Parser    Parser        // Defaults to `NewParser()`

	// If a URL matches any of these regexps then any links found within it
	// will be followed and checked for liveness themselves.
	FollowRegexps []string

	// If a URL matches any of these regexps then it will not be checked at
	// all.
	IgnoreRegexps []string

	// Concurrency determines the maximum number of URLs which can be checked
	// simultaneously.
	//
	// Default: `runtime.NumCPU() / 2`
	Concurrency int

	// OnError, if set, will be called whenever DeadLinks encounters an error
	// internally that it would otherwise skip over.
	OnError func(error)

	// RequestTimeout determines how long a request for a resource can run
	// before the resource is considered unavailable.
	//
	// Default: 1 * time.Minute
	RequestTimeout time.Duration
}
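
// As an illustrative sketch (the concrete values and the logging behavior
// here are assumptions chosen for the example, not defaults), a customized
// Opts might look like:
//
//	opts := &deadlinks.Opts{
//		FollowRegexps:  []string{`^https?://my\.site/`},
//		IgnoreRegexps:  []string{`\.pdf$`},
//		Concurrency:    4,
//		RequestTimeout: 30 * time.Second,
//		OnError: func(err error) {
//			log.Printf("deadlinks: %v", err)
//		},
//	}
//
//	dl, err := deadlinks.New(ctx, store, pinnedURLs, opts)
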
func (o *Opts) withDefaults() *Opts {
	if o == nil {
		o = new(Opts)
	}

	if o.NewClient == nil {
		o.NewClient = func() Client { return NewClient(nil) }
	}

	if o.Parser == nil {
		o.Parser = NewParser()
	}

	if o.Concurrency == 0 {
		o.Concurrency = runtime.NumCPU() / 2
	}

	if o.RequestTimeout == 0 {
		o.RequestTimeout = 1 * time.Minute
	}

	return o
}

// DeadLinks crawls a configured space of URLs and keeps track of any broken
// links which it finds.
//
// DeadLinks supports multiple web protocols and document formats
// out-of-the-box, and will traverse between them as necessary based on URL
// schemes. See the `NewClient` and `NewParser` functions for more details.
type DeadLinks struct {
	opts             Opts
	store            Store
	follows, ignores []*regexp.Regexp
	clients          []Client
}

func compileRegexps(strs []string) ([]*regexp.Regexp, error) {
	var (
		rr  = make([]*regexp.Regexp, len(strs))
		err error
	)

	for i, str := range strs {
		if rr[i], err = regexp.Compile(str); err != nil {
			return nil, fmt.Errorf("compiling regexp %q: %w", str, err)
		}
	}

	return rr, nil
}

// New initializes and returns a DeadLinks instance which will track the
// liveness of the given set of pinned URLs, and potentially URLs linked to
// from those.
//
// If a previously used Store is passed to New then whatever set of previously
// pinned URLs was present will be overwritten with the given ones.
func New(
	ctx context.Context,
	store Store,
	pinnedURLStrs []string,
	opts *Opts,
) (
	*DeadLinks, error,
) {
	opts = opts.withDefaults()

	var (
		err        error
		pinnedURLs = make([]URL, len(pinnedURLStrs))
	)

	for i, u := range pinnedURLStrs {
		if pinnedURLs[i], err = ParseURL(u); err != nil {
			return nil, fmt.Errorf("parsing url %q: %w", u, err)
		}
	}

	follows, err := compileRegexps(opts.FollowRegexps)
	if err != nil {
		return nil, fmt.Errorf("compiling follows: %w", err)
	}

	ignores, err := compileRegexps(opts.IgnoreRegexps)
	if err != nil {
		return nil, fmt.Errorf("compiling ignores: %w", err)
	}

	d := &DeadLinks{
		opts:    *opts,
		store:   store,
		follows: follows,
		ignores: ignores,
	}

	d.clients = make([]Client, d.opts.Concurrency)
	for i := range d.clients {
		d.clients[i] = d.opts.NewClient()
	}

	if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
		return nil, fmt.Errorf("pinning URLs: %w", err)
	}

	return d, nil
}

func (d *DeadLinks) onError(ctx context.Context, err error) {
	if d.opts.OnError == nil || (ctx.Err() != nil && errors.Is(err, ctx.Err())) {
		return
	}
	d.opts.OnError(err)
}

func matchesAnyRegexp(url URL, rr []*regexp.Regexp) bool {
	urlStr := string(url)
	for _, r := range rr {
		if r.MatchString(urlStr) {
			return true
		}
	}
	return false
}

func (d *DeadLinks) getURL(
	ctx context.Context, client Client, url URL,
) (
	[]URL, error,
) {
	if matchesAnyRegexp(url, d.ignores) {
		return nil, nil
	}

	ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
	defer cancel()

	mimeType, body, err := client.Get(ctx, url)
	if err != nil {
		return nil, err
	}
	defer body.Close()

	// strip off any mimeType parameters, e.g. "; charset=utf-8"
	if i := strings.Index(mimeType, ";"); i > 0 {
		mimeType = mimeType[:i]
	}

	if !matchesAnyRegexp(url, d.follows) {
		return nil, nil
	}

	return d.opts.Parser.Parse(mimeType, body)
}
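
// A Client performs the actual fetching of resources; NewClient returns the
// default implementation, but the interface can also be implemented directly.
// The following stub is a rough sketch inferred from how getURL calls Get
// above (the authoritative interface definition lives alongside NewClient),
// and its returned values are placeholders:
//
//	type stubClient struct{}
//
//	func (stubClient) Get(
//		ctx context.Context, url deadlinks.URL,
//	) (
//		string, io.ReadCloser, error,
//	) {
//		body := io.NopCloser(strings.NewReader("<html></html>"))
//		return "text/html", body, nil
//	}
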
// checkURL only returns an error if storing the results of the check fails.
func (d *DeadLinks) checkURL(
	ctx context.Context, client Client, url URL,
) error {
	var (
		now      = time.Now()
		status   = ResourceStatusOK
		errorStr string
	)

	outgoingURLs, err := d.getURL(ctx, client, url)
	if err != nil {
		status = ResourceStatusError
		errorStr = err.Error()
	}

	for i := range outgoingURLs {
		outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
	}

	err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
	if err != nil {
		return fmt.Errorf(
			"failed storing status:%v (errorStr:%q) and %d outgoingURLs: %w",
			status, errorStr, len(outgoingURLs), err,
		)
	}

	return nil
}

// update returns the number of URLs checked.
func (d *DeadLinks) update(
	ctx context.Context, lastCheckedBefore time.Time,
) (
	int, error,
) {
	var (
		wg = new(sync.WaitGroup)
		ch = make(chan URL, d.opts.Concurrency)
	)

	wg.Add(d.opts.Concurrency)
	for i := 0; i < d.opts.Concurrency; i++ {
		go func(client Client) {
			defer wg.Done()
			for url := range ch {
				if err := d.checkURL(ctx, client, url); err != nil {
					d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
				}
			}
		}(d.clients[i])
	}

	var (
		n        int
		err      error
		urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
	)

	err = miter.ForEach(ctx, urlsIter, func(url URL) error {
		ch <- url
		n++
		return nil
	})

	close(ch)
	wg.Wait()

	if err != nil {
		return 0, fmt.Errorf("iterating URLs needing to be checked: %w", err)
	}

	return n, nil
}

// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
	ctx context.Context, lastCheckedBefore time.Time,
) error {
	// Because we are iterating over the same dataset which is being updated
	// it is required that we re-attempt the update multiple times, until all
	// possible updates have been done.
	for {
		n, err := d.update(ctx, lastCheckedBefore)
		if err != nil {
			return err
		} else if n == 0 {
			break
		}
	}

	if err := d.store.GC(ctx); err != nil {
		return fmt.Errorf("garbage collecting: %w", err)
	}

	return nil
}

// GetByStatus returns an iterator which will return all Resources with the
// given status.
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
	return d.store.GetByStatus(status)
}
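
// A complete polling loop over the API above might look like the following
// sketch (the one-hour interval, logging, and printing are illustrative
// assumptions, not recommendations). Each pass re-checks anything not checked
// within the last hour, then streams broken links using the same
// miter.ForEach helper this package uses internally:
//
//	for {
//		if err := dl.Update(ctx, time.Now().Add(-1*time.Hour)); err != nil {
//			log.Printf("updating: %v", err)
//		}
//
//		err := miter.ForEach(
//			ctx,
//			dl.GetByStatus(deadlinks.ResourceStatusError),
//			func(r deadlinks.Resource) error {
//				fmt.Printf("broken link: %+v\n", r)
//				return nil
//			},
//		)
//		if err != nil {
//			log.Printf("iterating broken links: %v", err)
//		}
//
//		time.Sleep(1 * time.Hour)
//	}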