|
|
|
@ -1,6 +1,50 @@ |
|
|
|
|
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
|
|
|
|
// gemtext documents.
|
|
|
|
|
//
|
|
|
|
|
// # URLs
|
|
|
|
|
//
|
|
|
|
|
// DeadLinks crawls and keeps track of hyperlinks between different
|
|
|
|
|
// resources, such as webpages and gemtext documents. If a resource is not
|
|
|
|
|
// linked to from any other resources then DeadLinks forgets about it.
|
|
|
|
|
//
|
|
|
|
|
// For this reason it is required to have a starting set of URLs which DeadLinks
|
|
|
|
|
// will not forget about; these are the pinned URLs. Pinned URLs act as the
|
|
|
|
|
// starting point for crawling.
|
|
|
|
|
//
|
|
|
|
|
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
|
|
|
|
|
// resource, but it will not by default recur into _that_ resource's links. It
|
|
|
|
|
// will only do so if the URL matches one of the given regex patterns which
|
|
|
|
|
// DeadLinks was configured with.
|
|
|
|
|
//
|
|
|
|
|
// # Basic Usage
|
|
|
|
|
//
|
|
|
|
|
// DeadLinks can be initialized using `New`:
|
|
|
|
|
//
|
|
|
|
|
// store := deadlinks.NewSQLiteStore(nil)
|
|
|
|
|
// defer store.Close()
|
|
|
|
|
//
|
|
|
|
|
// pinnedURLs := []string{"https://some.website.com"}
|
|
|
|
|
// patterns := []string{"website.com"}
|
|
|
|
|
//
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
|
//
|
|
|
|
|
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
|
|
|
|
//
|
|
|
|
|
// err := dl.Update(ctx, time.Now())
|
|
|
|
|
//
|
|
|
|
|
// Finally, `GetByStatus` can be used to query all discovered resources based on
|
|
|
|
|
// their current status. To retrieve all resources which have some error
|
|
|
|
|
// (indicating a broken link):
|
|
|
|
|
//
|
|
|
|
|
// erroredResources, err := miter.ToSlice(
|
|
|
|
|
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
|
|
|
|
|
// )
|
|
|
|
|
//
|
|
|
|
|
// Note that `GetByStatus` returns a `miter.Iterator`; see its documentation for
|
|
|
|
|
// more options on how to use it beyond `ToSlice`:
|
|
|
|
|
//
|
|
|
|
|
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
|
|
|
|
|
//
|
|
|
|
|
// # Storage
|
|
|
|
|
//
|
|
|
|
|
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
|
|
|
@ -12,9 +56,12 @@ |
|
|
|
|
// })
|
|
|
|
|
// defer store.Close()
|
|
|
|
|
//
|
|
|
|
|
// dl, err := deadlinks.New(
|
|
|
|
|
// ctx, store, pinnedURLs, patterns, nil,
|
|
|
|
|
// )
|
|
|
|
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
|
|
|
//
|
|
|
|
|
// # Further Customization
|
|
|
|
|
//
|
|
|
|
|
// Most functionality of DeadLinks can be extended or superseded by injecting
|
|
|
|
|
// alternate interface implementations via the various Opts structs.
|
|
|
|
|
package deadlinks |
|
|
|
|
|
|
|
|
|
import ( |
|
|
|
@ -33,13 +80,13 @@ import ( |
|
|
|
|
// Opts are optional fields which can be provided to New. A nil Opts is
|
|
|
|
|
// equivalent to an empty one.
|
|
|
|
|
type Opts struct { |
|
|
|
|
Client Client // Defaults to `NewClient(nil)`
|
|
|
|
|
Parser Parser // Defaults to `NewParser()`
|
|
|
|
|
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
|
|
|
|
Parser Parser // Defaults to `NewParser()`
|
|
|
|
|
|
|
|
|
|
// Concurrency determines the maximum number of URLs which can be checked
|
|
|
|
|
// simultaneously.
|
|
|
|
|
//
|
|
|
|
|
// Default: `runtime.NumCPU()`
|
|
|
|
|
// Default: `runtime.NumCPU() / 2`
|
|
|
|
|
Concurrency int |
|
|
|
|
|
|
|
|
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
|
|
|
@ -49,7 +96,7 @@ type Opts struct { |
|
|
|
|
// RequestTimeout determines how long a request for a resource can run
|
|
|
|
|
// before the resource is considered unavailable.
|
|
|
|
|
//
|
|
|
|
|
// Default: 10 * time.Second
|
|
|
|
|
// Default: 1 * time.Minute
|
|
|
|
|
RequestTimeout time.Duration |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -58,8 +105,8 @@ func (o *Opts) withDefaults() *Opts { |
|
|
|
|
o = new(Opts) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.Client == nil { |
|
|
|
|
o.Client = NewClient(nil) |
|
|
|
|
if o.NewClient == nil { |
|
|
|
|
o.NewClient = func() Client { return NewClient(nil) } |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.Parser == nil { |
|
|
|
@ -67,11 +114,11 @@ func (o *Opts) withDefaults() *Opts { |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.Concurrency == 0 { |
|
|
|
|
o.Concurrency = runtime.NumCPU() |
|
|
|
|
o.Concurrency = runtime.NumCPU() / 2 |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.RequestTimeout == 0 { |
|
|
|
|
o.RequestTimeout = 10 * time.Second |
|
|
|
|
o.RequestTimeout = 1 * time.Minute |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return o |
|
|
|
@ -87,6 +134,7 @@ type DeadLinks struct { |
|
|
|
|
opts Opts |
|
|
|
|
store Store |
|
|
|
|
patterns []*regexp.Regexp |
|
|
|
|
clients []Client |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// New initializes and returns a DeadLinks instance which will track the
|
|
|
|
@ -129,6 +177,11 @@ func New( |
|
|
|
|
patterns: patterns, |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
d.clients = make([]Client, d.opts.Concurrency) |
|
|
|
|
for i := range d.clients { |
|
|
|
|
d.clients[i] = d.opts.NewClient() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil { |
|
|
|
|
return nil, fmt.Errorf("pinning URLs: %w", err) |
|
|
|
|
} |
|
|
|
@ -155,11 +208,15 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool { |
|
|
|
|
return false |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { |
|
|
|
|
func (d *DeadLinks) getURL( |
|
|
|
|
ctx context.Context, client Client, url URL, |
|
|
|
|
) ( |
|
|
|
|
[]URL, error, |
|
|
|
|
) { |
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) |
|
|
|
|
defer cancel() |
|
|
|
|
|
|
|
|
|
mimeType, body, err := d.opts.Client.Get(ctx, url) |
|
|
|
|
mimeType, body, err := client.Get(ctx, url) |
|
|
|
|
if err != nil { |
|
|
|
|
return nil, err |
|
|
|
|
} |
|
|
|
@ -178,14 +235,16 @@ func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// checkURL only returns an error if storing the results of the check fails.
|
|
|
|
|
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { |
|
|
|
|
func (d *DeadLinks) checkURL( |
|
|
|
|
ctx context.Context, client Client, url URL, |
|
|
|
|
) error { |
|
|
|
|
var ( |
|
|
|
|
now = time.Now() |
|
|
|
|
status = ResourceStatusOK |
|
|
|
|
errorStr string |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
outgoingURLs, err := d.getURL(ctx, url) |
|
|
|
|
outgoingURLs, err := d.getURL(ctx, client, url) |
|
|
|
|
if err != nil { |
|
|
|
|
status = ResourceStatusError |
|
|
|
|
errorStr = err.Error() |
|
|
|
@ -219,14 +278,14 @@ func (d *DeadLinks) update( |
|
|
|
|
|
|
|
|
|
wg.Add(d.opts.Concurrency) |
|
|
|
|
for i := 0; i < d.opts.Concurrency; i++ { |
|
|
|
|
go func() { |
|
|
|
|
go func(client Client) { |
|
|
|
|
defer wg.Done() |
|
|
|
|
for url := range ch { |
|
|
|
|
if err := d.checkURL(ctx, url); err != nil { |
|
|
|
|
if err := d.checkURL(ctx, client, url); err != nil { |
|
|
|
|
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err)) |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
}() |
|
|
|
|
}(d.clients[i]) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
var ( |
|
|
|
|