Compare commits

...

5 Commits

7 changed files with 240 additions and 25 deletions

14
LICENSE.txt Normal file
View File

@@ -0,0 +1,14 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

44
README.md Normal file
View File

@@ -0,0 +1,44 @@
# DeadLinks
A tool for crawling and finding links to URLs which no longer exist. deadlinks
supports the HTTP(S) and gemini protocols, and is intended for periodically
checking links on personal websites and blogs.
## Library
The `deadlinks` package is designed to be easily embedded into a process and
have its results displayed in something like a status page.
[See the godocs for more info.](https://godocs.io/code.betamike.com/mediocregopher/deadlinks)
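A rough sketch of embedding the package, mirroring the examples in the package
docs (the URLs here are placeholders and error handling is omitted):
```
// Track https://some.website.com, and recursively crawl any URL matching
// "website.com".
store := deadlinks.NewSQLiteStore(nil)
defer store.Close()

dl, err := deadlinks.New(ctx, store, []string{"https://some.website.com"}, []string{"website.com"}, nil)

// Crawl all links, then collect every resource whose last check failed.
err = dl.Update(ctx, time.Now())
erroredResources, err := miter.ToSlice(ctx, dl.GetByStatus(deadlinks.ResourceStatusError))
```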
## Command-Line
The command-line utility can be installed using `go install`:
```
go install code.betamike.com/mediocregopher/deadlinks/cmd/deadlinks
```
The `-urls` parameter is required. Given one or more URLs, it will check each
one for dead links:
```
deadlinks -urls 'https://mediocregopher.com,gemini://mediocregopher.com'
```
Any links which are dead will be output to stdout as YAML objects, each
containing the dead URL, the error encountered, and which pages link to it.
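For illustration, a single reported object might look something like this (the
URL, timestamp, and error text are invented for the example):
```
url: https://mediocregopher.com/some-dead-page
last_checked: 2024-01-01T00:00:00Z
error: 'unexpected response code: 404'
incoming_links:
    - https://mediocregopher.com/
outgoing_links: []
```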
In order to recursively crawl through links, you can give one or more regex
patterns. Any URL which matches a pattern will have its links checked as well
(and if any of those linked URLs match a pattern, their links will be checked,
and so on):
```
deadlinks \
-urls 'https://mediocregopher.com,gemini://mediocregopher.com' \
-patterns '://mediocregopher.com'
```
There are further options available which affect the utility's behavior; see
`deadlinks -h` for more.
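For example, a periodic run against a persistent store, so that resources
checked within the last day are not re-checked (the storage path here is an
arbitrary choice):
```
deadlinks \
    -store-path deadlinks.sqlite \
    -max-age 24h \
    -urls 'https://mediocregopher.com,gemini://mediocregopher.com' \
    -patterns '://mediocregopher.com'
```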

View File

@@ -78,6 +78,7 @@ type client struct {
//
// Some schemes automatically return success:
// - mailto
// - data
func NewClient(opts *ClientOpts) Client {
return &client{*opts.withDefaults()}
}
@@ -261,7 +262,7 @@ func (c *client) get(
return c.getGemini(ctx, url, redirectDepth)
case "http", "https":
return c.getHTTP(ctx, url, redirectDepth)
case "mailto":
case "mailto", "data":
return c.noOpGet()
default:
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)

97
cmd/deadlinks/main.go Normal file
View File

@@ -0,0 +1,97 @@
package main
import (
"context"
"flag"
"io"
"log"
"os"
"os/signal"
"runtime"
"strings"
"time"
"code.betamike.com/mediocregopher/deadlinks"
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
"gopkg.in/yaml.v3"
)
type loggingClient struct {
inner deadlinks.Client
}
func (c loggingClient) Get(
ctx context.Context, url deadlinks.URL,
) (
string, io.ReadCloser, error,
) {
log.Printf("querying %q", url)
return c.inner.Get(ctx, url)
}
func main() {
var (
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
urls = flag.String("urls", "", `Comma-separated list of URLs which are always checked. At least one is required`)
patternsStr = flag.String("patterns", "", "Comma-separated list of regexps. All URLs which match one of these will have their links checked as well")
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number of simultaneous requests to make at a time")
)
flag.Parse()
if *urls == "" {
log.Fatal("-urls is required")
}
var patterns []string
if *patternsStr != "" {
patterns = strings.Split(*patternsStr, ",")
}
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
defer cancel()
store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
Path: *storePath,
})
defer store.Close()
dl, err := deadlinks.New(
ctx,
store,
strings.Split(*urls, ","),
patterns,
&deadlinks.Opts{
NewClient: func() deadlinks.Client {
return loggingClient{deadlinks.NewClient(nil)}
},
Concurrency: *concurrency,
OnError: func(err error) {
log.Printf("runtime error: %v", err)
},
},
)
if err != nil {
log.Fatalf("initialization error: %v", err)
}
lastCheckedBefore := time.Now().Add(-*maxAge)
if err := dl.Update(ctx, lastCheckedBefore); err != nil {
log.Fatalf("update encountered error: %v", err)
}
enc := yaml.NewEncoder(os.Stdout)
defer os.Stdout.Sync()
iter := dl.GetByStatus(deadlinks.ResourceStatusError)
err = miter.ForEach(ctx, iter, func(r deadlinks.Resource) error {
return enc.Encode(r)
})
if err != nil {
log.Fatalf("iterating over errored resources failed: %v", err)
}
}

View File

@@ -1,6 +1,50 @@
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
// gemtext documents.
//
// # URLs
//
// DeadLinks crawls and keeps track of hyperlinks between different
// resources, such as webpages and gemtext documents. If a resource is not
// linked to from any other resource, then DeadLinks forgets about it.
//
// For this reason DeadLinks requires a starting set of URLs which it will not
// forget about; these are the pinned URLs. Pinned URLs act as the starting
// point for crawling.
//
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
// resource, but it will not by default recurse into _that_ resource's links. It
// will only do so if the URL matches one of the regex patterns which DeadLinks
// was configured with.
//
// # Basic Usage
//
// DeadLinks can be initialized using `New`:
//
// store := deadlinks.NewSQLiteStore(nil)
// defer store.Close()
//
// pinnedURLs := []string{"https://some.website.com"}
// patterns := []string{"website.com"}
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
//
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
//
// err := dl.Update(ctx, time.Now())
//
// Finally, `GetByStatus` can be used to query all discovered resources based on
// their current status. To retrieve all resources which have some error
// (indicating a broken link):
//
// erroredResources, err := miter.ToSlice(
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
// )
//
// Note that `GetByStatus` returns a `miter.Iterator`; see its documentation for
// more options on how to use it beyond `ToSlice`:
//
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
//
// # Storage
//
// By default DeadLinks uses an in-memory SQLite database for tracking the
@@ -12,9 +56,12 @@
// })
// defer store.Close()
//
// dl, err := deadlinks.New(
// ctx, store, pinnedURLs, patterns, nil,
// )
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
//
// # Further Customization
//
// Most functionality of DeadLinks can be extended or superseded by injecting
// alternate interface implementations via the various Opts structs.
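//
// For example, a sketch of injecting a custom Client implementation (the
// myClient type here is hypothetical):
//
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, &deadlinks.Opts{
//     NewClient: func() deadlinks.Client { return myClient{} },
// })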
package deadlinks
import (
@@ -33,13 +80,13 @@ import (
// Opts are optional fields which can be provided to New. A nil Opts is
// equivalent to an empty one.
type Opts struct {
Client Client // Defaults to `NewClient(nil)`
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
Parser Parser // Defaults to `NewParser()`
// Concurrency determines the maximum number of URLs which can be checked
// simultaneously.
//
// Default: `runtime.NumCPU()`
// Default: `runtime.NumCPU() / 2`
Concurrency int
// OnError, if set, will be called whenever DeadLinks encounters an error
@@ -49,7 +96,7 @@ type Opts struct {
// RequestTimeout determines how long a request for a resource can run
// before the resource is considered unavailable.
//
// Default: 10 * time.Second
// Default: 1 * time.Minute
RequestTimeout time.Duration
}
@@ -58,8 +105,8 @@ func (o *Opts) withDefaults() *Opts {
o = new(Opts)
}
if o.Client == nil {
o.Client = NewClient(nil)
if o.NewClient == nil {
o.NewClient = func() Client { return NewClient(nil) }
}
if o.Parser == nil {
@@ -67,11 +114,11 @@ func (o *Opts) withDefaults() *Opts {
}
if o.Concurrency == 0 {
o.Concurrency = runtime.NumCPU()
o.Concurrency = runtime.NumCPU() / 2
}
if o.RequestTimeout == 0 {
o.RequestTimeout = 10 * time.Second
o.RequestTimeout = 1 * time.Minute
}
return o
@@ -87,6 +134,7 @@ type DeadLinks struct {
opts Opts
store Store
patterns []*regexp.Regexp
clients []Client
}
// New initializes and returns a DeadLinks instance which will track the
@@ -129,6 +177,11 @@ func New(
patterns: patterns,
}
d.clients = make([]Client, d.opts.Concurrency)
for i := range d.clients {
d.clients[i] = d.opts.NewClient()
}
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
return nil, fmt.Errorf("pinning URLs: %w", err)
}
@@ -155,11 +208,15 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
return false
}
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
func (d *DeadLinks) getURL(
ctx context.Context, client Client, url URL,
) (
[]URL, error,
) {
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel()
mimeType, body, err := d.opts.Client.Get(ctx, url)
mimeType, body, err := client.Get(ctx, url)
if err != nil {
return nil, err
}
@@ -178,14 +235,16 @@ func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
}
// checkURL only returns an error if storing the results of the check fails.
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
func (d *DeadLinks) checkURL(
ctx context.Context, client Client, url URL,
) error {
var (
now = time.Now()
status = ResourceStatusOK
errorStr string
)
outgoingURLs, err := d.getURL(ctx, url)
outgoingURLs, err := d.getURL(ctx, client, url)
if err != nil {
status = ResourceStatusError
errorStr = err.Error()
@@ -219,14 +278,14 @@ func (d *DeadLinks) update(
wg.Add(d.opts.Concurrency)
for i := 0; i < d.opts.Concurrency; i++ {
go func() {
go func(client Client) {
defer wg.Done()
for url := range ch {
if err := d.checkURL(ctx, url); err != nil {
if err := d.checkURL(ctx, client, url); err != nil {
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
}
}
}()
}(d.clients[i])
}
var (

View File

@@ -35,7 +35,6 @@
pkgs.go
pkgs.gotools
pkgs.golangci-lint
pkgs.sqlite
];
};
});

View File

@@ -32,14 +32,15 @@ func (ds ResourceStatus) String() string {
// uniquely identified by a URL.
type Resource struct {
URL URL
Status ResourceStatus
Pinned bool
LastChecked time.Time
Status ResourceStatus `yaml:"-"`
Pinned bool `yaml:"-"`
LastChecked time.Time `yaml:"last_checked"`
// only set if Status == ResourceStatusError
ErrorString string
ErrorString string `yaml:"error"`
// Indicate the URLs of resources which link to/are linked from this
// resource.
IncomingLinkURLs, OutgoingLinkURLs []URL
IncomingLinkURLs []URL `yaml:"incoming_links"`
OutgoingLinkURLs []URL `yaml:"outgoing_links"`
}