Compare commits
5 Commits
b6c20e57f9
...
c6361ea488
Author | SHA1 | Date | |
---|---|---|---|
c6361ea488 | |||
07a5acceaf | |||
eaccb83a7b | |||
307e311b61 | |||
d5c85c16b9 |
14
LICENSE.txt
Normal file
14
LICENSE.txt
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
Version 2, December 2004
|
||||||
|
|
||||||
|
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||||
|
|
||||||
|
Everyone is permitted to copy and distribute verbatim or modified
|
||||||
|
copies of this license document, and changing it is allowed as long
|
||||||
|
as the name is changed.
|
||||||
|
|
||||||
|
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||||
|
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||||
|
|
||||||
|
0. You just DO WHAT THE FUCK YOU WANT TO.
|
||||||
|
|
44
README.md
Normal file
44
README.md
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
# DeadLinks
|
||||||
|
|
||||||
|
A tool for crawling and finding links to URLs which no longer exist. deadlinks
|
||||||
|
supports the HTTP(S) and Gemini protocols, and is intended for periodically
|
||||||
|
checking links on personal websites and blogs.
|
||||||
|
|
||||||
|
## Library
|
||||||
|
|
||||||
|
The `deadlinks` package is designed to be easily embedded into a process and
|
||||||
|
have its results displayed in something like a status page.
|
||||||
|
|
||||||
|
[See the godocs for more info.](https://godocs.io/code.betamike.com/mediocregopher/deadlinks)
|
||||||
|
|
||||||
|
## Command-Line
|
||||||
|
|
||||||
|
The command-line utility can be installed using `go install`:
|
||||||
|
|
||||||
|
```
|
||||||
|
go install code.betamike.com/mediocregopher/deadlinks/cmd/deadlinks@latest
|
||||||
|
```
|
||||||
|
|
||||||
|
The `-urls` parameter is required. Given one or more URLs it will check each one
|
||||||
|
for any dead links:
|
||||||
|
|
||||||
|
```
|
||||||
|
deadlinks -urls 'https://mediocregopher.com,gemini://mediocregopher.com'
|
||||||
|
```
|
||||||
|
|
||||||
|
Any links which are dead will be output to stdout as YAML objects, each
|
||||||
|
containing the dead URL, the error encountered, and which pages link to it.
|
||||||
|
|
||||||
|
In order to recursively crawl through links you can give one or more regex
|
||||||
|
patterns. Any URL which matches a pattern will have its links checked as well
|
||||||
|
(and if any of those link URLs match a pattern their links will be checked, and
|
||||||
|
so on):
|
||||||
|
|
||||||
|
```
|
||||||
|
deadlinks \
|
||||||
|
-urls 'https://mediocregopher.com,gemini://mediocregopher.com' \
|
||||||
|
-patterns '://mediocregopher.com'
|
||||||
|
```
|
||||||
|
|
||||||
|
There are further options available which affect the utility's behavior, see
|
||||||
|
`deadlinks -h` for more.
|
@ -78,6 +78,7 @@ type client struct {
|
|||||||
//
|
//
|
||||||
// Some schemas automatically return success:
|
// Some schemes automatically return success:
|
||||||
// - mailto
|
// - mailto
|
||||||
|
// - data
|
||||||
func NewClient(opts *ClientOpts) Client {
|
func NewClient(opts *ClientOpts) Client {
|
||||||
return &client{*opts.withDefaults()}
|
return &client{*opts.withDefaults()}
|
||||||
}
|
}
|
||||||
@ -261,7 +262,7 @@ func (c *client) get(
|
|||||||
return c.getGemini(ctx, url, redirectDepth)
|
return c.getGemini(ctx, url, redirectDepth)
|
||||||
case "http", "https":
|
case "http", "https":
|
||||||
return c.getHTTP(ctx, url, redirectDepth)
|
return c.getHTTP(ctx, url, redirectDepth)
|
||||||
case "mailto":
|
case "mailto", "data":
|
||||||
return c.noOpGet()
|
return c.noOpGet()
|
||||||
default:
|
default:
|
||||||
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
||||||
|
97
cmd/deadlinks/main.go
Normal file
97
cmd/deadlinks/main.go
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"flag"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"runtime"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"code.betamike.com/mediocregopher/deadlinks"
|
||||||
|
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
|
||||||
|
"gopkg.in/yaml.v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
type loggingClient struct {
|
||||||
|
inner deadlinks.Client
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c loggingClient) Get(
|
||||||
|
ctx context.Context, url deadlinks.URL,
|
||||||
|
) (
|
||||||
|
string, io.ReadCloser, error,
|
||||||
|
) {
|
||||||
|
log.Printf("querying %q", url)
|
||||||
|
return c.inner.Get(ctx, url)
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
var (
|
||||||
|
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
|
||||||
|
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
|
||||||
|
urls = flag.String("urls", "", `Comma-separated list of URLs which are always checked. At least one is required`)
|
||||||
|
patternsStr = flag.String("patterns", "", "Comma-separated list of regexps. All URLs which match one of these will have their links checked as well")
|
||||||
|
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
|
||||||
|
)
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if *urls == "" {
|
||||||
|
log.Fatal("-urls is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
var patterns []string
|
||||||
|
if *patternsStr != "" {
|
||||||
|
patterns = strings.Split(*patternsStr, ",")
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
||||||
|
Path: *storePath,
|
||||||
|
})
|
||||||
|
defer store.Close()
|
||||||
|
|
||||||
|
dl, err := deadlinks.New(
|
||||||
|
ctx,
|
||||||
|
store,
|
||||||
|
strings.Split(*urls, ","),
|
||||||
|
patterns,
|
||||||
|
&deadlinks.Opts{
|
||||||
|
NewClient: func() deadlinks.Client {
|
||||||
|
return loggingClient{deadlinks.NewClient(nil)}
|
||||||
|
},
|
||||||
|
Concurrency: *concurrency,
|
||||||
|
OnError: func(err error) {
|
||||||
|
log.Printf("runtime error: %v", err)
|
||||||
|
},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("initialization error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
lastCheckedBefore := time.Now().Add(-*maxAge)
|
||||||
|
|
||||||
|
if err := dl.Update(ctx, lastCheckedBefore); err != nil {
|
||||||
|
log.Fatalf("update encountered error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
enc := yaml.NewEncoder(os.Stdout)
|
||||||
|
defer os.Stdout.Sync()
|
||||||
|
|
||||||
|
iter := dl.GetByStatus(deadlinks.ResourceStatusError)
|
||||||
|
err = miter.ForEach(ctx, iter, func(r deadlinks.Resource) error {
|
||||||
|
return enc.Encode(r)
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("iterating over errored resources failed: %v", err)
|
||||||
|
}
|
||||||
|
}
|
93
deadlinks.go
93
deadlinks.go
@ -1,6 +1,50 @@
|
|||||||
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
||||||
// gemtext documents.
|
// gemtext documents.
|
||||||
//
|
//
|
||||||
|
// # URLs
|
||||||
|
//
|
||||||
|
// DeadLinks crawls and keeps track of hyperlinks between different
|
||||||
|
// resources, such as webpages and gemtext documents. If a resource is not
|
||||||
|
// linked to from any other resources then DeadLinks forgets about it.
|
||||||
|
//
|
||||||
|
// For this reason it is required to have a starting set of URLs which DeadLinks
|
||||||
|
// will not forget about; these are the pinned URLs. Pinned URLs act as the
|
||||||
|
// starting point for crawling.
|
||||||
|
//
|
||||||
|
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
|
||||||
|
// resource, but it will not by default recur into _that_ resource's links. It
|
||||||
|
// will only do so if the URL matches one of the given regex patterns which
|
||||||
|
// DeadLinks was configured with.
|
||||||
|
//
|
||||||
|
// # Basic Usage
|
||||||
|
//
|
||||||
|
// DeadLinks can be initialized using `New`:
|
||||||
|
//
|
||||||
|
// store := deadlinks.NewSQLiteStore(nil)
|
||||||
|
// defer store.Close()
|
||||||
|
//
|
||||||
|
// pinnedURLs := []string{"https://some.website.com"}
|
||||||
|
// patterns := []string{"website.com"}
|
||||||
|
//
|
||||||
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
||||||
|
//
|
||||||
|
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
||||||
|
//
|
||||||
|
// err := dl.Update(ctx, time.Now())
|
||||||
|
//
|
||||||
|
// Finally, `GetByStatus` can be used to query all discovered resources based on
|
||||||
|
// their current status. To retrieve all resources which have some error
|
||||||
|
// (indicating a broken link):
|
||||||
|
//
|
||||||
|
// erroredResources, err := miter.ToSlice(
|
||||||
|
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
|
||||||
|
// )
|
||||||
|
//
|
||||||
|
// Note that `GetByStatus` returns a `miter.Iterator`, see its documentation for
|
||||||
|
// more options on how to use it beyond `ToSlice`:
|
||||||
|
//
|
||||||
|
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
|
||||||
|
//
|
||||||
// # Storage
|
// # Storage
|
||||||
//
|
//
|
||||||
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
||||||
@ -12,9 +56,12 @@
|
|||||||
// })
|
// })
|
||||||
// defer store.Close()
|
// defer store.Close()
|
||||||
//
|
//
|
||||||
// dl, err := deadlinks.New(
|
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
||||||
// ctx, store, pinnedURLs, patterns, nil,
|
//
|
||||||
// )
|
// # Further Customization
|
||||||
|
//
|
||||||
|
// Most functionality of DeadLinks can be extended or superseded by injecting
|
||||||
|
// alternate interface implementations via the various Opts structs.
|
||||||
package deadlinks
|
package deadlinks
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@ -33,13 +80,13 @@ import (
|
|||||||
// Opts are optional fields which can be provided to New. A nil Opts is
|
// Opts are optional fields which can be provided to New. A nil Opts is
|
||||||
// equivalent to an empty one.
|
// equivalent to an empty one.
|
||||||
type Opts struct {
|
type Opts struct {
|
||||||
Client Client // Defaults to `NewClient(nil)`
|
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
||||||
Parser Parser // Defaults to `NewParser()`
|
Parser Parser // Defaults to `NewParser()`
|
||||||
|
|
||||||
// Concurrency determines the maximum number of URLs which can be checked
|
// Concurrency determines the maximum number of URLs which can be checked
|
||||||
// simultaneously.
|
// simultaneously.
|
||||||
//
|
//
|
||||||
// Default: `runtime.NumCPU()`
|
// Default: `runtime.NumCPU() / 2`
|
||||||
Concurrency int
|
Concurrency int
|
||||||
|
|
||||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||||
@ -49,7 +96,7 @@ type Opts struct {
|
|||||||
// RequestTimeout determines how long a request for a resource can run
|
// RequestTimeout determines how long a request for a resource can run
|
||||||
// before the resource is considered unavailable.
|
// before the resource is considered unavailable.
|
||||||
//
|
//
|
||||||
// Default: 10 * time.Second
|
// Default: 1 * time.Minute
|
||||||
RequestTimeout time.Duration
|
RequestTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,8 +105,8 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
o = new(Opts)
|
o = new(Opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.Client == nil {
|
if o.NewClient == nil {
|
||||||
o.Client = NewClient(nil)
|
o.NewClient = func() Client { return NewClient(nil) }
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.Parser == nil {
|
if o.Parser == nil {
|
||||||
@ -67,11 +114,11 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if o.Concurrency == 0 {
|
if o.Concurrency == 0 {
|
||||||
o.Concurrency = runtime.NumCPU()
|
o.Concurrency = runtime.NumCPU() / 2
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.RequestTimeout == 0 {
|
if o.RequestTimeout == 0 {
|
||||||
o.RequestTimeout = 10 * time.Second
|
o.RequestTimeout = 1 * time.Minute
|
||||||
}
|
}
|
||||||
|
|
||||||
return o
|
return o
|
||||||
@ -87,6 +134,7 @@ type DeadLinks struct {
|
|||||||
opts Opts
|
opts Opts
|
||||||
store Store
|
store Store
|
||||||
patterns []*regexp.Regexp
|
patterns []*regexp.Regexp
|
||||||
|
clients []Client
|
||||||
}
|
}
|
||||||
|
|
||||||
// New initializes and returns a DeadLinks instance which will track the
|
// New initializes and returns a DeadLinks instance which will track the
|
||||||
@ -129,6 +177,11 @@ func New(
|
|||||||
patterns: patterns,
|
patterns: patterns,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
d.clients = make([]Client, d.opts.Concurrency)
|
||||||
|
for i := range d.clients {
|
||||||
|
d.clients[i] = d.opts.NewClient()
|
||||||
|
}
|
||||||
|
|
||||||
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
||||||
return nil, fmt.Errorf("pinning URLs: %w", err)
|
return nil, fmt.Errorf("pinning URLs: %w", err)
|
||||||
}
|
}
|
||||||
@ -155,11 +208,15 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
func (d *DeadLinks) getURL(
|
||||||
|
ctx context.Context, client Client, url URL,
|
||||||
|
) (
|
||||||
|
[]URL, error,
|
||||||
|
) {
|
||||||
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
mimeType, body, err := client.Get(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -178,14 +235,16 @@ func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// checkURL only returns an error if storing the results of the check fails.
|
// checkURL only returns an error if storing the results of the check fails.
|
||||||
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
func (d *DeadLinks) checkURL(
|
||||||
|
ctx context.Context, client Client, url URL,
|
||||||
|
) error {
|
||||||
var (
|
var (
|
||||||
now = time.Now()
|
now = time.Now()
|
||||||
status = ResourceStatusOK
|
status = ResourceStatusOK
|
||||||
errorStr string
|
errorStr string
|
||||||
)
|
)
|
||||||
|
|
||||||
outgoingURLs, err := d.getURL(ctx, url)
|
outgoingURLs, err := d.getURL(ctx, client, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
status = ResourceStatusError
|
status = ResourceStatusError
|
||||||
errorStr = err.Error()
|
errorStr = err.Error()
|
||||||
@ -219,14 +278,14 @@ func (d *DeadLinks) update(
|
|||||||
|
|
||||||
wg.Add(d.opts.Concurrency)
|
wg.Add(d.opts.Concurrency)
|
||||||
for i := 0; i < d.opts.Concurrency; i++ {
|
for i := 0; i < d.opts.Concurrency; i++ {
|
||||||
go func() {
|
go func(client Client) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
for url := range ch {
|
for url := range ch {
|
||||||
if err := d.checkURL(ctx, url); err != nil {
|
if err := d.checkURL(ctx, client, url); err != nil {
|
||||||
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}(d.clients[i])
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -35,7 +35,6 @@
|
|||||||
pkgs.go
|
pkgs.go
|
||||||
pkgs.gotools
|
pkgs.gotools
|
||||||
pkgs.golangci-lint
|
pkgs.golangci-lint
|
||||||
pkgs.sqlite
|
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
11
resource.go
11
resource.go
@ -32,14 +32,15 @@ func (ds ResourceStatus) String() string {
|
|||||||
// uniquely identified by a URL.
|
// uniquely identified by a URL.
|
||||||
type Resource struct {
|
type Resource struct {
|
||||||
URL URL
|
URL URL
|
||||||
Status ResourceStatus
|
Status ResourceStatus `yaml:"-"`
|
||||||
Pinned bool
|
Pinned bool `yaml:"-"`
|
||||||
LastChecked time.Time
|
LastChecked time.Time `yaml:"last_checked"`
|
||||||
|
|
||||||
// only set if Status == ResourceStatusError
|
// only set if Status == ResourceStatusError
|
||||||
ErrorString string
|
ErrorString string `yaml:"error"`
|
||||||
|
|
||||||
// Indicate the URLs of resources which link to/are linked from this
|
// Indicate the URLs of resources which link to/are linked from this
|
||||||
// resource.
|
// resource.
|
||||||
IncomingLinkURLs, OutgoingLinkURLs []URL
|
IncomingLinkURLs []URL `yaml:"incoming_links"`
|
||||||
|
OutgoingLinkURLs []URL `yaml:"outgoing_links"`
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user