Compare commits
5 Commits
b6c20e57f9
...
c6361ea488
Author | SHA1 | Date | |
---|---|---|---|
c6361ea488 | |||
07a5acceaf | |||
eaccb83a7b | |||
307e311b61 | |||
d5c85c16b9 |
14
LICENSE.txt
Normal file
14
LICENSE.txt
Normal file
@ -0,0 +1,14 @@
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
Version 2, December 2004
|
||||
|
||||
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim or modified
|
||||
copies of this license document, and changing it is allowed as long
|
||||
as the name is changed.
|
||||
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. You just DO WHAT THE FUCK YOU WANT TO.
|
||||
|
44
README.md
Normal file
44
README.md
Normal file
@ -0,0 +1,44 @@
|
||||
# DeadLinks
|
||||
|
||||
A tool for crawling and finding links to URLs which no longer exist. deadlinks
|
||||
supports the HTTP(s) and gemini protocols, and is intended for periodically
|
||||
checking links on personal websites and blogs.
|
||||
|
||||
## Library
|
||||
|
||||
The `deadlinks` package is designed to be easily embedded into a process and
|
||||
have its results displayed in something like a status page.
|
||||
|
||||
[See the godocs for more info.](https://godocs.io/code.betamike.com/mediocregopher/deadlinks)
|
||||
|
||||
## Command-Line
|
||||
|
||||
The command-line utility can be installed using `go install`:
|
||||
|
||||
```
|
||||
go install code.betamike.com/mediocregopher/deadlinks/cmd/deadlinks
|
||||
```
|
||||
|
||||
The `-urls` parameter is required. Given one or more URLs it will check each one
|
||||
for any dead links:
|
||||
|
||||
```
|
||||
deadlinks -urls 'https://mediocregopher.com,gemini://mediocregopher.com'
|
||||
```
|
||||
|
||||
Any links which are dead will be output to stdout as YAML objects, each
|
||||
containing the dead URL, the error encountered, and which pages link to it.
|
||||
|
||||
In order to recursively crawl through links you can give one or more regex
|
||||
patterns. Any URL which matches a pattern will have its links checked as well
|
||||
(and if any of those link URLs match a pattern their links will be checked, and
|
||||
so on):
|
||||
|
||||
```
|
||||
deadlinks \
|
||||
-urls 'https://mediocregopher.com,gemini://mediocregopher.com' \
|
||||
-patterns '://mediocregopher.com'
|
||||
```
|
||||
|
||||
There are further options available which affect the utility's behavior, see
|
||||
`deadlinks -h` for more.
|
@ -78,6 +78,7 @@ type client struct {
|
||||
//
|
||||
// Some schemes automatically return success:
|
||||
// - mailto
|
||||
// - data
|
||||
func NewClient(opts *ClientOpts) Client {
|
||||
return &client{*opts.withDefaults()}
|
||||
}
|
||||
@ -261,7 +262,7 @@ func (c *client) get(
|
||||
return c.getGemini(ctx, url, redirectDepth)
|
||||
case "http", "https":
|
||||
return c.getHTTP(ctx, url, redirectDepth)
|
||||
case "mailto":
|
||||
case "mailto", "data":
|
||||
return c.noOpGet()
|
||||
default:
|
||||
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
||||
|
97
cmd/deadlinks/main.go
Normal file
97
cmd/deadlinks/main.go
Normal file
@ -0,0 +1,97 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"code.betamike.com/mediocregopher/deadlinks"
|
||||
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
type loggingClient struct {
|
||||
inner deadlinks.Client
|
||||
}
|
||||
|
||||
func (c loggingClient) Get(
|
||||
ctx context.Context, url deadlinks.URL,
|
||||
) (
|
||||
string, io.ReadCloser, error,
|
||||
) {
|
||||
log.Printf("querying %q", url)
|
||||
return c.inner.Get(ctx, url)
|
||||
}
|
||||
|
||||
func main() {
|
||||
var (
|
||||
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
|
||||
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
|
||||
urls = flag.String("urls", "", `Comma-separated list of URLs which are always checked. At least one is required`)
|
||||
patternsStr = flag.String("patterns", "", "Comma-separated list of regexps. All URLs which match one of these will have their links checked as well")
|
||||
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
|
||||
)
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if *urls == "" {
|
||||
log.Fatal("-urls is required")
|
||||
}
|
||||
|
||||
var patterns []string
|
||||
if *patternsStr != "" {
|
||||
patterns = strings.Split(*patternsStr, ",")
|
||||
}
|
||||
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
|
||||
defer cancel()
|
||||
|
||||
store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
||||
Path: *storePath,
|
||||
})
|
||||
defer store.Close()
|
||||
|
||||
dl, err := deadlinks.New(
|
||||
ctx,
|
||||
store,
|
||||
strings.Split(*urls, ","),
|
||||
patterns,
|
||||
&deadlinks.Opts{
|
||||
NewClient: func() deadlinks.Client {
|
||||
return loggingClient{deadlinks.NewClient(nil)}
|
||||
},
|
||||
Concurrency: *concurrency,
|
||||
OnError: func(err error) {
|
||||
log.Printf("runtime error: %v", err)
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("initialization error: %v", err)
|
||||
}
|
||||
|
||||
lastCheckedBefore := time.Now().Add(-*maxAge)
|
||||
|
||||
if err := dl.Update(ctx, lastCheckedBefore); err != nil {
|
||||
log.Fatalf("update encountered error: %v", err)
|
||||
}
|
||||
|
||||
enc := yaml.NewEncoder(os.Stdout)
|
||||
defer os.Stdout.Sync()
|
||||
|
||||
iter := dl.GetByStatus(deadlinks.ResourceStatusError)
|
||||
err = miter.ForEach(ctx, iter, func(r deadlinks.Resource) error {
|
||||
return enc.Encode(r)
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("iterating over errored resources failed: %v", err)
|
||||
}
|
||||
}
|
93
deadlinks.go
93
deadlinks.go
@ -1,6 +1,50 @@
|
||||
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
||||
// gemtext documents.
|
||||
//
|
||||
// # URLs
|
||||
//
|
||||
// DeadLinks crawls and keeps track of hyperlinks between different
|
||||
// resources, such as webpages and gemtext documents. If a resource is not
|
||||
// linked to from any other resources then DeadLinks forgets about it.
|
||||
//
|
||||
// For this reason it is required to have a starting set of URLs which DeadLinks
|
||||
// will not forget about; these are the pinned URLs. Pinned URLs act as the
|
||||
// starting point for crawling.
|
||||
//
|
||||
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
|
||||
// resource, but it will not by default recur into _that_ resource's links. It
|
||||
// will only do so if the URL matches one of the given regex patterns which
|
||||
// DeadLinks was configured with.
|
||||
//
|
||||
// # Basic Usage
|
||||
//
|
||||
// DeadLinks can be initialized using `New`:
|
||||
//
|
||||
// store := deadlinks.NewSQLiteStore(nil)
|
||||
// defer store.Close()
|
||||
//
|
||||
// pinnedURLs := []string{"https://some.website.com"}
|
||||
// patterns := []string{"website.com"}
|
||||
//
|
||||
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
||||
//
|
||||
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
||||
//
|
||||
// err := dl.Update(ctx, time.Now())
|
||||
//
|
||||
// Finally, `GetByStatus` can be used to query all discovered resources based on
|
||||
// their current status. To retrieve all resources which have some error
|
||||
// (indicating a broken link):
|
||||
//
|
||||
// erroredResources, err := miter.ToSlice(
|
||||
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
|
||||
// )
|
||||
//
|
||||
// Note that `GetByStatus` returns a `miter.Iterator`, see its documentation for
|
||||
// more options on how to use it beyond `ToSlice`:
|
||||
//
|
||||
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
|
||||
//
|
||||
// # Storage
|
||||
//
|
||||
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
||||
@ -12,9 +56,12 @@
|
||||
// })
|
||||
// defer store.Close()
|
||||
//
|
||||
// dl, err := deadlinks.New(
|
||||
// ctx, store, pinnedURLs, patterns, nil,
|
||||
// )
|
||||
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
||||
//
|
||||
// # Further Customization
|
||||
//
|
||||
// Most functionality of DeadLinks can be extended or superseded by injecting
|
||||
// alternate interface implementations via the various Opts structs.
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
@ -33,13 +80,13 @@ import (
|
||||
// Opts are optional fields which can be provided to New. A nil Opts is
|
||||
// equivalent to an empty one.
|
||||
type Opts struct {
|
||||
Client Client // Defaults to `NewClient(nil)`
|
||||
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
||||
Parser Parser // Defaults to `NewParser()`
|
||||
|
||||
// Concurrency determines the maximum number of URLs which can be checked
|
||||
// simultaneously.
|
||||
//
|
||||
// Default: `runtime.NumCPU()`
|
||||
// Default: `runtime.NumCPU() / 2`
|
||||
Concurrency int
|
||||
|
||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||
@ -49,7 +96,7 @@ type Opts struct {
|
||||
// RequestTimeout determines how long a request for a resource can run
|
||||
// before the resource is considered unavailable.
|
||||
//
|
||||
// Default: 10 * time.Second
|
||||
// Default: 1 * time.Minute
|
||||
RequestTimeout time.Duration
|
||||
}
|
||||
|
||||
@ -58,8 +105,8 @@ func (o *Opts) withDefaults() *Opts {
|
||||
o = new(Opts)
|
||||
}
|
||||
|
||||
if o.Client == nil {
|
||||
o.Client = NewClient(nil)
|
||||
if o.NewClient == nil {
|
||||
o.NewClient = func() Client { return NewClient(nil) }
|
||||
}
|
||||
|
||||
if o.Parser == nil {
|
||||
@ -67,11 +114,11 @@ func (o *Opts) withDefaults() *Opts {
|
||||
}
|
||||
|
||||
if o.Concurrency == 0 {
|
||||
o.Concurrency = runtime.NumCPU()
|
||||
o.Concurrency = runtime.NumCPU() / 2
|
||||
}
|
||||
|
||||
if o.RequestTimeout == 0 {
|
||||
o.RequestTimeout = 10 * time.Second
|
||||
o.RequestTimeout = 1 * time.Minute
|
||||
}
|
||||
|
||||
return o
|
||||
@ -87,6 +134,7 @@ type DeadLinks struct {
|
||||
opts Opts
|
||||
store Store
|
||||
patterns []*regexp.Regexp
|
||||
clients []Client
|
||||
}
|
||||
|
||||
// New initializes and returns a DeadLinks instance which will track the
|
||||
@ -129,6 +177,11 @@ func New(
|
||||
patterns: patterns,
|
||||
}
|
||||
|
||||
d.clients = make([]Client, d.opts.Concurrency)
|
||||
for i := range d.clients {
|
||||
d.clients[i] = d.opts.NewClient()
|
||||
}
|
||||
|
||||
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
||||
return nil, fmt.Errorf("pinning URLs: %w", err)
|
||||
}
|
||||
@ -155,11 +208,15 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||
func (d *DeadLinks) getURL(
|
||||
ctx context.Context, client Client, url URL,
|
||||
) (
|
||||
[]URL, error,
|
||||
) {
|
||||
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||
defer cancel()
|
||||
|
||||
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
||||
mimeType, body, err := client.Get(ctx, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -178,14 +235,16 @@ func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||
}
|
||||
|
||||
// checkURL only returns an error if storing the results of the check fails.
|
||||
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
||||
func (d *DeadLinks) checkURL(
|
||||
ctx context.Context, client Client, url URL,
|
||||
) error {
|
||||
var (
|
||||
now = time.Now()
|
||||
status = ResourceStatusOK
|
||||
errorStr string
|
||||
)
|
||||
|
||||
outgoingURLs, err := d.getURL(ctx, url)
|
||||
outgoingURLs, err := d.getURL(ctx, client, url)
|
||||
if err != nil {
|
||||
status = ResourceStatusError
|
||||
errorStr = err.Error()
|
||||
@ -219,14 +278,14 @@ func (d *DeadLinks) update(
|
||||
|
||||
wg.Add(d.opts.Concurrency)
|
||||
for i := 0; i < d.opts.Concurrency; i++ {
|
||||
go func() {
|
||||
go func(client Client) {
|
||||
defer wg.Done()
|
||||
for url := range ch {
|
||||
if err := d.checkURL(ctx, url); err != nil {
|
||||
if err := d.checkURL(ctx, client, url); err != nil {
|
||||
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
||||
}
|
||||
}
|
||||
}()
|
||||
}(d.clients[i])
|
||||
}
|
||||
|
||||
var (
|
||||
|
@ -35,7 +35,6 @@
|
||||
pkgs.go
|
||||
pkgs.gotools
|
||||
pkgs.golangci-lint
|
||||
pkgs.sqlite
|
||||
];
|
||||
};
|
||||
});
|
||||
|
11
resource.go
11
resource.go
@ -32,14 +32,15 @@ func (ds ResourceStatus) String() string {
|
||||
// uniquely identified by a URL.
|
||||
type Resource struct {
|
||||
URL URL
|
||||
Status ResourceStatus
|
||||
Pinned bool
|
||||
LastChecked time.Time
|
||||
Status ResourceStatus `yaml:"-"`
|
||||
Pinned bool `yaml:"-"`
|
||||
LastChecked time.Time `yaml:"last_checked"`
|
||||
|
||||
// only set if Status == ResourceStatusError
|
||||
ErrorString string
|
||||
ErrorString string `yaml:"error"`
|
||||
|
||||
// Indicate the URLs of resources which link to/are linked from this
|
||||
// resource.
|
||||
IncomingLinkURLs, OutgoingLinkURLs []URL
|
||||
IncomingLinkURLs []URL `yaml:"incoming_links"`
|
||||
OutgoingLinkURLs []URL `yaml:"outgoing_links"`
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user