Compare commits
No commits in common. "c6361ea4886d6502ba16377ea6494acb74a68c4d" and "b6c20e57f9b26438b5ac6e0e6365d9416b977331" have entirely different histories.
c6361ea488
...
b6c20e57f9
14
LICENSE.txt
14
LICENSE.txt
@ -1,14 +0,0 @@
|
|||||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
|
||||||
Version 2, December 2004
|
|
||||||
|
|
||||||
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
|
||||||
|
|
||||||
Everyone is permitted to copy and distribute verbatim or modified
|
|
||||||
copies of this license document, and changing it is allowed as long
|
|
||||||
as the name is changed.
|
|
||||||
|
|
||||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
|
||||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
|
||||||
|
|
||||||
0. You just DO WHAT THE FUCK YOU WANT TO.
|
|
||||||
|
|
44
README.md
44
README.md
@ -1,44 +0,0 @@
|
|||||||
# DeadLinks
|
|
||||||
|
|
||||||
A tool for crawling and finding links to URLs which no longer exist. deadlinks
|
|
||||||
supports the HTTP(s) and gemini protocols, and is intended for periodically
|
|
||||||
checking links on personal websites and blogs.
|
|
||||||
|
|
||||||
## Library
|
|
||||||
|
|
||||||
The `deadlinks` package is designed to be easily embedded into a process and
|
|
||||||
have its results displayed in something like a status page.
|
|
||||||
|
|
||||||
[See the godocs for more info.](https://godocs.io/code.betamike.com/mediocregopher/deadlinks)
|
|
||||||
|
|
||||||
## Command-Line
|
|
||||||
|
|
||||||
The command-line utility can be installed using `go install`:
|
|
||||||
|
|
||||||
```
|
|
||||||
go install code.betamike.com/mediocregopher/deadlinks/cmd/deadlinks
|
|
||||||
```
|
|
||||||
|
|
||||||
The `-urls` parameter is required. Given one or more URLs it will check each one
|
|
||||||
for any dead links:
|
|
||||||
|
|
||||||
```
|
|
||||||
deadlinks -urls 'https://mediocregopher.com,gemini://mediocregopher.com'
|
|
||||||
```
|
|
||||||
|
|
||||||
Any links which are dead will be output to stdout as YAML objects, each
|
|
||||||
containing the dead URL, the error encountered, and which pages link to it.
|
|
||||||
|
|
||||||
In order to recursively crawl through links you can give one or more regex
|
|
||||||
patterns. Any URL which matches a pattern will have its links checked as well
|
|
||||||
(and if any of those link URLs match a pattern their links will be checked, and
|
|
||||||
so on):
|
|
||||||
|
|
||||||
```
|
|
||||||
deadlinks \
|
|
||||||
-urls 'https://mediocregopher.com,gemini://mediocregopher.com' \
|
|
||||||
-patterns '://mediocregopher.com'
|
|
||||||
```
|
|
||||||
|
|
||||||
There are further options available which affect the utility's behavior; see
|
|
||||||
`deadlinks -h` for more.
|
|
@ -78,7 +78,6 @@ type client struct {
|
|||||||
//
|
//
|
||||||
// Some schemes automatically return success:
|
// Some schemes automatically return success:
|
||||||
// - mailto
|
// - mailto
|
||||||
// - data
|
|
||||||
func NewClient(opts *ClientOpts) Client {
|
func NewClient(opts *ClientOpts) Client {
|
||||||
return &client{*opts.withDefaults()}
|
return &client{*opts.withDefaults()}
|
||||||
}
|
}
|
||||||
@ -262,7 +261,7 @@ func (c *client) get(
|
|||||||
return c.getGemini(ctx, url, redirectDepth)
|
return c.getGemini(ctx, url, redirectDepth)
|
||||||
case "http", "https":
|
case "http", "https":
|
||||||
return c.getHTTP(ctx, url, redirectDepth)
|
return c.getHTTP(ctx, url, redirectDepth)
|
||||||
case "mailto", "data":
|
case "mailto":
|
||||||
return c.noOpGet()
|
return c.noOpGet()
|
||||||
default:
|
default:
|
||||||
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
||||||
|
@ -1,97 +0,0 @@
|
|||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"flag"
|
|
||||||
"io"
|
|
||||||
"log"
|
|
||||||
"os"
|
|
||||||
"os/signal"
|
|
||||||
"runtime"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"code.betamike.com/mediocregopher/deadlinks"
|
|
||||||
"code.betamike.com/mediocregopher/mediocre-go-lib/miter"
|
|
||||||
"gopkg.in/yaml.v3"
|
|
||||||
)
|
|
||||||
|
|
||||||
type loggingClient struct {
|
|
||||||
inner deadlinks.Client
|
|
||||||
}
|
|
||||||
|
|
||||||
func (c loggingClient) Get(
|
|
||||||
ctx context.Context, url deadlinks.URL,
|
|
||||||
) (
|
|
||||||
string, io.ReadCloser, error,
|
|
||||||
) {
|
|
||||||
log.Printf("querying %q", url)
|
|
||||||
return c.inner.Get(ctx, url)
|
|
||||||
}
|
|
||||||
|
|
||||||
func main() {
|
|
||||||
var (
|
|
||||||
storePath = flag.String("store-path", "", "Path to sqlite storage file. If not given then a temporary in-memory storage is used")
|
|
||||||
maxAge = flag.Duration("max-age", 0, "Maximum duration since last check of a resource, before it must be checked again. Must be used with -store-path")
|
|
||||||
urls = flag.String("urls", "", `Comma-separated list of URLs which are always checked. At least one is required`)
|
|
||||||
patternsStr = flag.String("patterns", "", "Comma-separated list of regexps. All URLs which match one of these will have their links checked as well")
|
|
||||||
concurrency = flag.Int("concurrency", runtime.NumCPU()/2, "Number simultaneous requests to make at a time")
|
|
||||||
)
|
|
||||||
|
|
||||||
flag.Parse()
|
|
||||||
|
|
||||||
if *urls == "" {
|
|
||||||
log.Fatal("-urls is required")
|
|
||||||
}
|
|
||||||
|
|
||||||
var patterns []string
|
|
||||||
if *patternsStr != "" {
|
|
||||||
patterns = strings.Split(*patternsStr, ",")
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
|
||||||
Path: *storePath,
|
|
||||||
})
|
|
||||||
defer store.Close()
|
|
||||||
|
|
||||||
dl, err := deadlinks.New(
|
|
||||||
ctx,
|
|
||||||
store,
|
|
||||||
strings.Split(*urls, ","),
|
|
||||||
patterns,
|
|
||||||
&deadlinks.Opts{
|
|
||||||
NewClient: func() deadlinks.Client {
|
|
||||||
return loggingClient{deadlinks.NewClient(nil)}
|
|
||||||
},
|
|
||||||
Concurrency: *concurrency,
|
|
||||||
OnError: func(err error) {
|
|
||||||
log.Printf("runtime error: %v", err)
|
|
||||||
},
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("initialization error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
lastCheckedBefore := time.Now().Add(-*maxAge)
|
|
||||||
|
|
||||||
if err := dl.Update(ctx, lastCheckedBefore); err != nil {
|
|
||||||
log.Fatalf("update encountered error: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
enc := yaml.NewEncoder(os.Stdout)
|
|
||||||
defer os.Stdout.Sync()
|
|
||||||
|
|
||||||
iter := dl.GetByStatus(deadlinks.ResourceStatusError)
|
|
||||||
err = miter.ForEach(ctx, iter, func(r deadlinks.Resource) error {
|
|
||||||
return enc.Encode(r)
|
|
||||||
})
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("iterating over errored resources failed: %v", err)
|
|
||||||
}
|
|
||||||
}
|
|
93
deadlinks.go
93
deadlinks.go
@ -1,50 +1,6 @@
|
|||||||
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
// Package deadlinks implements a liveness checker for hyperlinks in HTML and
|
||||||
// gemtext documents.
|
// gemtext documents.
|
||||||
//
|
//
|
||||||
// # URLs
|
|
||||||
//
|
|
||||||
// DeadLinks crawls and keeps track of hyperlinks between different
|
|
||||||
// resources, such as webpages and gemtext documents. If a resource is not
|
|
||||||
// linked to from any other resources then DeadLinks forgets about it.
|
|
||||||
//
|
|
||||||
// For this reason it is required to have a starting set of URLs which DeadLinks
|
|
||||||
// will not forget about; these are the pinned URLs. Pinned URLs act as the
|
|
||||||
// starting point for crawling.
|
|
||||||
//
|
|
||||||
// When DeadLinks traverses a URL link, it will check the liveness of that URL's
|
|
||||||
// resource, but it will not by default recur into _that_ resource's links. It
|
|
||||||
// will only do so if the URL matches one of the given regex patterns which
|
|
||||||
// DeadLinks was configured with.
|
|
||||||
//
|
|
||||||
// # Basic Usage
|
|
||||||
//
|
|
||||||
// DeadLinks can be initialized using `New`:
|
|
||||||
//
|
|
||||||
// store := deadlinks.NewSQLiteStore(nil)
|
|
||||||
// defer store.Close()
|
|
||||||
//
|
|
||||||
// pinnedURLs := []string{"https://some.website.com"}
|
|
||||||
// patterns := []string{"website.com"}
|
|
||||||
//
|
|
||||||
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
|
||||||
//
|
|
||||||
// `Update` is then used to crawl all links, starting with `pinnedURLs`:
|
|
||||||
//
|
|
||||||
// err := dl.Update(ctx, time.Now())
|
|
||||||
//
|
|
||||||
// Finally, `GetByStatus` can be used to query all discovered resources based on
|
|
||||||
// their current status. To retrieve all resources which have some error
|
|
||||||
// (indicating a broken link):
|
|
||||||
//
|
|
||||||
// erroredResources, err := miter.ToSlice(
|
|
||||||
// ctx, dl.GetByStatus(deadlinks.ResourceStatusError),
|
|
||||||
// )
|
|
||||||
//
|
|
||||||
// Note that `GetByStatus` returns a `miter.Iterator`, see its documentation for
|
|
||||||
// more options on how to use it beyond `ToSlice`:
|
|
||||||
//
|
|
||||||
// https://godocs.io/code.betamike.com/mediocregopher/mediocre-go-lib/miter
|
|
||||||
//
|
|
||||||
// # Storage
|
// # Storage
|
||||||
//
|
//
|
||||||
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
// By default DeadLinks uses an in-memory SQLite database for tracking the
|
||||||
@ -56,12 +12,9 @@
|
|||||||
// })
|
// })
|
||||||
// defer store.Close()
|
// defer store.Close()
|
||||||
//
|
//
|
||||||
// dl, err := deadlinks.New(ctx, store, pinnedURLs, patterns, nil)
|
// dl, err := deadlinks.New(
|
||||||
//
|
// ctx, store, pinnedURLs, patterns, nil,
|
||||||
// # Further Customization
|
// )
|
||||||
//
|
|
||||||
// Most functionality of DeadLinks can be extended or superseded by injecting
|
|
||||||
// alternate interface implementations via the various Opts structs.
|
|
||||||
package deadlinks
|
package deadlinks
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@ -80,13 +33,13 @@ import (
|
|||||||
// Opts are optional fields which can be provided to New. A nil Opts is
|
// Opts are optional fields which can be provided to New. A nil Opts is
|
||||||
// equivalent to an empty one.
|
// equivalent to an empty one.
|
||||||
type Opts struct {
|
type Opts struct {
|
||||||
NewClient func() Client // Defaults to `func () Client { return NewClient(nil) }`
|
Client Client // Defaults to `NewClient(nil)`
|
||||||
Parser Parser // Defaults to `NewParser()`
|
Parser Parser // Defaults to `NewParser()`
|
||||||
|
|
||||||
// Concurrency determines the maximum number of URLs which can be checked
|
// Concurrency determines the maximum number of URLs which can be checked
|
||||||
// simultaneously.
|
// simultaneously.
|
||||||
//
|
//
|
||||||
// Default: `runtime.NumCPU() / 2`
|
// Default: `runtime.NumCPU()`
|
||||||
Concurrency int
|
Concurrency int
|
||||||
|
|
||||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||||
@ -96,7 +49,7 @@ type Opts struct {
|
|||||||
// RequestTimeout determines how long a request for a resource can run
|
// RequestTimeout determines how long a request for a resource can run
|
||||||
// before the resource is considered unavailable.
|
// before the resource is considered unavailable.
|
||||||
//
|
//
|
||||||
// Default: 1 * time.Minute
|
// Default: 10 * time.Second
|
||||||
RequestTimeout time.Duration
|
RequestTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,8 +58,8 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
o = new(Opts)
|
o = new(Opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.NewClient == nil {
|
if o.Client == nil {
|
||||||
o.NewClient = func() Client { return NewClient(nil) }
|
o.Client = NewClient(nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.Parser == nil {
|
if o.Parser == nil {
|
||||||
@ -114,11 +67,11 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if o.Concurrency == 0 {
|
if o.Concurrency == 0 {
|
||||||
o.Concurrency = runtime.NumCPU() / 2
|
o.Concurrency = runtime.NumCPU()
|
||||||
}
|
}
|
||||||
|
|
||||||
if o.RequestTimeout == 0 {
|
if o.RequestTimeout == 0 {
|
||||||
o.RequestTimeout = 1 * time.Minute
|
o.RequestTimeout = 10 * time.Second
|
||||||
}
|
}
|
||||||
|
|
||||||
return o
|
return o
|
||||||
@ -134,7 +87,6 @@ type DeadLinks struct {
|
|||||||
opts Opts
|
opts Opts
|
||||||
store Store
|
store Store
|
||||||
patterns []*regexp.Regexp
|
patterns []*regexp.Regexp
|
||||||
clients []Client
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// New initializes and returns a DeadLinks instance which will track the
|
// New initializes and returns a DeadLinks instance which will track the
|
||||||
@ -177,11 +129,6 @@ func New(
|
|||||||
patterns: patterns,
|
patterns: patterns,
|
||||||
}
|
}
|
||||||
|
|
||||||
d.clients = make([]Client, d.opts.Concurrency)
|
|
||||||
for i := range d.clients {
|
|
||||||
d.clients[i] = d.opts.NewClient()
|
|
||||||
}
|
|
||||||
|
|
||||||
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
if err := d.store.SetPinned(ctx, pinnedURLs); err != nil {
|
||||||
return nil, fmt.Errorf("pinning URLs: %w", err)
|
return nil, fmt.Errorf("pinning URLs: %w", err)
|
||||||
}
|
}
|
||||||
@ -208,15 +155,11 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *DeadLinks) getURL(
|
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||||
ctx context.Context, client Client, url URL,
|
|
||||||
) (
|
|
||||||
[]URL, error,
|
|
||||||
) {
|
|
||||||
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
||||||
mimeType, body, err := client.Get(ctx, url)
|
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -235,16 +178,14 @@ func (d *DeadLinks) getURL(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// checkURL only returns an error if storing the results of the check fails.
|
// checkURL only returns an error if storing the results of the check fails.
|
||||||
func (d *DeadLinks) checkURL(
|
func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
||||||
ctx context.Context, client Client, url URL,
|
|
||||||
) error {
|
|
||||||
var (
|
var (
|
||||||
now = time.Now()
|
now = time.Now()
|
||||||
status = ResourceStatusOK
|
status = ResourceStatusOK
|
||||||
errorStr string
|
errorStr string
|
||||||
)
|
)
|
||||||
|
|
||||||
outgoingURLs, err := d.getURL(ctx, client, url)
|
outgoingURLs, err := d.getURL(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
status = ResourceStatusError
|
status = ResourceStatusError
|
||||||
errorStr = err.Error()
|
errorStr = err.Error()
|
||||||
@ -278,14 +219,14 @@ func (d *DeadLinks) update(
|
|||||||
|
|
||||||
wg.Add(d.opts.Concurrency)
|
wg.Add(d.opts.Concurrency)
|
||||||
for i := 0; i < d.opts.Concurrency; i++ {
|
for i := 0; i < d.opts.Concurrency; i++ {
|
||||||
go func(client Client) {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
for url := range ch {
|
for url := range ch {
|
||||||
if err := d.checkURL(ctx, client, url); err != nil {
|
if err := d.checkURL(ctx, url); err != nil {
|
||||||
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
d.onError(ctx, fmt.Errorf("checking url %q: %w", url, err))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}(d.clients[i])
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -35,6 +35,7 @@
|
|||||||
pkgs.go
|
pkgs.go
|
||||||
pkgs.gotools
|
pkgs.gotools
|
||||||
pkgs.golangci-lint
|
pkgs.golangci-lint
|
||||||
|
pkgs.sqlite
|
||||||
];
|
];
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
11
resource.go
11
resource.go
@ -32,15 +32,14 @@ func (ds ResourceStatus) String() string {
|
|||||||
// uniquely identified by a URL.
|
// uniquely identified by a URL.
|
||||||
type Resource struct {
|
type Resource struct {
|
||||||
URL URL
|
URL URL
|
||||||
Status ResourceStatus `yaml:"-"`
|
Status ResourceStatus
|
||||||
Pinned bool `yaml:"-"`
|
Pinned bool
|
||||||
LastChecked time.Time `yaml:"last_checked"`
|
LastChecked time.Time
|
||||||
|
|
||||||
// only set if Status == ResourceStatusError
|
// only set if Status == ResourceStatusError
|
||||||
ErrorString string `yaml:"error"`
|
ErrorString string
|
||||||
|
|
||||||
// Indicate the URLs of resources which link to/are linked from this
|
// Indicate the URLs of resources which link to/are linked from this
|
||||||
// resource.
|
// resource.
|
||||||
IncomingLinkURLs []URL `yaml:"incoming_links"`
|
IncomingLinkURLs, OutgoingLinkURLs []URL
|
||||||
OutgoingLinkURLs []URL `yaml:"outgoing_links"`
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user