Got DeadLinks basic functionality actually working
This commit is contained in:
parent
6539cc2650
commit
4d2c62a472
@ -101,11 +101,12 @@ func (c *client) getGemini(
|
||||
return "", nil, errors.New("too many redirects")
|
||||
}
|
||||
|
||||
newURL, err := url.ResolveReference(res.Meta)
|
||||
metaURL, err := ParseURL(res.Meta)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("resolving redirect URL %q: %w", res.Meta, err)
|
||||
return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err)
|
||||
}
|
||||
|
||||
newURL := url.ResolveReference(metaURL)
|
||||
return c.get(ctx, newURL, redirectDepth+1)
|
||||
|
||||
default:
|
||||
|
82
deadlinks.go
82
deadlinks.go
@ -10,8 +10,11 @@
|
||||
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
||||
// Path: "/path/to/db/file.sqlite",
|
||||
// })
|
||||
// defer store.Close()
|
||||
//
|
||||
// // TODO initialize DeadLinks
|
||||
// dl, err := deadlinks.New(
|
||||
// ctx, store, pinnedURLs, patterns, nil,
|
||||
// )
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
@ -20,6 +23,7 @@ import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -41,6 +45,12 @@ type Opts struct {
|
||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||
// internally that it would otherwise skip over.
|
||||
OnError func(error)
|
||||
|
||||
// RequestTimeout determines how long a request for a resource can run
|
||||
// before the resource is considered unavailable.
|
||||
//
|
||||
// Default: 10 * time.Second
|
||||
RequestTimeout time.Duration
|
||||
}
|
||||
|
||||
func (o *Opts) withDefaults() *Opts {
|
||||
@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts {
|
||||
o.Concurrency = runtime.NumCPU()
|
||||
}
|
||||
|
||||
if o.RequestTimeout == 0 {
|
||||
o.RequestTimeout = 10 * time.Second
|
||||
}
|
||||
|
||||
return o
|
||||
}
|
||||
|
||||
@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
||||
}
|
||||
|
||||
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||
defer cancel()
|
||||
|
||||
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer body.Close()
|
||||
|
||||
// strip off any mimeType parameters (everything from the first ";" onward)
|
||||
if i := strings.Index(mimeType, ";"); i > 0 {
|
||||
mimeType = mimeType[:i]
|
||||
}
|
||||
|
||||
if !d.shouldFollowURL(url) {
|
||||
return nil, nil
|
||||
}
|
||||
@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
||||
errorStr = err.Error()
|
||||
}
|
||||
|
||||
for i := range outgoingURLs {
|
||||
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
|
||||
}
|
||||
|
||||
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
|
||||
if err != nil {
|
||||
return fmt.Errorf(
|
||||
@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Update runs through all pinned or previously discovered URLs which were
|
||||
// last checked prior to the given time (or which have never been checked) and
|
||||
// updates the internal storage with any new URLs and links to dead URLs which
|
||||
// it finds.
|
||||
func (d *DeadLinks) Update(
|
||||
// update returns the number of URLs checked.
|
||||
func (d *DeadLinks) update(
|
||||
ctx context.Context, lastCheckedBefore time.Time,
|
||||
) error {
|
||||
) (
|
||||
int, error,
|
||||
) {
|
||||
var (
|
||||
wg = new(sync.WaitGroup)
|
||||
ch = make(chan URL, d.opts.Concurrency)
|
||||
@ -204,14 +229,9 @@ func (d *DeadLinks) Update(
|
||||
}()
|
||||
}
|
||||
|
||||
var err error
|
||||
|
||||
// Because checking URLs can result in new URLs being inserted into the
|
||||
// Store, we query the Store in a loop until it stops producing
|
||||
// unvisited/stale URLs.
|
||||
for {
|
||||
var (
|
||||
n int
|
||||
err error
|
||||
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
|
||||
)
|
||||
|
||||
@ -221,16 +241,34 @@ func (d *DeadLinks) Update(
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil || n == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
close(ch)
|
||||
wg.Wait()
|
||||
|
||||
if err != nil {
|
||||
return fmt.Errorf("iterating urls needing checked: %w", err)
|
||||
return 0, fmt.Errorf("iterating urls needing checked: %w", err)
|
||||
}
|
||||
|
||||
return n, nil
|
||||
}
|
||||
|
||||
// Update runs through all pinned or previously discovered URLs which were
|
||||
// last checked prior to the given time (or which have never been checked) and
|
||||
// updates the internal storage with any new URLs and links to dead URLs which
|
||||
// it finds.
|
||||
func (d *DeadLinks) Update(
|
||||
ctx context.Context, lastCheckedBefore time.Time,
|
||||
) error {
|
||||
|
||||
// Because we are iterating over the same dataset which is being updated it
|
||||
// is required that we re-attempt the update multiple times, until all
|
||||
// possible updates have been done.
|
||||
for {
|
||||
n, err := d.update(ctx, lastCheckedBefore)
|
||||
if err != nil {
|
||||
return err
|
||||
} else if n == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if err := d.store.GC(ctx); err != nil {
|
||||
@ -240,4 +278,8 @@ func (d *DeadLinks) Update(
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO expose GetByStatus
|
||||
// GetByStatus returns an iterator which will return all Resources with the
|
||||
// given status.
|
||||
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
|
||||
return d.store.GetByStatus(status)
|
||||
}
|
||||
|
8
url.go
8
url.go
@ -44,10 +44,6 @@ func (u URL) toStd() *url.URL {
|
||||
}
|
||||
|
||||
// ResolveReference is equivalent to the method of the same name in `net/url`.
|
||||
func (u URL) ResolveReference(u2Str string) (URL, error) {
|
||||
u2, err := url.Parse(u2Str)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return URL(u.toStd().ResolveReference(u2).String()), nil
|
||||
func (u URL) ResolveReference(u2 URL) URL {
|
||||
return URL(u.toStd().ResolveReference(u2.toStd()).String())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user