Got DeadLinks basic functionality actually working
This commit is contained in:
parent
6539cc2650
commit
4d2c62a472
@ -101,11 +101,12 @@ func (c *client) getGemini(
|
|||||||
return "", nil, errors.New("too many redirects")
|
return "", nil, errors.New("too many redirects")
|
||||||
}
|
}
|
||||||
|
|
||||||
newURL, err := url.ResolveReference(res.Meta)
|
metaURL, err := ParseURL(res.Meta)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, fmt.Errorf("resolving redirect URL %q: %w", res.Meta, err)
|
return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newURL := url.ResolveReference(metaURL)
|
||||||
return c.get(ctx, newURL, redirectDepth+1)
|
return c.get(ctx, newURL, redirectDepth+1)
|
||||||
|
|
||||||
default:
|
default:
|
||||||
|
100
deadlinks.go
100
deadlinks.go
@ -10,8 +10,11 @@
|
|||||||
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
||||||
// Path: "/path/to/db/file.sqlite",
|
// Path: "/path/to/db/file.sqlite",
|
||||||
// })
|
// })
|
||||||
|
// defer store.Close()
|
||||||
//
|
//
|
||||||
// // TODO initialize DeadLinks
|
// dl, err := deadlinks.New(
|
||||||
|
// ctx, store, pinnedURLs, patterns, nil,
|
||||||
|
// )
|
||||||
package deadlinks
|
package deadlinks
|
||||||
|
|
||||||
import (
|
import (
|
||||||
@ -20,6 +23,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"regexp"
|
"regexp"
|
||||||
"runtime"
|
"runtime"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@ -41,6 +45,12 @@ type Opts struct {
|
|||||||
// OnError, if set, will be called whenever DeadLinks encounters an error
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
||||||
// internally that it would otherwise skip over.
|
// internally that it would otherwise skip over.
|
||||||
OnError func(error)
|
OnError func(error)
|
||||||
|
|
||||||
|
// RequestTimeout determines how long a request for a resource can run
|
||||||
|
// before the resource is considered unavailable.
|
||||||
|
//
|
||||||
|
// Default: 10 * time.Second
|
||||||
|
RequestTimeout time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
func (o *Opts) withDefaults() *Opts {
|
func (o *Opts) withDefaults() *Opts {
|
||||||
@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts {
|
|||||||
o.Concurrency = runtime.NumCPU()
|
o.Concurrency = runtime.NumCPU()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if o.RequestTimeout == 0 {
|
||||||
|
o.RequestTimeout = 10 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
return o
|
return o
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
mimeType, body, err := d.opts.Client.Get(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer body.Close()
|
defer body.Close()
|
||||||
|
|
||||||
|
// strip off mimeType extensions
|
||||||
|
if i := strings.Index(mimeType, ";"); i > 0 {
|
||||||
|
mimeType = mimeType[:i]
|
||||||
|
}
|
||||||
|
|
||||||
if !d.shouldFollowURL(url) {
|
if !d.shouldFollowURL(url) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
|||||||
errorStr = err.Error()
|
errorStr = err.Error()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for i := range outgoingURLs {
|
||||||
|
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
|
||||||
|
}
|
||||||
|
|
||||||
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
|
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf(
|
return fmt.Errorf(
|
||||||
@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update runs through all pinned or previously discovered URLs which were
|
// update returns the number of URLs checked.
|
||||||
// last checked prior to the given time (or which have never been checked) and
|
func (d *DeadLinks) update(
|
||||||
// updates the internal storage with any new URLs and links to dead URLs which
|
|
||||||
// it finds.
|
|
||||||
func (d *DeadLinks) Update(
|
|
||||||
ctx context.Context, lastCheckedBefore time.Time,
|
ctx context.Context, lastCheckedBefore time.Time,
|
||||||
) error {
|
) (
|
||||||
|
int, error,
|
||||||
|
) {
|
||||||
var (
|
var (
|
||||||
wg = new(sync.WaitGroup)
|
wg = new(sync.WaitGroup)
|
||||||
ch = make(chan URL, d.opts.Concurrency)
|
ch = make(chan URL, d.opts.Concurrency)
|
||||||
@ -204,33 +229,46 @@ func (d *DeadLinks) Update(
|
|||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
|
||||||
var err error
|
var (
|
||||||
|
n int
|
||||||
|
err error
|
||||||
|
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
|
||||||
|
)
|
||||||
|
|
||||||
// Because checking URLs can result in new URLs being inserted into the
|
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
|
||||||
// Store, we query the Store in a loop until it stops producing
|
ch <- url
|
||||||
// unvisited/stale URLs.
|
n++
|
||||||
for {
|
return nil
|
||||||
var (
|
})
|
||||||
n int
|
|
||||||
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
|
|
||||||
)
|
|
||||||
|
|
||||||
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
|
|
||||||
ch <- url
|
|
||||||
n++
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
|
|
||||||
if err != nil || n == 0 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
close(ch)
|
close(ch)
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("iterating urls needing checked: %w", err)
|
return 0, fmt.Errorf("iterating urls needing checked: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return n, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update runs through all pinned or previously discovered URLs which were
|
||||||
|
// last checked prior to the given time (or which have never been checked) and
|
||||||
|
// updates the internal storage with any new URLs and links to dead URLs which
|
||||||
|
// it finds.
|
||||||
|
func (d *DeadLinks) Update(
|
||||||
|
ctx context.Context, lastCheckedBefore time.Time,
|
||||||
|
) error {
|
||||||
|
|
||||||
|
// Because we are iterating over the same dataset which is being updated it
|
||||||
|
// is required that we re-attempt the update multiple times, until all
|
||||||
|
// possible updates have been done.
|
||||||
|
for {
|
||||||
|
n, err := d.update(ctx, lastCheckedBefore)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
} else if n == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := d.store.GC(ctx); err != nil {
|
if err := d.store.GC(ctx); err != nil {
|
||||||
@ -240,4 +278,8 @@ func (d *DeadLinks) Update(
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO expose GetByStatus
|
// GetByStatus returns an iterator which will return all Resources with the
|
||||||
|
// given status.
|
||||||
|
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
|
||||||
|
return d.store.GetByStatus(status)
|
||||||
|
}
|
||||||
|
8
url.go
8
url.go
@ -44,10 +44,6 @@ func (u URL) toStd() *url.URL {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ResolveReference is equivalent to the method of the same name in `net/url`.
|
// ResolveReference is equivalent to the method of the same name in `net/url`.
|
||||||
func (u URL) ResolveReference(u2Str string) (URL, error) {
|
func (u URL) ResolveReference(u2 URL) URL {
|
||||||
u2, err := url.Parse(u2Str)
|
return URL(u.toStd().ResolveReference(u2.toStd()).String())
|
||||||
if err != nil {
|
|
||||||
return "", err
|
|
||||||
}
|
|
||||||
return URL(u.toStd().ResolveReference(u2).String()), nil
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user