Got DeadLinks basic functionality actually working

This commit is contained in:
Brian Picciano 2023-12-29 20:35:02 +01:00
parent 6539cc2650
commit 4d2c62a472
3 changed files with 76 additions and 37 deletions

View File

@ -101,11 +101,12 @@ func (c *client) getGemini(
return "", nil, errors.New("too many redirects") return "", nil, errors.New("too many redirects")
} }
newURL, err := url.ResolveReference(res.Meta) metaURL, err := ParseURL(res.Meta)
if err != nil { if err != nil {
return "", nil, fmt.Errorf("resolving redirect URL %q: %w", res.Meta, err) return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err)
} }
newURL := url.ResolveReference(metaURL)
return c.get(ctx, newURL, redirectDepth+1) return c.get(ctx, newURL, redirectDepth+1)
default: default:

View File

@ -10,8 +10,11 @@
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{ // store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
// Path: "/path/to/db/file.sqlite", // Path: "/path/to/db/file.sqlite",
// }) // })
// defer store.Close()
// //
// // TODO initialize DeadLinks // dl, err := deadlinks.New(
// ctx, store, pinnedURLs, patterns, nil,
// )
package deadlinks package deadlinks
import ( import (
@ -20,6 +23,7 @@ import (
"fmt" "fmt"
"regexp" "regexp"
"runtime" "runtime"
"strings"
"sync" "sync"
"time" "time"
@ -41,6 +45,12 @@ type Opts struct {
// OnError, if set, will be called whenever DeadLinks encounters an error // OnError, if set, will be called whenever DeadLinks encounters an error
// internally that it would otherwise skip over. // internally that it would otherwise skip over.
OnError func(error) OnError func(error)
// RequestTimeout determines how long a request for a resource can run
// before the resource is considered unavailable.
//
// Default: 10 * time.Second
RequestTimeout time.Duration
} }
func (o *Opts) withDefaults() *Opts { func (o *Opts) withDefaults() *Opts {
@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts {
o.Concurrency = runtime.NumCPU() o.Concurrency = runtime.NumCPU()
} }
if o.RequestTimeout == 0 {
o.RequestTimeout = 10 * time.Second
}
return o return o
} }
@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
} }
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel()
mimeType, body, err := d.opts.Client.Get(ctx, url) mimeType, body, err := d.opts.Client.Get(ctx, url)
if err != nil { if err != nil {
return nil, err return nil, err
} }
defer body.Close() defer body.Close()
// strip off mimeType extensions
if i := strings.Index(mimeType, ";"); i > 0 {
mimeType = mimeType[:i]
}
if !d.shouldFollowURL(url) { if !d.shouldFollowURL(url) {
return nil, nil return nil, nil
} }
@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
errorStr = err.Error() errorStr = err.Error()
} }
for i := range outgoingURLs {
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
}
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
if err != nil { if err != nil {
return fmt.Errorf( return fmt.Errorf(
@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
return nil return nil
} }
// Update runs through all pinned or previously discovered URLs which were // returns the number of URLs checked.
// last checked prior to the given time (or which have never been checked) and func (d *DeadLinks) update(
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
ctx context.Context, lastCheckedBefore time.Time, ctx context.Context, lastCheckedBefore time.Time,
) error { ) (
int, error,
) {
var ( var (
wg = new(sync.WaitGroup) wg = new(sync.WaitGroup)
ch = make(chan URL, d.opts.Concurrency) ch = make(chan URL, d.opts.Concurrency)
@ -204,14 +229,9 @@ func (d *DeadLinks) Update(
}() }()
} }
var err error
// Because checking URLs can result in new URLs being inserted into the
// Store, we query the Store in a loop until it stops producing
// unvisited/stale URLs.
for {
var ( var (
n int n int
err error
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
) )
@ -221,16 +241,34 @@ func (d *DeadLinks) Update(
return nil return nil
}) })
if err != nil || n == 0 {
break
}
}
close(ch) close(ch)
wg.Wait() wg.Wait()
if err != nil { if err != nil {
return fmt.Errorf("iterating urls needing checked: %w", err) return 0, fmt.Errorf("iterating urls needing checked: %w", err)
}
return n, nil
}
// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
ctx context.Context, lastCheckedBefore time.Time,
) error {
// Because we are iterating over the same dataset which is being updated, it
// is required that we re-attempt the update multiple times, until all
// possible updates have been done.
for {
n, err := d.update(ctx, lastCheckedBefore)
if err != nil {
return err
} else if n == 0 {
break
}
} }
if err := d.store.GC(ctx); err != nil { if err := d.store.GC(ctx); err != nil {
@ -240,4 +278,8 @@ func (d *DeadLinks) Update(
return nil return nil
} }
// TODO expose GetByStatus // GetByStatus returns an iterator which will return all Resources with the
// given status.
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
return d.store.GetByStatus(status)
}

8
url.go
View File

@ -44,10 +44,6 @@ func (u URL) toStd() *url.URL {
} }
// ResolveReference is equivalent to the method of the same name in `net/url`. // ResolveReference is equivalent to the method of the same name in `net/url`.
func (u URL) ResolveReference(u2Str string) (URL, error) { func (u URL) ResolveReference(u2 URL) URL {
u2, err := url.Parse(u2Str) return URL(u.toStd().ResolveReference(u2.toStd()).String())
if err != nil {
return "", err
}
return URL(u.toStd().ResolveReference(u2).String()), nil
} }