Got DeadLinks basic functionality actually working

main
Brian Picciano 4 months ago
parent 6539cc2650
commit 4d2c62a472
  1. 5
      client.go
  2. 100
      deadlinks.go
  3. 8
      url.go

@ -101,11 +101,12 @@ func (c *client) getGemini(
return "", nil, errors.New("too many redirects")
}
newURL, err := url.ResolveReference(res.Meta)
metaURL, err := ParseURL(res.Meta)
if err != nil {
return "", nil, fmt.Errorf("resolving redirect URL %q: %w", res.Meta, err)
return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err)
}
newURL := url.ResolveReference(metaURL)
return c.get(ctx, newURL, redirectDepth+1)
default:

@ -10,8 +10,11 @@
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
// Path: "/path/to/db/file.sqlite",
// })
// defer store.Close()
//
// // TODO initialize DeadLinks
// dl, err := deadlinks.New(
// ctx, store, pinnedURLs, patterns, nil,
// )
package deadlinks
import (
@ -20,6 +23,7 @@ import (
"fmt"
"regexp"
"runtime"
"strings"
"sync"
"time"
@ -41,6 +45,12 @@ type Opts struct {
// OnError, if set, will be called whenever DeadLinks encounters an error
// internally that it would otherwise skip over.
OnError func(error)
// RequestTimeout determines how long a request for a resource can run
// before the resource is considered unavailable.
//
// Default: 10 * time.Second
RequestTimeout time.Duration
}
func (o *Opts) withDefaults() *Opts {
@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts {
o.Concurrency = runtime.NumCPU()
}
if o.RequestTimeout == 0 {
o.RequestTimeout = 10 * time.Second
}
return o
}
@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool {
}
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) {
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout)
defer cancel()
mimeType, body, err := d.opts.Client.Get(ctx, url)
if err != nil {
return nil, err
}
defer body.Close()
// strip off mimeType extensions
if i := strings.Index(mimeType, ";"); i > 0 {
mimeType = mimeType[:i]
}
if !d.shouldFollowURL(url) {
return nil, nil
}
@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
errorStr = err.Error()
}
for i := range outgoingURLs {
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i])
}
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs)
if err != nil {
return fmt.Errorf(
@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error {
return nil
}
// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
// returns the number of URLs checked.
func (d *DeadLinks) update(
ctx context.Context, lastCheckedBefore time.Time,
) error {
) (
int, error,
) {
var (
wg = new(sync.WaitGroup)
ch = make(chan URL, d.opts.Concurrency)
@ -204,33 +229,46 @@ func (d *DeadLinks) Update(
}()
}
var err error
// Because checking URLs can result in new URLs being inserted into the
// Store, we query the Store in a loop until it stops producing
// unvisited/stale URLs.
for {
var (
n int
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
)
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
ch <- url
n++
return nil
})
var (
n int
err error
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore)
)
if err != nil || n == 0 {
break
}
}
err = miter.ForEach(ctx, urlsIter, func(url URL) error {
ch <- url
n++
return nil
})
close(ch)
wg.Wait()
if err != nil {
return fmt.Errorf("iterating urls needing checked: %w", err)
return 0, fmt.Errorf("iterating urls needing checked: %w", err)
}
return n, nil
}
// Update runs through all pinned or previously discovered URLs which were
// last checked prior to the given time (or which have never been checked) and
// updates the internal storage with any new URLs and links to dead URLs which
// it finds.
func (d *DeadLinks) Update(
ctx context.Context, lastCheckedBefore time.Time,
) error {
// Because we are iterating over the same dataset which is being updated it
// is required that we re-attempt the update multiple times, until all
// possible updates have been done.
for {
n, err := d.update(ctx, lastCheckedBefore)
if err != nil {
return err
} else if n == 0 {
break
}
}
if err := d.store.GC(ctx); err != nil {
@ -240,4 +278,8 @@ func (d *DeadLinks) Update(
return nil
}
// TODO expose GetByStatus
// GetByStatus returns an iterator which will return all Resources with the
// given status.
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] {
return d.store.GetByStatus(status)
}

@ -44,10 +44,6 @@ func (u URL) toStd() *url.URL {
}
// ResolveReference is equivalent to the method of the same name in `net/url`.
func (u URL) ResolveReference(u2Str string) (URL, error) {
u2, err := url.Parse(u2Str)
if err != nil {
return "", err
}
return URL(u.toStd().ResolveReference(u2).String()), nil
func (u URL) ResolveReference(u2 URL) URL {
return URL(u.toStd().ResolveReference(u2.toStd()).String())
}

Loading…
Cancel
Save