From 4d2c62a4729adb7b4f083cd5944da0fbc5aede83 Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Fri, 29 Dec 2023 20:35:02 +0100 Subject: [PATCH] Got DeadLinks basic functionality actually working --- client.go | 5 +-- deadlinks.go | 100 ++++++++++++++++++++++++++++++++++++--------------- url.go | 8 ++--- 3 files changed, 76 insertions(+), 37 deletions(-) diff --git a/client.go b/client.go index f3e8082..068d985 100644 --- a/client.go +++ b/client.go @@ -101,11 +101,12 @@ func (c *client) getGemini( return "", nil, errors.New("too many redirects") } - newURL, err := url.ResolveReference(res.Meta) + metaURL, err := ParseURL(res.Meta) if err != nil { - return "", nil, fmt.Errorf("resolving redirect URL %q: %w", res.Meta, err) + return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err) } + newURL := url.ResolveReference(metaURL) return c.get(ctx, newURL, redirectDepth+1) default: diff --git a/deadlinks.go b/deadlinks.go index 02dfcc4..6ff4a28 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -10,8 +10,11 @@ // store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{ // Path: "/path/to/db/file.sqlite", // }) +// defer store.Close() // -// // TODO initialize DeadLinks +// dl, err := deadlinks.New( +// ctx, store, pinnedURLs, patterns, nil, +// ) package deadlinks import ( @@ -20,6 +23,7 @@ import ( "fmt" "regexp" "runtime" + "strings" "sync" "time" @@ -41,6 +45,12 @@ type Opts struct { // OnError, if set, will be called whenever DeadLinks encounters an error // internally that it would otherwise skip over. OnError func(error) + + // RequestTimeout determines how long a request for a resource can run + // before the resource is considered unavailable. 
+ // + // Default: 10 * time.Second + RequestTimeout time.Duration } func (o *Opts) withDefaults() *Opts { @@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts { o.Concurrency = runtime.NumCPU() } + if o.RequestTimeout == 0 { + o.RequestTimeout = 10 * time.Second + } + return o } @@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool { } func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { + ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) + defer cancel() + mimeType, body, err := d.opts.Client.Get(ctx, url) if err != nil { return nil, err } defer body.Close() + // strip off mimeType parameters (e.g. charset) + if i := strings.Index(mimeType, ";"); i > 0 { + mimeType = mimeType[:i] + } + if !d.shouldFollowURL(url) { return nil, nil } @@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { errorStr = err.Error() } + for i := range outgoingURLs { + outgoingURLs[i] = url.ResolveReference(outgoingURLs[i]) + } + err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) if err != nil { return fmt.Errorf( @@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { return nil } -// Update runs through all pinned or previously discovered URLs which were -// last checked prior to the given time (or which have never been checked) and -// updates the internal storage with any new URLs and links to dead URLs which -// it finds. -func (d *DeadLinks) Update( +// update returns the number of URLs checked. +func (d *DeadLinks) update( ctx context.Context, lastCheckedBefore time.Time, -) error { +) ( + int, error, +) { var ( wg = new(sync.WaitGroup) ch = make(chan URL, d.opts.Concurrency) @@ -204,33 +229,46 @@ func (d *DeadLinks) Update( }() } - var err error - - // Because checking URLs can result in new URLs being inserted into the - // Store, we query the Store in a loop until it stops producing - // unvisited/stale URLs. 
- for { - var ( - n int - urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) - ) - - err = miter.ForEach(ctx, urlsIter, func(url URL) error { - ch <- url - n++ - return nil - }) + var ( + n int + err error + urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) + ) - if err != nil || n == 0 { - break - } - } + err = miter.ForEach(ctx, urlsIter, func(url URL) error { + ch <- url + n++ + return nil + }) close(ch) wg.Wait() if err != nil { - return fmt.Errorf("iterating urls needing checked: %w", err) + return 0, fmt.Errorf("iterating urls needing checked: %w", err) + } + + return n, nil +} + +// Update runs through all pinned or previously discovered URLs which were +// last checked prior to the given time (or which have never been checked) and +// updates the internal storage with any new URLs and links to dead URLs which +// it finds. +func (d *DeadLinks) Update( + ctx context.Context, lastCheckedBefore time.Time, +) error { + + // Because we are iterating over the same dataset which is being updated, it + // is required that we re-attempt the update multiple times, until all + // possible updates have been done. + for { + n, err := d.update(ctx, lastCheckedBefore) + if err != nil { + return err + } else if n == 0 { + break + } } if err := d.store.GC(ctx); err != nil { @@ -240,4 +278,8 @@ func (d *DeadLinks) Update( return nil } -// TODO expose GetByStatus +// GetByStatus returns an iterator which will return all Resources with the +// given status. +func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] { + return d.store.GetByStatus(status) +} diff --git a/url.go b/url.go index 86fc379..40593c3 100644 --- a/url.go +++ b/url.go @@ -44,10 +44,6 @@ func (u URL) toStd() *url.URL { } // ResolveReference is equivalent to the method of the same name in `net/url`. 
-func (u URL) ResolveReference(u2Str string) (URL, error) { - u2, err := url.Parse(u2Str) - if err != nil { - return "", err - } - return URL(u.toStd().ResolveReference(u2).String()), nil +func (u URL) ResolveReference(u2 URL) URL { + return URL(u.toStd().ResolveReference(u2.toStd()).String()) }