|
|
|
@ -10,8 +10,11 @@ |
|
|
|
|
// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{
|
|
|
|
|
// Path: "/path/to/db/file.sqlite",
|
|
|
|
|
// })
|
|
|
|
|
// defer store.Close()
|
|
|
|
|
//
|
|
|
|
|
// // TODO initialize DeadLinks
|
|
|
|
|
// dl, err := deadlinks.New(
|
|
|
|
|
// ctx, store, pinnedURLs, patterns, nil,
|
|
|
|
|
// )
|
|
|
|
|
package deadlinks |
|
|
|
|
|
|
|
|
|
import ( |
|
|
|
@ -20,6 +23,7 @@ import ( |
|
|
|
|
"fmt" |
|
|
|
|
"regexp" |
|
|
|
|
"runtime" |
|
|
|
|
"strings" |
|
|
|
|
"sync" |
|
|
|
|
"time" |
|
|
|
|
|
|
|
|
@ -41,6 +45,12 @@ type Opts struct { |
|
|
|
|
// OnError, if set, will be called whenever DeadLinks encounters an error
|
|
|
|
|
// internally that it would otherwise skip over.
|
|
|
|
|
OnError func(error) |
|
|
|
|
|
|
|
|
|
// RequestTimeout determines how long a request for a resource can run
|
|
|
|
|
// before the resource is considered unavailable.
|
|
|
|
|
//
|
|
|
|
|
// Default: 10 * time.Second
|
|
|
|
|
RequestTimeout time.Duration |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (o *Opts) withDefaults() *Opts { |
|
|
|
@ -60,6 +70,10 @@ func (o *Opts) withDefaults() *Opts { |
|
|
|
|
o.Concurrency = runtime.NumCPU() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if o.RequestTimeout == 0 { |
|
|
|
|
o.RequestTimeout = 10 * time.Second |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return o |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -142,12 +156,20 @@ func (d *DeadLinks) shouldFollowURL(url URL) bool { |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
func (d *DeadLinks) getURL(ctx context.Context, url URL) ([]URL, error) { |
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, d.opts.RequestTimeout) |
|
|
|
|
defer cancel() |
|
|
|
|
|
|
|
|
|
mimeType, body, err := d.opts.Client.Get(ctx, url) |
|
|
|
|
if err != nil { |
|
|
|
|
return nil, err |
|
|
|
|
} |
|
|
|
|
defer body.Close() |
|
|
|
|
|
|
|
|
|
// strip off any mimeType parameters (e.g. "; charset=utf-8"), keeping only the type/subtype
|
|
|
|
|
if i := strings.Index(mimeType, ";"); i > 0 { |
|
|
|
|
mimeType = mimeType[:i] |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if !d.shouldFollowURL(url) { |
|
|
|
|
return nil, nil |
|
|
|
|
} |
|
|
|
@ -169,6 +191,10 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { |
|
|
|
|
errorStr = err.Error() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for i := range outgoingURLs { |
|
|
|
|
outgoingURLs[i] = url.ResolveReference(outgoingURLs[i]) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
err = d.store.Update(ctx, now, url, status, errorStr, outgoingURLs) |
|
|
|
|
if err != nil { |
|
|
|
|
return fmt.Errorf( |
|
|
|
@ -180,13 +206,12 @@ func (d *DeadLinks) checkURL(ctx context.Context, url URL) error { |
|
|
|
|
return nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Update runs through all pinned or previously discovered URLs which were
|
|
|
|
|
// last checked prior to the given time (or which have never been checked) and
|
|
|
|
|
// updates the internal storage with any new URLs and links to dead URLs which
|
|
|
|
|
// it finds.
|
|
|
|
|
func (d *DeadLinks) Update( |
|
|
|
|
// returns the number of URLs checked.
|
|
|
|
|
func (d *DeadLinks) update( |
|
|
|
|
ctx context.Context, lastCheckedBefore time.Time, |
|
|
|
|
) error { |
|
|
|
|
) ( |
|
|
|
|
int, error, |
|
|
|
|
) { |
|
|
|
|
var ( |
|
|
|
|
wg = new(sync.WaitGroup) |
|
|
|
|
ch = make(chan URL, d.opts.Concurrency) |
|
|
|
@ -204,33 +229,46 @@ func (d *DeadLinks) Update( |
|
|
|
|
}() |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
var err error |
|
|
|
|
|
|
|
|
|
// Because checking URLs can result in new URLs being inserted into the
|
|
|
|
|
// Store, we query the Store in a loop until it stops producing
|
|
|
|
|
// unvisited/stale URLs.
|
|
|
|
|
for { |
|
|
|
|
var ( |
|
|
|
|
n int |
|
|
|
|
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
err = miter.ForEach(ctx, urlsIter, func(url URL) error { |
|
|
|
|
ch <- url |
|
|
|
|
n++ |
|
|
|
|
return nil |
|
|
|
|
}) |
|
|
|
|
var ( |
|
|
|
|
n int |
|
|
|
|
err error |
|
|
|
|
urlsIter = d.store.GetURLsByLastChecked(lastCheckedBefore) |
|
|
|
|
) |
|
|
|
|
|
|
|
|
|
if err != nil || n == 0 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
err = miter.ForEach(ctx, urlsIter, func(url URL) error { |
|
|
|
|
ch <- url |
|
|
|
|
n++ |
|
|
|
|
return nil |
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
close(ch) |
|
|
|
|
wg.Wait() |
|
|
|
|
|
|
|
|
|
if err != nil { |
|
|
|
|
return fmt.Errorf("iterating urls needing checked: %w", err) |
|
|
|
|
return 0, fmt.Errorf("iterating urls needing checked: %w", err) |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
return n, nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Update runs through all pinned or previously discovered URLs which were
|
|
|
|
|
// last checked prior to the given time (or which have never been checked) and
|
|
|
|
|
// updates the internal storage with any new URLs and links to dead URLs which
|
|
|
|
|
// it finds.
|
|
|
|
|
func (d *DeadLinks) Update( |
|
|
|
|
ctx context.Context, lastCheckedBefore time.Time, |
|
|
|
|
) error { |
|
|
|
|
|
|
|
|
|
// Because we are iterating over the same dataset which is being updated it
|
|
|
|
|
// is required that we re-attempt the update multiple times, until all
|
|
|
|
|
// possible updates have been done.
|
|
|
|
|
for { |
|
|
|
|
n, err := d.update(ctx, lastCheckedBefore) |
|
|
|
|
if err != nil { |
|
|
|
|
return err |
|
|
|
|
} else if n == 0 { |
|
|
|
|
break |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
if err := d.store.GC(ctx); err != nil { |
|
|
|
@ -240,4 +278,8 @@ func (d *DeadLinks) Update( |
|
|
|
|
return nil |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// TODO expose GetByStatus
|
|
|
|
|
// GetByStatus returns an iterator which will return all Resources with the
|
|
|
|
|
// given status.
|
|
|
|
|
func (d *DeadLinks) GetByStatus(status ResourceStatus) miter.Iterator[Resource] { |




	// Thin delegation: the Store owns all persisted Resource state, so the
	// iterator is produced directly by it with no filtering done here.
	return d.store.GetByStatus(status) |




} |
|
|
|
|