diff --git a/deadlinks.go b/deadlinks.go index 5d125c5..dd96582 100644 --- a/deadlinks.go +++ b/deadlinks.go @@ -1,5 +1,17 @@ // Package deadlinks implements a liveness checker for hyperlinks in HTML and // gemtext documents. +// +// # Storage +// +// By default DeadLinks uses an in-memory SQLite database for tracking the +// status of resources and the links between them. If memory usage becomes a +// problem it is also possible to use a SQLite database file: +// +// store := deadlinks.NewSQLiteStore(&deadlinks.SQLiteStoreOpts{ +// Path: "/path/to/db/file.sqlite", +// }) +// +// // TODO initialize DeadLinks package deadlinks import ( diff --git a/store.go b/store.go index 3dd0faf..d875f7c 100644 --- a/store.go +++ b/store.go @@ -14,8 +14,9 @@ import ( migrate "github.com/rubenv/sql-migrate" ) -// Store keeps track of the current status of all discovered Resources. -// Resources with no incoming links will be periodically cleaned out. +// Store keeps track of the current status of all discovered Resources, and +// links between them. A Resource which is neither pinned nor linked to from +// another Resource is considered to not exist. // // An implementation of Store must be thread-safe. type Store interface { @@ -30,8 +31,10 @@ type Store interface { SetPinned(context.Context, []URL) error // Update updates the Resource identified by the given URL with the given - // arguments. The Resource must have been Touch'd previously, or this - // returns an error. + // arguments. + // + // Update returns an error if the URL has not been pinned nor referenced as + // an outgoing URL of a different Resource. Update( ctx context.Context, now time.Time, @@ -40,6 +43,9 @@ type Store interface { errorString string, outgoing []URL, ) error + + // GC will garbage collect the store, removing any orphaned Resources. 
+ GC(context.Context) error } var migrations = &migrate.MemoryMigrationSource{Migrations: []*migrate.Migration{ @@ -74,13 +80,26 @@ var migrations = &migrate.MemoryMigrationSource{Migrations: []*migrate.Migration }, }} -/* -TODO -- initialization options - - cleanup period -- document SQLiteStore properly -- teardown the cleanup goroutine -*/ +// SQLiteStoreOpts are optional fields which can be provided to NewSQLiteStore. +// A nil SQLiteStoreOpts is equivalent to an empty one. +type SQLiteStoreOpts struct { + // Path to the database file to use. + // + // Defaults to ":memory:", indicating an in-memory database will be used. + Path string +} + +func (o *SQLiteStoreOpts) withDefaults() *SQLiteStoreOpts { + if o == nil { + o = new(SQLiteStoreOpts) + } + + if o.Path == "" { + o.Path = ":memory:" + } + + return o +} type SQLiteStore struct { db *sql.DB @@ -88,10 +107,12 @@ type SQLiteStore struct { var _ Store = (*SQLiteStore)(nil) -// NewInMemStore returns a Store implementation which uses an in-memory SQLite +// NewSQLiteStore returns a Store implementation which uses a SQLite // db. -func NewInMemStore() *SQLiteStore { - db, err := sql.Open("sqlite3", ":memory:?_foreign_keys=1") +func NewSQLiteStore(o *SQLiteStoreOpts) *SQLiteStore { + o = o.withDefaults() + + db, err := sql.Open("sqlite3", o.Path+"?_foreign_keys=1") if err != nil { panic(fmt.Errorf("opening sqlite in memory: %w", err)) } @@ -141,7 +162,8 @@ func (s *SQLiteStore) GetByStatus(status ResourceStatus) miter.Iterator[Resource JOIN urls ON (urls.id = resources.url_id) LEFT JOIN incoming ON (incoming.url_id = resources.url_id) LEFT JOIN outgoing ON (outgoing.url_id = resources.url_id) - WHERE status = ? 
+ AND (pinned OR incoming.urls IS NOT NULL)` return miter.Lazily(func(ctx context.Context) (miter.Iterator[Resource], error) { rows, err := s.db.QueryContext(ctx, query, status) @@ -208,10 +230,18 @@ func (s *SQLiteStore) GetURLsByLastChecked( olderThan time.Time, ) miter.Iterator[URL] { const query = ` + WITH + incoming(url_id, urls) AS ( + SELECT to_url_id, COUNT(1) + FROM links + GROUP BY to_url_id + ) SELECT url FROM resources JOIN urls ON (urls.id = resources.url_id) - WHERE last_checked < ?` + LEFT JOIN incoming ON (incoming.url_id = resources.url_id) + WHERE last_checked < ? + AND (pinned OR incoming.urls IS NOT NULL)` return miter.Lazily(func(ctx context.Context) (miter.Iterator[URL], error) { rows, err := s.db.QueryContext(ctx, query, olderThan.Unix()) @@ -379,7 +409,8 @@ func (s *SQLiteStore) Update( return nil } -func (s *SQLiteStore) deleteOrphans(ctx context.Context) error { +// GC implements the method for the Store interface. +func (s *SQLiteStore) GC(ctx context.Context) error { const query = ` WITH orphans AS ( SELECT url_id FROM resources diff --git a/store_test.go b/store_test.go index 7b0467f..bace044 100644 --- a/store_test.go +++ b/store_test.go @@ -20,7 +20,7 @@ func newSQLiteStoreHarness() *sqliteStoreHarness { var ( ctx = context.Background() now = time.Now().Truncate(time.Second).UTC() - store = NewInMemStore() + store = NewSQLiteStore(nil) ) return &sqliteStoreHarness{ @@ -77,10 +77,11 @@ func TestSQLiteStore(t *testing.T) { h.assertGetByStatus(t, nil, ResourceStatusOK) h.assertGetByStatus(t, []Resource{a, b}, ResourceStatusUnknown) - a.Pinned = false assert.NoError(t, h.store.SetPinned(h.ctx, []URL{urlB})) h.assertGetByStatus(t, nil, ResourceStatusOK) - h.assertGetByStatus(t, []Resource{a, b}, ResourceStatusUnknown) + // GetByStatus should not return resources which are not pinned and have + // no incoming links + h.assertGetByStatus(t, []Resource{b}, ResourceStatusUnknown) }) t.Run("Update", func(t *testing.T) { @@ -174,10 +175,15 @@ 
func TestSQLiteStore(t *testing.T) { assertGetURLsByLastChecked([]URL{urlA, urlC}, nowB) assertGetURLsByLastChecked([]URL{urlA, urlC}, nowA.Add(1*time.Second)) assertGetURLsByLastChecked([]URL{urlC}, nowA) - assertGetURLsByLastChecked([]URL{urlC}, h.now) + + // A Resource which is not pinned and has no incoming links should not + // be returned + assert.NoError(t, h.store.SetPinned(h.ctx, []URL{urlA, urlB})) + assertGetURLsByLastChecked([]URL{urlA, urlB}, nowB.Add(1*time.Second)) + assertGetURLsByLastChecked([]URL{}, nowA) }) - t.Run("deleteOrphans", func(t *testing.T) { + t.Run("GC", func(t *testing.T) { t.Parallel() var ( @@ -203,7 +209,11 @@ func TestSQLiteStore(t *testing.T) { h.ctx, h.now, urlC, ResourceStatusUnknown, "", []URL{urlD}, )) - assert.NoError(t, h.store.deleteOrphans(h.ctx)) + assert.NoError(t, h.store.GC(h.ctx)) + h.assertGetByStatus(t, []Resource{a, b}, ResourceStatusUnknown) + + // Calling again shouldn't do anything + assert.NoError(t, h.store.GC(h.ctx)) h.assertGetByStatus(t, []Resource{a, b}, ResourceStatusUnknown) }) }