From 8007f090f2ac6c7504a58c06e73dfb46a95cb29a Mon Sep 17 00:00:00 2001
From: Brian Picciano
Date: Thu, 28 Dec 2023 16:22:58 +0100
Subject: [PATCH] Implement Parser with gemtext support

---
 parser.go      | 131 ++++++++++++++++++++++++++++++++++++++++++
 parser_test.go | 150 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 281 insertions(+)
 create mode 100644 parser.go
 create mode 100644 parser_test.go

diff --git a/parser.go b/parser.go
new file mode 100644
index 0000000..8c2a9c3
--- /dev/null
+++ b/parser.go
@@ -0,0 +1,131 @@
+package deadlinks
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+	"sync"
+)
+
+// Parser is a thread-safe type which can parse URLs out of the body of a file,
+// using a mimeType to determine what kind of file it is.
+//
+// The returned URLs may be either relative or absolute, and may or may not
+// include other URL elements like scheme, host, etc...
+//
+// It is not required that the Parser fully reads the body io.Reader.
+//
+// If an error is returned then some set of URLs may still be returned.
+type Parser interface {
+	Parse(mimeType string, body io.Reader) ([]URL, error)
+}
+
+// parser is the Parser implementation returned by NewParser. Its bufio.Readers
+// are pooled so that they can be reused across calls to Parse.
+type parser struct {
+	brPool sync.Pool
+}
+
+// NewParser returns a basic Parser supporting some commonly used document
+// types which support hyperlinks. The returned Parser will return an empty URL
+// set for all unsupported MIME types.
+//
+// Supported MIME types:
+//   - text/gemini
+//   - text/html (TODO)
+//   - application/rss+xml (TODO)
+//   - application/atom+xml (TODO)
+func NewParser() Parser {
+	return &parser{
+		brPool: sync.Pool{
+			New: func() any { return bufio.NewReader(nil) },
+		},
+	}
+}
+
+// gemtextLinkTarget returns the raw URL of a gemtext link line. A link line
+// begins with "=>", followed by optional whitespace, the URL, and an optional
+// whitespace-separated label. False is returned if line is not a link line, or
+// if it has no URL.
+func gemtextLinkTarget(line string) (string, bool) {
+	rest, ok := strings.CutPrefix(line, "=>")
+	if !ok {
+		return "", false
+	}
+
+	fields := strings.Fields(rest)
+	if len(fields) == 0 {
+		return "", false
+	}
+
+	return fields[0], true
+}
+
+// parseGemtext returns the URL of every link line in a gemtext document. Lines
+// within preformatted blocks, which are toggled by lines beginning with "```",
+// are never treated as link lines.
+//
+// A malformed URL does not stop parsing. One error per malformed URL, plus any
+// non-EOF error from reading body, are joined into the returned error.
+func parseGemtext(body *bufio.Reader) ([]URL, error) {
+	var (
+		urls   []URL
+		errs   []error
+		preFmt bool
+	)
+
+	for {
+		// ReadString can return data alongside an error (e.g. a final line
+		// with no trailing newline), so line is always processed before
+		// readErr is checked.
+		line, readErr := body.ReadString('\n')
+
+		if strings.HasPrefix(line, "```") {
+			preFmt = !preFmt
+		} else if rawURL, ok := gemtextLinkTarget(line); ok && !preFmt {
+			if u, err := ParseURL(rawURL); err != nil {
+				errs = append(errs, fmt.Errorf(
+					"parsing URL from line %q: %w", line, err,
+				))
+			} else {
+				urls = append(urls, u)
+			}
+		}
+
+		if readErr != nil {
+			if !errors.Is(readErr, io.EOF) {
+				errs = append(errs, readErr)
+			}
+			break
+		}
+	}
+
+	return urls, errors.Join(errs...)
+}
+
+// parsersByMimeType maps each supported MIME type to the function which
+// extracts URLs from documents of that type.
+var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
+	"text/gemini": parseGemtext,
+}
+
+// Parse implements Parser; unsupported MIME types yield no URLs and no error.
+func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
+	fn, ok := parsersByMimeType[mimeType]
+	if !ok {
+		return nil, nil
+	}
+
+	br := p.brPool.Get().(*bufio.Reader)
+	br.Reset(body)
+	defer func() {
+		// Detach body before pooling so that the idle reader doesn't keep
+		// it alive.
+		br.Reset(nil)
+		p.brPool.Put(br)
+	}()
+
+	return fn(br)
+}
diff --git a/parser_test.go b/parser_test.go
new file mode 100644
index 0000000..eec6b5d
--- /dev/null
+++ b/parser_test.go
@@ -0,0 +1,150 @@
+package deadlinks
+
+import (
+	"bytes"
+	"fmt"
+	"strings"
+	"testing"
+	"testing/iotest"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestParser(t *testing.T) {
+	t.Parallel()
+
+	parser := NewParser()
+
+	tests := []struct {
+		mimeType string
+		body     string
+		wantURLs []URL
+		wantErrs []string
+	}{
+		{
+			"image/jpg",
+			"ANYTHING",
+			nil,
+			nil,
+		},
+		{
+			"text/gemini",
+			``,
+			nil,
+			nil,
+		},
+		{
+			"text/gemini",
+			`
+# HEADER
+
+=> https://foo.com some link
+=> empty/path
+=> /foo/bar here's an absolute path
+=> what.com a domain?
+
+ok here's some text
+			`,
+			[]URL{
+				"https://foo.com",
+				"empty/path",
+				"/foo/bar",
+				"what.com",
+			},
+			nil,
+		},
+		{
+			"text/gemini",
+			`
+# HEADER
+
+=> https://foo.com some link
+=> empty/path
+=> /foo/bar here's an absolute path
+=> what.com a domain?
+
+ok here's some text
+			`,
+			[]URL{
+				"https://foo.com",
+				"empty/path",
+				"/foo/bar",
+				"what.com",
+			},
+			nil,
+		},
+		{
+			"text/gemini",
+			`
+=> : NO FISH ALLOWED
+=> /good/dog
+`,
+			[]URL{"/good/dog"},
+			[]string{
+				`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
+			},
+		},
+		{
+			"text/gemini",
+			"=>/no/space\n=>\t/tab\tlabel\n=>\n",
+			[]URL{"/no/space", "/tab"},
+			nil,
+		},
+		{
+			"text/gemini",
+			"```\n=> /in/pre\n```\n=> /after/pre\n",
+			[]URL{"/after/pre"},
+			nil,
+		},
+	}
+
+	for i := range tests {
+		test := tests[i]
+		name := fmt.Sprintf(
+			"%d-%s", i, strings.ReplaceAll(test.mimeType, "/", "_"),
+		)
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+
+			body := bytes.NewBufferString(test.body)
+			gotURLs, gotErr := parser.Parse(test.mimeType, body)
+
+			assert.Equal(t, test.wantURLs, gotURLs)
+			if len(test.wantErrs) == 0 {
+				assert.NoError(t, gotErr)
+				return
+			}
+
+			type joinedErr interface {
+				Unwrap() []error
+			}
+
+			var gotErrs []error
+			if joinedErr, ok := gotErr.(joinedErr); ok {
+				gotErrs = joinedErr.Unwrap()
+			} else if gotErr != nil {
+				gotErrs = []error{gotErr}
+			}
+
+			gotErrStrs := make([]string, len(gotErrs))
+			for i := range gotErrs {
+				gotErrStrs[i] = gotErrs[i].Error()
+			}
+
+			assert.Equal(t, test.wantErrs, gotErrStrs)
+		})
+	}
+}
+
+func TestParserReadError(t *testing.T) {
+	t.Parallel()
+
+	// The malformed final line is delivered together with a one-off read
+	// error, and both problems must be reported.
+	body := iotest.TimeoutReader(strings.NewReader("=> /ok\n=> :"))
+	gotURLs, gotErr := NewParser().Parse("text/gemini", body)
+
+	assert.Equal(t, []URL{"/ok"}, gotURLs)
+	assert.ErrorIs(t, gotErr, iotest.ErrTimeout)
+	assert.ErrorContains(t, gotErr, `parsing URL from line "=> :"`)
+}