Implement Parser with gemtext support

This commit is contained in:
Brian Picciano 2023-12-28 16:22:58 +01:00
parent 571da7e2ac
commit 8007f090f2
2 changed files with 219 additions and 0 deletions

95
parser.go Normal file
View File

@ -0,0 +1,95 @@
package deadlinks
import (
"bufio"
"errors"
"fmt"
"io"
"strings"
"sync"
)
// Parser is a thread-safe type which can parse URLs out of the body of a file,
// using a mimeType to determine what kind of file it is.
//
// The returned URLs may be either relative or absolute, and may or may not
// include other URL elements like scheme, host, etc...
//
// Implementations are not required to read the body io.Reader to completion.
//
// A non-nil error does not imply an empty result: whatever URLs were parsed
// successfully may still be returned alongside the error.
type Parser interface {
// Parse extracts the URLs found in body, interpreting body according to
// mimeType.
Parse(mimeType string, body io.Reader) ([]URL, error)
}
// parser is the Parser implementation constructed by NewParser.
type parser struct {
// brPool holds *bufio.Reader values which Parse borrows for the duration of a
// call and then hands back, rather than creating a new reader every time.
brPool sync.Pool
}
// NewParser returns a basic Parser supporting some commonly used document
// types which support hyperlinks. The returned Parser will return an empty URL
// set for all unsupported MIME types.
//
// Supported MIME types:
//   - text/gemini
//   - text/html (TODO)
//   - application/rss+xml (TODO)
//   - application/atom+xml (TODO)
func NewParser() Parser {
	return &parser{
		brPool: sync.Pool{
			// Pooled readers start out unattached; Parse points each one at
			// the body it is given via Reset before using it.
			New: func() any { return bufio.NewReader(nil) },
		},
	}
}
// parseGemtext reads body line by line and collects the URL of every gemtext
// link line it finds.
//
// A link line starts with "=>", followed by optional whitespace, the URL, and
// optionally more whitespace and a human-readable label. Only the URL is used.
//
// A link line whose URL fails to parse does not stop the scan: the failure is
// recorded and the remaining lines are still processed. Reading stops at EOF
// or at the first read error. All recorded errors are combined with
// errors.Join, so the returned error is nil only if nothing went wrong, and
// the returned URLs are whatever was parsed successfully either way.
func parseGemtext(body *bufio.Reader) ([]URL, error) {
	var (
		urls []URL
		errs []error
	)

	for {
		// ReadString may hand back a final, unterminated line together with
		// a non-nil error, so the line is always inspected before readErr.
		line, readErr := body.ReadString('\n')

		if rest, isLink := strings.CutPrefix(line, "=>"); isLink {
			if fields := strings.Fields(rest); len(fields) > 0 {
				u, err := ParseURL(fields[0])
				if err != nil {
					errs = append(errs, fmt.Errorf(
						"parsing URL from line %q: %w", line, err,
					))
				} else {
					urls = append(urls, u)
				}
			}
		}

		// This check must run on every iteration, including ones where the
		// URL failed to parse; otherwise a read error accompanying a bad
		// link line would be dropped and the loop would read again from a
		// reader which has already failed.
		if errors.Is(readErr, io.EOF) {
			break
		}
		if readErr != nil {
			errs = append(errs, readErr)
			break
		}
	}

	return urls, errors.Join(errs...)
}
// parsersByMimeType maps each supported MIME type to the function which
// extracts URLs from a body of that type. A MIME type with no entry here is
// unsupported, and parser.Parse returns no URLs and no error for it.
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
"text/gemini": parseGemtext,
}
// Parse implements Parser. It looks up the parsing function registered for
// mimeType in parsersByMimeType and runs it over body, reading through a
// bufio.Reader borrowed from the pool.
//
// If no function is registered for mimeType then body is not read at all and
// (nil, nil) is returned.
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
	parse, supported := parsersByMimeType[mimeType]
	if !supported {
		return nil, nil
	}

	br := p.brPool.Get().(*bufio.Reader)
	br.Reset(body)
	defer func() {
		// Detach the reader from body before pooling it, so that an idle
		// pooled reader does not keep the caller's body (and whatever it
		// references) reachable.
		br.Reset(nil)
		p.brPool.Put(br)
	}()

	return parse(br)
}

124
parser_test.go Normal file
View File

@ -0,0 +1,124 @@
package deadlinks
import (
"bytes"
"fmt"
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
func TestParser(t *testing.T) {
t.Parallel()
parser := NewParser()
tests := []struct {
mimeType string
body string
wantURLs []URL
wantErrs []string
}{
{
"image/jpg",
"ANYTHING",
nil,
nil,
},
{
"text/gemini",
``,
nil,
nil,
},
{
"text/gemini",
`
# HEADER
=> https://foo.com some link
=> empty/path
=> /foo/bar here's an absolute path
=> what.com a domain?
ok here's some text
`,
[]URL{
"https://foo.com",
"empty/path",
"/foo/bar",
"what.com",
},
nil,
},
{
"text/gemini",
`
# HEADER
=> https://foo.com some link
=> empty/path
=> /foo/bar here's an absolute path
=> what.com a domain?
ok here's some text
`,
[]URL{
"https://foo.com",
"empty/path",
"/foo/bar",
"what.com",
},
nil,
},
{
"text/gemini",
`
=> : NO FISH ALLOWED
=> /good/dog
`,
[]URL{"/good/dog"},
[]string{
`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
},
},
}
for i := range tests {
test := tests[i]
name := fmt.Sprintf(
"%d-%s", i, strings.ReplaceAll(test.mimeType, "/", "_"),
)
t.Run(name, func(t *testing.T) {
t.Parallel()
body := bytes.NewBufferString(test.body)
gotURLs, gotErr := parser.Parse(test.mimeType, body)
assert.Equal(t, test.wantURLs, gotURLs)
if len(test.wantErrs) == 0 {
assert.NoError(t, gotErr)
return
}
type joinedErr interface {
Unwrap() []error
}
var gotErrs []error
if joinedErr, ok := gotErr.(joinedErr); ok {
gotErrs = joinedErr.Unwrap()
} else if gotErr != nil {
gotErrs = []error{gotErr}
}
gotErrStrs := make([]string, len(gotErrs))
for i := range gotErrs {
gotErrStrs[i] = gotErrs[i].Error()
}
assert.Equal(t, test.wantErrs, gotErrStrs)
})
}
}