deadlinks/parser.go

96 lines
2.0 KiB
Go
Raw Normal View History

2023-12-28 15:22:58 +00:00
package deadlinks
import (
"bufio"
"errors"
"fmt"
"io"
"strings"
"sync"
)
// Parser is a thread-safe type which can parse URLs out of the body of a file,
// using a MIME type to determine what kind of file it is.
//
// The returned URLs may be either relative or absolute, and may or may not
// include other URL elements like scheme, host, etc.
//
// A Parser is not required to read the body io.Reader to completion.
//
// If an error is returned then some set of URLs may still be returned
// alongside it.
type Parser interface {
// Parse extracts the URLs found in body, interpreting body according to
// mimeType.
Parse(mimeType string, body io.Reader) ([]URL, error)
}
// parser is the Parser implementation returned by NewParser.
type parser struct {
// brPool holds *bufio.Reader values which Parse borrows for the duration of
// a call and then returns.
brPool sync.Pool
}
// NewParser constructs a basic Parser covering some commonly used document
// types which can contain hyperlinks. When asked to parse a MIME type it does
// not support, the returned Parser yields an empty URL set.
//
// Supported MIME types:
//   - text/gemini
//   - text/html (TODO)
//   - application/rss+xml (TODO)
//   - application/atom+xml (TODO)
func NewParser() Parser {
	p := new(parser)

	// Pooled readers are created without an underlying source; they are
	// attached to a real body when they are taken out of the pool.
	p.brPool = sync.Pool{
		New: func() any { return bufio.NewReader(nil) },
	}

	return p
}
// parseGemtext extracts the target URL of every link line in the gemtext
// document read from body.
//
// A link line begins with "=>", optionally followed by whitespace (spaces or
// tabs), then the URL, then optionally more whitespace and a human-readable
// label. Only the URL portion is parsed; a "=>" line with no URL is ignored.
//
// A link line whose URL fails to parse is recorded in the returned error but
// does not stop parsing. A read error other than io.EOF stops parsing and is
// also recorded. In both cases the URLs which were successfully parsed are
// still returned alongside the (joined) error.
func parseGemtext(body *bufio.Reader) ([]URL, error) {
	var (
		urls []URL
		errs []error
	)

	for {
		// ReadString can return a final, unterminated line together with a
		// non-nil error, so the line is always processed before the read
		// error is inspected.
		line, readErr := body.ReadString('\n')

		if rest, isLink := strings.CutPrefix(line, "=>"); isLink {
			if fields := strings.Fields(rest); len(fields) > 0 {
				u, err := ParseURL(fields[0])
				if err != nil {
					errs = append(errs, fmt.Errorf(
						"parsing URL from line %q: %w", line, err,
					))
				} else {
					urls = append(urls, u)
				}
			}
		}

		// The read error must be checked on every iteration, including ones
		// where the URL failed to parse; otherwise a failed read would be
		// dropped and the reader would be read from again.
		if errors.Is(readErr, io.EOF) {
			break
		}
		if readErr != nil {
			errs = append(errs, readErr)
			break
		}
	}

	return urls, errors.Join(errs...)
}
// parsersByMimeType maps each supported MIME type to the function which
// extracts URLs from a document of that type. MIME types without an entry
// here are treated as unsupported by Parse, which returns no URLs for them.
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
"text/gemini": parseGemtext,
}
// Parse implements the Parser interface. It selects a document parser based
// on mimeType and runs it over body, returning whatever URLs and error that
// parser produces.
//
// mimeType may carry parameters (e.g. "text/gemini; charset=utf-8") and may
// use any letter case; only the lowercased media type before the first ';'
// is used for the lookup. If no parser is registered for the resulting type
// then Parse returns nil, nil without reading from body.
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
	// Media types are case-insensitive and may be followed by ";"-separated
	// parameters, neither of which affects which parser applies.
	mediaType, _, _ := strings.Cut(mimeType, ";")
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))

	fn, ok := parsersByMimeType[mediaType]
	if !ok {
		return nil, nil
	}

	br := p.brPool.Get().(*bufio.Reader)
	br.Reset(body)
	defer func() {
		// Detach the reader from body before pooling it, so that the pool
		// does not keep the caller's reader alive after this call returns.
		br.Reset(nil)
		p.brPool.Put(br)
	}()

	return fn(br)
}