96 lines
2.0 KiB
Go
96 lines
2.0 KiB
Go
package deadlinks
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// Parser is a thread-safe type which can parse URLs out of the body of a file,
|
|
// using a mimeType to determine what kind of file it is.
|
|
//
|
|
// The returned URLs may be either relative or absolute, and may or may not
|
|
// include other URL elements like scheme, host, etc...
|
|
//
|
|
// It is not required that the Parser fully reads the body io.Reader.
|
|
//
|
|
// If an error is returned then some set of URLs may still be returned.
|
|
type Parser interface {
|
|
Parse(mimeType string, body io.Reader) ([]URL, error)
|
|
}
|
|
|
|
// parser is the Parser implementation returned by NewParser.
type parser struct {
	// brPool holds *bufio.Reader values which are reused across Parse calls.
	brPool sync.Pool
}
|
|
|
|
// NewParser returns a basic Parser supporting some commonly used document
|
|
// types which support hyperlinks. The returned Parser will return an empty URL
|
|
// set for all unsupported MIME types.
|
|
//
|
|
// Supported MIME types:
|
|
// - text/gemtext
|
|
// - text/html (TODO)
|
|
// - application/rss+xml (TODO)
|
|
// - application/atom+xml (TODO)
|
|
func NewParser() Parser {
|
|
return &parser{
|
|
brPool: sync.Pool{
|
|
New: func() any { return bufio.NewReader(nil) },
|
|
},
|
|
}
|
|
}
|
|
|
|
func parseGemtext(body *bufio.Reader) ([]URL, error) {
|
|
var (
|
|
urls []URL
|
|
errs []error
|
|
)
|
|
|
|
for {
|
|
line, err := body.ReadString('\n')
|
|
|
|
if strings.HasPrefix(line, "=> ") {
|
|
if parts := strings.Fields(line); len(parts) >= 2 {
|
|
u, err := ParseURL(parts[1])
|
|
if err != nil {
|
|
errs = append(errs, fmt.Errorf(
|
|
"parsing URL from line %q: %w", line, err,
|
|
))
|
|
continue
|
|
}
|
|
|
|
urls = append(urls, u)
|
|
}
|
|
}
|
|
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
} else if err != nil {
|
|
errs = append(errs, err)
|
|
break
|
|
}
|
|
}
|
|
|
|
return urls, errors.Join(errs...)
|
|
}
|
|
|
|
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
|
|
"text/gemini": parseGemtext,
|
|
}
|
|
|
|
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
|
|
fn, ok := parsersByMimeType[mimeType]
|
|
if !ok {
|
|
return nil, nil
|
|
}
|
|
|
|
br := p.brPool.Get().(*bufio.Reader)
|
|
br.Reset(body)
|
|
defer p.brPool.Put(br)
|
|
|
|
return fn(br)
|
|
}
|