A tool for crawling and finding links to URLs which no longer exist
 
deadlinks/parser.go

package deadlinks

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"strings"
	"sync"

	"github.com/mmcdole/gofeed"
	"golang.org/x/net/html"
)

// Parser is a thread-safe type which can parse URLs out of the body of a
// file, using a mimeType to determine what kind of file it is.
//
// The returned URLs may be either relative or absolute, and may or may not
// include other URL elements like scheme, host, etc.
//
// It is not required that the Parser fully read the body io.Reader.
//
// If an error is returned, a partial set of URLs may still be returned
// alongside it.
type Parser interface {
	Parse(mimeType string, body io.Reader) ([]URL, error)
}

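// parser is the default Parser implementation returned by NewParser. It pools
// bufio.Readers so that repeated Parse calls can reuse buffers rather than
// allocating a new one per call.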
type parser struct {
	brPool sync.Pool
}

// NewParser returns a basic Parser supporting some commonly used document
// types which can contain hyperlinks. The returned Parser will return an
// empty URL set for all unsupported MIME types.
//
// Supported MIME types:
// - text/gemini
// - text/html
// - application/rss+xml
// - application/atom+xml
// - application/feed+json
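//
// A minimal usage sketch (the "text/html" MIME type and the body reader are
// hypothetical stand-ins supplied by the caller, not by this package):
//
//	p := NewParser()
//	urls, err := p.Parse("text/html", body)
//	if err != nil {
//		// Even on error, urls may hold a partial result.
//	}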
func NewParser() Parser {
	return &parser{
		brPool: sync.Pool{
			New: func() any { return bufio.NewReader(nil) },
		},
	}
}

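// parseGemtext scans a gemtext document line by line and extracts the URL
// from every link line (a line beginning with "=> ").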
func parseGemtext(body *bufio.Reader) ([]URL, error) {
	var (
		urls []URL
		errs []error
	)
	for {
		line, err := body.ReadString('\n')
		if strings.HasPrefix(line, "=> ") {
			if parts := strings.Fields(line); len(parts) >= 2 {
				u, err := ParseURL(parts[1])
				if err != nil {
					errs = append(errs, fmt.Errorf(
						"parsing URL from line %q: %w", line, err,
					))
					continue
				}
				urls = append(urls, u)
			}
		}
		if errors.Is(err, io.EOF) {
			break
		} else if err != nil {
			errs = append(errs, err)
			break
		}
	}
	return urls, errors.Join(errs...)
}

// parseHTMLToken parses a URL string out of a token, if there is one. It
// assumes the token is a start tag or self-closing tag token.
func parseHTMLToken(tok html.Token) string {
	for _, attr := range tok.Attr {
		switch attr.Key {
		case "href":
			switch tok.Data {
			case "a", "link":
				return attr.Val
			}
		case "src":
			switch tok.Data {
			case "img", "script":
				return attr.Val
			}
		}
	}
	return ""
}

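// parseHTML tokenizes an HTML document and collects URLs from the href/src
// attributes of a, link, img, and script tags.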
func parseHTML(body *bufio.Reader) ([]URL, error) {
	var (
		t    = html.NewTokenizer(body)
		urls []URL
		errs []error
	)
	for {
		switch t.Next() {
		case html.ErrorToken:
			if errors.Is(t.Err(), io.EOF) {
				return urls, errors.Join(errs...)
			}
			return urls, errors.Join(append(errs, t.Err())...)
		case html.StartTagToken, html.SelfClosingTagToken:
			tok := t.Token()
			if urlStr := parseHTMLToken(tok); urlStr != "" {
				url, err := ParseURL(urlStr)
				if err != nil {
					errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
					continue
				}
				urls = append(urls, url)
			}
		}
	}
}

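// parseFeed parses an RSS, Atom, or JSON feed and collects every URL it can
// find: the feed's own links and image, plus each item's links, image, and
// enclosures.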
func parseFeed(body *bufio.Reader) ([]URL, error) {
	feed, err := gofeed.NewParser().Parse(body)
	if err != nil {
		return nil, err
	}

	var (
		urls []URL
		errs []error
	)

	tryAppend := func(s string) {
		if s == "" {
			return
		}
		url, err := ParseURL(s)
		if err != nil {
			errs = append(errs, fmt.Errorf("parsing URL %q: %w", s, err))
		} else {
			urls = append(urls, url)
		}
	}

	tryAppend(feed.Link)
	tryAppend(feed.FeedLink)
	for _, l := range feed.Links {
		tryAppend(l)
	}
	if feed.Image != nil {
		tryAppend(feed.Image.URL)
	}

	for _, item := range feed.Items {
		tryAppend(item.Link)
		for _, l := range item.Links {
			tryAppend(l)
		}
		if item.Image != nil {
			tryAppend(item.Image.URL)
		}
		for _, enc := range item.Enclosures {
			if enc != nil {
				tryAppend(enc.URL)
			}
		}
	}

	return urls, errors.Join(errs...)
}

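// parsersByMimeType maps each supported MIME type to the parsing function
// used for it.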
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
	"text/gemini":           parseGemtext,
	"text/html":             parseHTML,
	"application/rss+xml":   parseFeed,
	"application/atom+xml":  parseFeed,
	"application/feed+json": parseFeed,
}

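// Parse implements the Parser interface. It looks up the parsing function for
// mimeType and runs it against body through a pooled bufio.Reader; unsupported
// MIME types yield no URLs and no error.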
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
	fn, ok := parsersByMimeType[mimeType]
	if !ok {
		return nil, nil
	}

	br := p.brPool.Get().(*bufio.Reader)
	br.Reset(body)
	defer p.brPool.Put(br)

	return fn(br)
}