2023-12-28 15:22:58 +00:00
|
|
|
package deadlinks
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bufio"
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
2023-12-30 10:22:09 +00:00
|
|
|
|
2023-12-30 10:33:47 +00:00
|
|
|
"github.com/mmcdole/gofeed"
|
2023-12-30 10:22:09 +00:00
|
|
|
"golang.org/x/net/html"
|
2023-12-28 15:22:58 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Parser is a thread-safe type which can parse URLs out of the body of a file,
// using a mimeType to determine what kind of file it is.
//
// The returned URLs may be either relative or absolute, and may or may not
// include other URL elements like scheme, host, etc...
//
// It is not required that the Parser fully reads the body io.Reader.
//
// If an error is returned then some set of URLs may still be returned.
type Parser interface {
	// Parse extracts URLs from body, interpreting it according to mimeType.
	// Unrecognized MIME types are not an error; implementations may return
	// an empty URL set for them.
	Parse(mimeType string, body io.Reader) ([]URL, error)
}
|
|
|
|
|
|
|
|
// parser is the default Parser implementation returned by NewParser. It keeps
// a pool of bufio.Readers so repeated Parse calls can reuse read buffers
// rather than allocating a fresh one per call.
type parser struct {
	// brPool holds *bufio.Reader values; see NewParser for the New func.
	brPool sync.Pool
}
|
|
|
|
|
|
|
|
// NewParser returns a basic Parser supporting some commonly used document
|
|
|
|
// types which support hyperlinks. The returned Parser will return an empty URL
|
|
|
|
// set for all unsupported MIME types.
|
|
|
|
//
|
|
|
|
// Supported MIME types:
|
|
|
|
// - text/gemtext
|
2023-12-30 10:22:09 +00:00
|
|
|
// - text/html
|
2023-12-30 10:33:47 +00:00
|
|
|
// - application/rss+xml
|
|
|
|
// - application/atom+xml
|
|
|
|
// - application/feed+json
|
2023-12-28 15:22:58 +00:00
|
|
|
func NewParser() Parser {
|
|
|
|
return &parser{
|
|
|
|
brPool: sync.Pool{
|
|
|
|
New: func() any { return bufio.NewReader(nil) },
|
|
|
|
},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseGemtext(body *bufio.Reader) ([]URL, error) {
|
|
|
|
var (
|
|
|
|
urls []URL
|
|
|
|
errs []error
|
|
|
|
)
|
|
|
|
|
|
|
|
for {
|
|
|
|
line, err := body.ReadString('\n')
|
|
|
|
|
|
|
|
if strings.HasPrefix(line, "=> ") {
|
|
|
|
if parts := strings.Fields(line); len(parts) >= 2 {
|
|
|
|
u, err := ParseURL(parts[1])
|
|
|
|
if err != nil {
|
|
|
|
errs = append(errs, fmt.Errorf(
|
|
|
|
"parsing URL from line %q: %w", line, err,
|
|
|
|
))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
urls = append(urls, u)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if errors.Is(err, io.EOF) {
|
|
|
|
break
|
|
|
|
} else if err != nil {
|
|
|
|
errs = append(errs, err)
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return urls, errors.Join(errs...)
|
|
|
|
}
|
|
|
|
|
2023-12-30 10:22:09 +00:00
|
|
|
// parses a URL string out of a token, if there is one. Assumes that Token is a
|
|
|
|
// StartTagToken.
|
|
|
|
func parseHTMLToken(tok html.Token) string {
|
|
|
|
for _, attr := range tok.Attr {
|
|
|
|
switch attr.Key {
|
|
|
|
case "href":
|
|
|
|
switch tok.Data {
|
|
|
|
case "a", "link":
|
|
|
|
return attr.Val
|
|
|
|
}
|
|
|
|
|
|
|
|
case "src":
|
|
|
|
switch tok.Data {
|
|
|
|
case "img", "script":
|
|
|
|
return attr.Val
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
|
|
|
|
func parseHTML(body *bufio.Reader) ([]URL, error) {
|
|
|
|
var (
|
|
|
|
t = html.NewTokenizer(body)
|
|
|
|
urls []URL
|
|
|
|
errs []error
|
|
|
|
)
|
|
|
|
|
|
|
|
for {
|
|
|
|
switch t.Next() {
|
|
|
|
case html.ErrorToken:
|
|
|
|
if errors.Is(t.Err(), io.EOF) {
|
|
|
|
return urls, nil
|
|
|
|
}
|
|
|
|
return urls, errors.Join(append(errs, t.Err())...)
|
|
|
|
|
|
|
|
case html.StartTagToken, html.SelfClosingTagToken:
|
|
|
|
tok := t.Token()
|
|
|
|
if urlStr := parseHTMLToken(tok); urlStr != "" {
|
|
|
|
url, err := ParseURL(urlStr)
|
|
|
|
if err != nil {
|
|
|
|
errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
urls = append(urls, url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-30 10:33:47 +00:00
|
|
|
func parseFeed(body *bufio.Reader) ([]URL, error) {
|
|
|
|
feed, err := gofeed.NewParser().Parse(body)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var (
|
|
|
|
urls []URL
|
|
|
|
errs []error
|
|
|
|
)
|
|
|
|
|
|
|
|
tryAppend := func(s string) {
|
|
|
|
if s == "" {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
url, err := ParseURL(s)
|
|
|
|
if err != nil {
|
|
|
|
errs = append(errs, fmt.Errorf("parsing URL %q: %w", s, err))
|
|
|
|
} else {
|
|
|
|
urls = append(urls, url)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
tryAppend(feed.Link)
|
|
|
|
tryAppend(feed.FeedLink)
|
|
|
|
for _, l := range feed.Links {
|
|
|
|
tryAppend(l)
|
|
|
|
}
|
|
|
|
|
|
|
|
if feed.Image != nil {
|
|
|
|
tryAppend(feed.Image.URL)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, item := range feed.Items {
|
|
|
|
tryAppend(item.Link)
|
|
|
|
for _, l := range item.Links {
|
|
|
|
tryAppend(l)
|
|
|
|
}
|
|
|
|
|
|
|
|
if item.Image != nil {
|
|
|
|
tryAppend(item.Image.URL)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, enc := range item.Enclosures {
|
|
|
|
if enc != nil {
|
|
|
|
tryAppend(enc.URL)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return urls, errors.Join(errs...)
|
|
|
|
}
|
|
|
|
|
2023-12-28 15:22:58 +00:00
|
|
|
// parsersByMimeType maps each supported MIME type to the function used to
// extract URLs from documents of that type. MIME types absent from this map
// are unsupported and yield an empty URL set from Parse.
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
	"text/gemini":           parseGemtext,
	"text/html":             parseHTML,
	"application/rss+xml":   parseFeed,
	"application/atom+xml":  parseFeed,
	"application/feed+json": parseFeed,
}
|
|
|
|
|
|
|
|
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
|
|
|
|
fn, ok := parsersByMimeType[mimeType]
|
|
|
|
if !ok {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
br := p.brPool.Get().(*bufio.Reader)
|
|
|
|
br.Reset(body)
|
|
|
|
defer p.brPool.Put(br)
|
|
|
|
|
|
|
|
return fn(br)
|
|
|
|
}
|