package deadlinks import ( "bufio" "errors" "fmt" "io" "strings" "sync" "github.com/mmcdole/gofeed" "golang.org/x/net/html" ) // Parser is a thread-safe type which can parse URLs out of the body of a file, // using a mimeType to determine what kind of file it is. // // The returned URLs may be either relative or absolute, and may or may not // include other URL elements like scheme, host, etc... // // It is not required that the Parser fully reads the body io.Reader. // // If an error is returned then some set of URLs may still be returned. type Parser interface { Parse(mimeType string, body io.Reader) ([]URL, error) } type parser struct { brPool sync.Pool } // NewParser returns a basic Parser supporting some commonly used document // types which support hyperlinks. The returned Parser will return an empty URL // set for all unsupported MIME types. // // Supported MIME types: // - text/gemtext // - text/html // - application/rss+xml // - application/atom+xml // - application/feed+json func NewParser() Parser { return &parser{ brPool: sync.Pool{ New: func() any { return bufio.NewReader(nil) }, }, } } func parseGemtext(body *bufio.Reader) ([]URL, error) { var ( urls []URL errs []error ) for { line, err := body.ReadString('\n') if strings.HasPrefix(line, "=> ") { if parts := strings.Fields(line); len(parts) >= 2 { u, err := ParseURL(parts[1]) if err != nil { errs = append(errs, fmt.Errorf( "parsing URL from line %q: %w", line, err, )) continue } urls = append(urls, u) } } if errors.Is(err, io.EOF) { break } else if err != nil { errs = append(errs, err) break } } return urls, errors.Join(errs...) } // parses a URL string out of a token, if there is one. Assumes that Token is a // StartTagToken. func parseHTMLToken(tok html.Token) string { for _, attr := range tok.Attr { switch attr.Key { case "href": switch tok.Data { case "a", "link": return attr.Val } case "src": switch tok.Data { case "img", "script": return attr.Val } } } return "" } func parseHTML(body *bufio.Reader) ([]URL, error) { var ( t = html.NewTokenizer(body) urls []URL errs []error ) for { switch t.Next() { case html.ErrorToken: if errors.Is(t.Err(), io.EOF) { return urls, nil } return urls, errors.Join(append(errs, t.Err())...) case html.StartTagToken, html.SelfClosingTagToken: tok := t.Token() if urlStr := parseHTMLToken(tok); urlStr != "" { url, err := ParseURL(urlStr) if err != nil { errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err)) continue } urls = append(urls, url) } } } } func parseFeed(body *bufio.Reader) ([]URL, error) { feed, err := gofeed.NewParser().Parse(body) if err != nil { return nil, err } var ( urls []URL errs []error ) tryAppend := func(s string) { if s == "" { return } url, err := ParseURL(s) if err != nil { errs = append(errs, fmt.Errorf("parsing URL %q: %w", s, err)) } else { urls = append(urls, url) } } tryAppend(feed.Link) tryAppend(feed.FeedLink) for _, l := range feed.Links { tryAppend(l) } if feed.Image != nil { tryAppend(feed.Image.URL) } for _, item := range feed.Items { tryAppend(item.Link) for _, l := range item.Links { tryAppend(l) } if item.Image != nil { tryAppend(item.Image.URL) } for _, enc := range item.Enclosures { if enc != nil { tryAppend(enc.URL) } } } return urls, errors.Join(errs...) } var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){ "text/gemini": parseGemtext, "text/html": parseHTML, "application/rss+xml": parseFeed, "application/atom+xml": parseFeed, "application/feed+json": parseFeed, } func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) { fn, ok := parsersByMimeType[mimeType] if !ok { return nil, nil } br := p.brPool.Get().(*bufio.Reader) br.Reset(body) defer p.brPool.Put(br) return fn(br) }