package deadlinks import ( "bufio" "errors" "fmt" "io" "strings" "sync" ) // Parser is a thread-safe type which can parse URLs out of the body of a file, // using a mimeType to determine what kind of file it is. // // The returned URLs may be either relative or absolute, and may or may not // include other URL elements like scheme, host, etc... // // It is not required that the Parser fully reads the body io.Reader. // // If an error is returned then some set of URLs may still be returned. type Parser interface { Parse(mimeType string, body io.Reader) ([]URL, error) } type parser struct { brPool sync.Pool } // NewParser returns a basic Parser supporting some commonly used document // types which support hyperlinks. The returned Parser will return an empty URL // set for all unsupported MIME types. // // Supported MIME types: // - text/gemtext // - text/html (TODO) // - application/rss+xml (TODO) // - application/atom+xml (TODO) func NewParser() Parser { return &parser{ brPool: sync.Pool{ New: func() any { return bufio.NewReader(nil) }, }, } } func parseGemtext(body *bufio.Reader) ([]URL, error) { var ( urls []URL errs []error ) for { line, err := body.ReadString('\n') if strings.HasPrefix(line, "=> ") { if parts := strings.Fields(line); len(parts) >= 2 { u, err := ParseURL(parts[1]) if err != nil { errs = append(errs, fmt.Errorf( "parsing URL from line %q: %w", line, err, )) continue } urls = append(urls, u) } } if errors.Is(err, io.EOF) { break } else if err != nil { errs = append(errs, err) break } } return urls, errors.Join(errs...) } var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){ "text/gemini": parseGemtext, } func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) { fn, ok := parsersByMimeType[mimeType] if !ok { return nil, nil } br := p.brPool.Get().(*bufio.Reader) br.Reset(body) defer p.brPool.Put(br) return fn(br) }