From 2124fd4639874f11e93c8e3d3f402750808d66ba Mon Sep 17 00:00:00 2001 From: Brian Picciano Date: Sat, 30 Dec 2023 11:22:09 +0100 Subject: [PATCH] Added HTTP(s)/HTML support --- client.go | 129 ++++++++++++++++++++++++++++++++++++++++++++++++- parser.go | 55 ++++++++++++++++++++- parser_test.go | 11 +++++ 3 files changed, 192 insertions(+), 3 deletions(-) diff --git a/client.go b/client.go index 068d985..7c1ff75 100644 --- a/client.go +++ b/client.go @@ -1,10 +1,12 @@ package deadlinks import ( + "bytes" "context" "errors" "fmt" "io" + "net/http" "git.sr.ht/~adnano/go-gemini" ) @@ -27,6 +29,13 @@ type ClientOpts struct { Do(context.Context, *gemini.Request) (*gemini.Response, error) } + // HTTPClient will be used for retrieving resources via the http protocol. + // + // Defaults to `new(http.Client)`. + HTTPClient interface { + Do(*http.Request) (*http.Response, error) + } + // MaxRedirects indicates the maximum number of redirects which will be // allowed when resolving a resource. A negative value indicates no // redirects are allowed. @@ -44,6 +53,10 @@ func (o *ClientOpts) withDefaults() *ClientOpts { o.GeminiClient = new(gemini.Client) } + if o.HTTPClient == nil { + o.HTTPClient = new(http.Client) + } + if o.MaxRedirects == 0 { o.MaxRedirects = 10 } @@ -61,11 +74,15 @@ type client struct { // // Supported URL schemas: // - gemini -// - http/https (TODO) +// - http/https func NewClient(opts *ClientOpts) Client { return &client{*opts.withDefaults()} } +func emptyReadCloser() io.ReadCloser { + return io.NopCloser(new(bytes.Buffer)) +} + func (c *client) getGemini( ctx context.Context, url URL, redirectDepth int, ) ( @@ -90,7 +107,8 @@ func (c *client) getGemini( case 1: // input required // Assume that input required is fine, even though we don't know the // MIME type. 
- return "", res.Body, nil + defer res.Body.Close() + return "", emptyReadCloser(), nil case 2: // success return res.Meta, res.Body, nil @@ -117,6 +135,111 @@ func (c *client) getGemini( } } +type concatHTTPBody struct { + orig io.ReadCloser + multi io.Reader +} + +func (b concatHTTPBody) Read(bb []byte) (int, error) { + return b.multi.Read(bb) +} + +func (b concatHTTPBody) Close() error { + return b.orig.Close() +} + +func httpResponseMIMEType(res *http.Response) (string, error) { + if t := res.Header.Get("Content-Type"); t != "" { + return t, nil + } + + // content type header not provided, do mime type sniffing. + // http.DetectContentType only requires up to the first 512 bytes of the + // body, according to its documentation, so we pull that off. + head := new(bytes.Buffer) + _, err := io.CopyN(head, res.Body, 512) + if err != nil && !errors.Is(err, io.EOF) { + return "", fmt.Errorf("reading head of response body: %w", err) + } + + mimeType := http.DetectContentType(head.Bytes()) + + // since some of the body has been read off the original reader, we have to + // re-concatenate that portion to the beginning of the stream. 
+ res.Body = concatHTTPBody{ + orig: res.Body, + multi: io.MultiReader(head, res.Body), + } + + return mimeType, nil +} + +func (c *client) getHTTP( + ctx context.Context, url URL, redirectDepth int, +) ( + string, io.ReadCloser, error, +) { + req, err := http.NewRequestWithContext(ctx, "GET", string(url), nil) + if err != nil { + return "", nil, fmt.Errorf("building request: %w", err) + } + + res, err := c.opts.HTTPClient.Do(req) + if err != nil { + return "", nil, fmt.Errorf("performing request: %w", err) + } + + mimeType, err := httpResponseMIMEType(res) + if err != nil { + res.Body.Close() + return "", nil, fmt.Errorf("determining response MIME type: %w", err) + } + + statusCodeCategory := res.StatusCode / 100 + + switch { + case statusCodeCategory == 1: // informational + defer res.Body.Close() + return "", emptyReadCloser(), nil + + case statusCodeCategory == 2: // success + return mimeType, res.Body, nil + + // redirects + case res.StatusCode == 301, + res.StatusCode == 302, + res.StatusCode == 307, + res.StatusCode == 308: + defer res.Body.Close() + + loc, err := res.Location() + if err != nil { + return "", nil, fmt.Errorf( + "getting Location header of response with code %v", res.StatusCode, + ) + } + + locURL, err := ParseURL(loc.String()) + if err != nil { + return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err) + } + + newURL := url.ResolveReference(locURL) + return c.get(ctx, newURL, redirectDepth+1) + + case statusCodeCategory == 3: // unsupported redirections + defer res.Body.Close() + return "", emptyReadCloser(), nil + + // all other response codes, 4xx and 5xx, are considered errors + default: + defer res.Body.Close() + return "", nil, fmt.Errorf( + "response code %d (%v)", res.StatusCode, res.Status, + ) + } +} + func (c *client) get( ctx context.Context, url URL, redirectDepth int, ) ( @@ -127,6 +250,8 @@ func (c *client) get( switch scheme { case "gemini": return c.getGemini(ctx, url, redirectDepth) + case "http", "https": + 
return c.getHTTP(ctx, url, redirectDepth) default: return "", nil, fmt.Errorf("unsupported scheme %q", scheme) } diff --git a/parser.go b/parser.go index 8c2a9c3..6293060 100644 --- a/parser.go +++ b/parser.go @@ -7,6 +7,8 @@ import ( "io" "strings" "sync" + + "golang.org/x/net/html" ) // Parser is a thread-safe type which can parse URLs out of the body of a file, @@ -32,7 +34,7 @@ type parser struct { // // Supported MIME types: // - text/gemtext -// - text/html (TODO) +// - text/html // - application/rss+xml (TODO) // - application/atom+xml (TODO) func NewParser() Parser { @@ -77,8 +79,59 @@ func parseGemtext(body *bufio.Reader) ([]URL, error) { return urls, errors.Join(errs...) } +// parses a URL string out of a token, if there is one. Assumes that Token is a +// StartTagToken. +func parseHTMLToken(tok html.Token) string { + for _, attr := range tok.Attr { + switch attr.Key { + case "href": + switch tok.Data { + case "a", "link": + return attr.Val + } + + case "src": + switch tok.Data { + case "img", "script": + return attr.Val + } + } + } + return "" +} + +func parseHTML(body *bufio.Reader) ([]URL, error) { + var ( + t = html.NewTokenizer(body) + urls []URL + errs []error + ) + + for { + switch t.Next() { + case html.ErrorToken: + if errors.Is(t.Err(), io.EOF) { + return urls, nil + } + return urls, errors.Join(append(errs, t.Err())...) 
+ + case html.StartTagToken, html.SelfClosingTagToken: + tok := t.Token() + if urlStr := parseHTMLToken(tok); urlStr != "" { + url, err := ParseURL(urlStr) + if err != nil { + errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err)) + continue + } + urls = append(urls, url) + } + } + } +} + var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){ "text/gemini": parseGemtext, + "text/html": parseHTML, } func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) { diff --git a/parser_test.go b/parser_test.go index eec6b5d..f8e0377 100644 --- a/parser_test.go +++ b/parser_test.go @@ -83,6 +83,17 @@ ok here's some text `parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`, }, }, + { + "text/html", + ` + OHAI + + + + `, + []URL{"foo.com", "/bar", "/bar/baz", "style.css"}, + nil, + }, } for i := range tests {