From 2124fd4639874f11e93c8e3d3f402750808d66ba Mon Sep 17 00:00:00 2001
From: Brian Picciano <me@mediocregopher.com>
Date: Sat, 30 Dec 2023 11:22:09 +0100
Subject: [PATCH] Added HTTP(s)/HTML support

---
 client.go      | 129 ++++++++++++++++++++++++++++++++++++++++++++++++-
 parser.go      |  55 ++++++++++++++++++++-
 parser_test.go |  11 +++++
 3 files changed, 192 insertions(+), 3 deletions(-)

diff --git a/client.go b/client.go
index 068d985..7c1ff75 100644
--- a/client.go
+++ b/client.go
@@ -1,10 +1,12 @@
 package deadlinks
 
 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
 	"io"
+	"net/http"
 
 	"git.sr.ht/~adnano/go-gemini"
 )
@@ -27,6 +29,13 @@ type ClientOpts struct {
 		Do(context.Context, *gemini.Request) (*gemini.Response, error)
 	}
 
+	// HTTPClient is used to retrieve resources via the http and https protocols.
+	//
+	// Defaults to `new(http.Client)`.
+	HTTPClient interface {
+		Do(*http.Request) (*http.Response, error)
+	}
+
 	// MaxRedirects indicates the maximum number of redirects which will be
 	// allowed when resolving a resource. A negative value indicates no
 	// redirects are allowed.
@@ -44,6 +53,10 @@ func (o *ClientOpts) withDefaults() *ClientOpts {
 		o.GeminiClient = new(gemini.Client)
 	}
 
+	if o.HTTPClient == nil {
+		o.HTTPClient = new(http.Client)
+	}
+
 	if o.MaxRedirects == 0 {
 		o.MaxRedirects = 10
 	}
@@ -61,11 +74,15 @@ type client struct {
 //
 // Supported URL schemas:
 // - gemini
-// - http/https (TODO)
+// - http/https
 func NewClient(opts *ClientOpts) Client {
 	return &client{*opts.withDefaults()}
 }
 
+// emptyReadCloser returns a body with no content: reads hit io.EOF right away
+// and Close does nothing.
+func emptyReadCloser() io.ReadCloser { return io.NopCloser(new(bytes.Buffer)) }
+
 func (c *client) getGemini(
 	ctx context.Context, url URL, redirectDepth int,
 ) (
@@ -90,7 +107,8 @@ func (c *client) getGemini(
 	case 1: // input required
 		// Assume that input required is fine, even though we don't know the
 		// MIME type.
-		return "", res.Body, nil
+		defer res.Body.Close()
+		return "", emptyReadCloser(), nil
 
 	case 2: // success
 		return res.Meta, res.Body, nil
@@ -117,6 +135,111 @@ func (c *client) getGemini(
 	}
 }
 
+// concatHTTPBody is an io.ReadCloser whose reads are served by multi while
+// Close is forwarded to orig, the body that was originally on the response.
+type concatHTTPBody struct {
+	orig  io.ReadCloser
+	multi io.Reader
+}
+
+// Read implements io.Reader by delegating to multi.
+func (b concatHTTPBody) Read(p []byte) (int, error) { return b.multi.Read(p) }
+
+// Close implements io.Closer by closing orig.
+func (b concatHTTPBody) Close() error { return b.orig.Close() }
+
+// httpResponseMIMEType reports the MIME type of res. The Content-Type header
+// is returned verbatim when present; otherwise the type is sniffed from the
+// body, and res.Body is replaced so that the sniffed bytes can be read again.
+func httpResponseMIMEType(res *http.Response) (string, error) {
+	if contentType := res.Header.Get("Content-Type"); contentType != "" {
+		return contentType, nil
+	}
+
+	// http.DetectContentType considers at most the first 512 bytes, so only
+	// that much is consumed. A body shorter than that is not an error.
+	prefix, err := io.ReadAll(io.LimitReader(res.Body, 512))
+	if err != nil && !errors.Is(err, io.EOF) {
+		return "", fmt.Errorf("reading head of response body: %w", err)
+	}
+
+	// Stitch the consumed prefix back in front of the remaining stream, while
+	// keeping Close wired to the original body.
+	body := res.Body
+	res.Body = concatHTTPBody{
+		orig:  body,
+		multi: io.MultiReader(bytes.NewReader(prefix), body),
+	}
+
+	return http.DetectContentType(prefix), nil
+}
+
+// getHTTP performs a GET request for url and returns the MIME type and body of
+// the response. 1xx responses and 3xx responses other than 301, 302, 307 and
+// 308 yield an empty MIME type and an empty body; those four redirects are
+// followed via c.get, and all remaining non-2xx codes are returned as errors.
+func (c *client) getHTTP(
+	ctx context.Context, url URL, redirectDepth int,
+) (
+	string, io.ReadCloser, error,
+) {
+	req, err := http.NewRequestWithContext(ctx, "GET", string(url), nil)
+	if err != nil {
+		return "", nil, fmt.Errorf("building request: %w", err)
+	}
+
+	res, err := c.opts.HTTPClient.Do(req)
+	if err != nil {
+		return "", nil, fmt.Errorf("performing request: %w", err)
+	}
+
+	mimeType, err := httpResponseMIMEType(res)
+	if err != nil {
+		res.Body.Close()
+		return "", nil, fmt.Errorf("determining response MIME type: %w", err)
+	}
+
+	statusCodeCategory := res.StatusCode / 100
+
+	switch {
+	case statusCodeCategory == 1: // informational
+		defer res.Body.Close()
+		return "", emptyReadCloser(), nil
+
+	case statusCodeCategory == 2: // success
+		return mimeType, res.Body, nil
+
+	// redirects
+	case res.StatusCode == 301, res.StatusCode == 302,
+		res.StatusCode == 307, res.StatusCode == 308:
+		defer res.Body.Close()
+
+		loc, err := res.Location()
+		if err != nil {
+			return "", nil, fmt.Errorf(
+				"getting Location header of response with code %v: %w",
+				res.StatusCode, err,
+			)
+		}
+
+		locURL, err := ParseURL(loc.String())
+		if err != nil {
+			return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err)
+		}
+
+		newURL := url.ResolveReference(locURL)
+		return c.get(ctx, newURL, redirectDepth+1)
+
+	case statusCodeCategory == 3: // unsupported redirections
+		defer res.Body.Close()
+		return "", emptyReadCloser(), nil
+
+	default: // all other response codes, 4xx and 5xx, are considered errors
+		defer res.Body.Close()
+		return "", nil, fmt.Errorf("response code %d (%v)", res.StatusCode, res.Status)
+	}
+}
+
 func (c *client) get(
 	ctx context.Context, url URL, redirectDepth int,
 ) (
@@ -127,6 +250,8 @@ func (c *client) get(
 	switch scheme {
 	case "gemini":
 		return c.getGemini(ctx, url, redirectDepth)
+	case "http", "https":
+		return c.getHTTP(ctx, url, redirectDepth)
 	default:
 		return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
 	}
diff --git a/parser.go b/parser.go
index 8c2a9c3..6293060 100644
--- a/parser.go
+++ b/parser.go
@@ -7,6 +7,8 @@ import (
 	"io"
 	"strings"
 	"sync"
+
+	"golang.org/x/net/html"
 )
 
 // Parser is a thread-safe type which can parse URLs out of the body of a file,
@@ -32,7 +34,7 @@ type parser struct {
 //
 // Supported MIME types:
 // - text/gemtext
-// - text/html (TODO)
+// - text/html
 // - application/rss+xml (TODO)
 // - application/atom+xml (TODO)
 func NewParser() Parser {
@@ -77,8 +79,59 @@ func parseGemtext(body *bufio.Reader) ([]URL, error) {
 	return urls, errors.Join(errs...)
 }
 
+// parseHTMLToken returns the URL string referenced by a start or self-closing
+// tag token, or "" if the token does not carry one.
+func parseHTMLToken(tok html.Token) string {
+	var key string // attribute which holds the URL for this element
+	switch tok.Data {
+	case "a", "link":
+		key = "href"
+	case "img", "script":
+		key = "src"
+	default:
+		return ""
+	}
+
+	for _, attr := range tok.Attr {
+		if attr.Key == key {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// parseHTML collects the URLs referenced by the tags of an HTML document. A
+// URL which fails to parse is skipped and reported in the returned error.
+func parseHTML(body *bufio.Reader) ([]URL, error) {
+	t := html.NewTokenizer(body)
+	var urls []URL
+	var errs []error
+
+	for {
+		switch t.Next() {
+		case html.ErrorToken:
+			// io.EOF is the normal end of input; URL errors are still reported.
+			if err := t.Err(); !errors.Is(err, io.EOF) {
+				errs = append(errs, err)
+			}
+			return urls, errors.Join(errs...)
+
+		case html.StartTagToken, html.SelfClosingTagToken:
+			tok := t.Token()
+			if urlStr := parseHTMLToken(tok); urlStr != "" {
+				if url, err := ParseURL(urlStr); err != nil {
+					errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
+				} else {
+					urls = append(urls, url)
+				}
+			}
+		}
+	}
+}
+
 var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
 	"text/gemini": parseGemtext,
+	"text/html":   parseHTML,
 }
 
 func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
diff --git a/parser_test.go b/parser_test.go
index eec6b5d..f8e0377 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -83,6 +83,17 @@ ok here's some text
 				`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
 			},
 		},
+		{
+			"text/html",
+			`
+				<a href="foo.com">OHAI</a>
+				<img src="/bar">
+				<img src="/bar/baz" />
+				<link rel="stylesheet" href="style.css" />
+			`,
+			[]URL{"foo.com", "/bar", "/bar/baz", "style.css"},
+			nil,
+		},
 	}
 
 	for i := range tests {