diff --git a/client.go b/client.go
index 068d985..7c1ff75 100644
--- a/client.go
+++ b/client.go
@@ -1,10 +1,12 @@
package deadlinks
import (
+ "bytes"
"context"
"errors"
"fmt"
"io"
+ "net/http"
"git.sr.ht/~adnano/go-gemini"
)
@@ -27,6 +29,13 @@ type ClientOpts struct {
Do(context.Context, *gemini.Request) (*gemini.Response, error)
}
+ // HTTPClient will be used for retrieving resources via the http protocol.
+ //
+ // Defaults to `new(http.Client)`.
+ HTTPClient interface {
+ Do(*http.Request) (*http.Response, error)
+ }
+
// MaxRedirects indicates the maximum number of redirects which will be
// allowed when resolving a resource. A negative value indicates no
// redirects are allowed.
@@ -44,6 +53,10 @@ func (o *ClientOpts) withDefaults() *ClientOpts {
o.GeminiClient = new(gemini.Client)
}
+ if o.HTTPClient == nil {
+ o.HTTPClient = new(http.Client)
+ }
+
if o.MaxRedirects == 0 {
o.MaxRedirects = 10
}
@@ -61,11 +74,15 @@ type client struct {
//
// Supported URL schemas:
// - gemini
-// - http/https (TODO)
+// - http/https
func NewClient(opts *ClientOpts) Client {
return &client{*opts.withDefaults()}
}
+func emptyReadCloser() io.ReadCloser {
+ return io.NopCloser(new(bytes.Buffer))
+}
+
func (c *client) getGemini(
ctx context.Context, url URL, redirectDepth int,
) (
@@ -90,7 +107,8 @@ func (c *client) getGemini(
case 1: // input required
// Assume that input required is fine, even though we don't know the
// MIME type.
- return "", res.Body, nil
+ defer res.Body.Close()
+ return "", emptyReadCloser(), nil
case 2: // success
return res.Meta, res.Body, nil
@@ -117,6 +135,111 @@ func (c *client) getGemini(
}
}
+// concatHTTPBody is an io.ReadCloser whose two operations are served by
+// different values: Read pulls from multi, while Close is forwarded to orig.
+type concatHTTPBody struct {
+	orig  io.ReadCloser // receives Close calls
+	multi io.Reader     // source of all Read calls
+}
+
+// Read fills p from the multi reader.
+func (body concatHTTPBody) Read(p []byte) (n int, err error) {
+	n, err = body.multi.Read(p)
+	return n, err
+}
+
+// Close closes the orig ReadCloser and reports its result.
+func (body concatHTTPBody) Close() error {
+	err := body.orig.Close()
+	return err
+}
+
+func httpResponseMIMEType(res *http.Response) (string, error) {
+ if t := res.Header.Get("Content-Type"); t != "" {
+ return t, nil
+ }
+
+ // content type header not provided, do mime type sniffing.
+ // http.DetectContentType only requires up to the first 512 bytes of the
+ // body, according to its documentation, so we pull that off.
+ head := new(bytes.Buffer)
+ _, err := io.CopyN(head, res.Body, 512)
+ if err != nil && !errors.Is(err, io.EOF) {
+ return "", fmt.Errorf("reading head of response body: %w", err)
+ }
+
+ mimeType := http.DetectContentType(head.Bytes())
+
+ // since some of the body has been read off the original reader, we have to
+ // re-concattenate that portion to the beginning of the stream.
+ res.Body = concatHTTPBody{
+ orig: res.Body,
+ multi: io.MultiReader(head, res.Body),
+ }
+
+ return mimeType, nil
+}
+
+// getHTTP fetches url with a GET request through the configured HTTPClient and
+// returns the MIME type and body of the resource.
+//
+// Outcomes by response status:
+//   - 2xx: the MIME type (from httpResponseMIMEType) and the still-open
+//     response body are returned.
+//   - 301, 302, 307, 308: the Location header is resolved against url and
+//     fetched via c.get with redirectDepth incremented.
+//   - 1xx and any other 3xx: the response body is closed, and an empty MIME
+//     type and an empty body are returned with no error.
+//   - anything else: the response body is closed and an error is returned.
+func (c *client) getHTTP(
+	ctx context.Context, url URL, redirectDepth int,
+) (
+	string, io.ReadCloser, error,
+) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, string(url), nil)
+	if err != nil {
+		return "", nil, fmt.Errorf("building request: %w", err)
+	}
+
+	res, err := c.opts.HTTPClient.Do(req)
+	if err != nil {
+		return "", nil, fmt.Errorf("performing request: %w", err)
+	}
+
+	mimeType, err := httpResponseMIMEType(res)
+	if err != nil {
+		res.Body.Close()
+		return "", nil, fmt.Errorf("determining response MIME type: %w", err)
+	}
+
+	statusCodeCategory := res.StatusCode / 100
+
+	switch {
+	case statusCodeCategory == 1: // informational
+		defer res.Body.Close()
+		return "", emptyReadCloser(), nil
+
+	case statusCodeCategory == 2: // success
+		return mimeType, res.Body, nil
+
+	// redirects
+	case res.StatusCode == 301,
+		res.StatusCode == 302,
+		res.StatusCode == 307,
+		res.StatusCode == 308:
+		defer res.Body.Close()
+
+		loc, err := res.Location()
+		if err != nil {
+			// Wrap the underlying error so the caller can tell why the
+			// redirect target could not be determined.
+			return "", nil, fmt.Errorf(
+				"getting Location header of response with code %v: %w",
+				res.StatusCode, err,
+			)
+		}
+
+		locURL, err := ParseURL(loc.String())
+		if err != nil {
+			return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err)
+		}
+
+		newURL := url.ResolveReference(locURL)
+		return c.get(ctx, newURL, redirectDepth+1)
+
+	case statusCodeCategory == 3: // unsupported redirections
+		defer res.Body.Close()
+		return "", emptyReadCloser(), nil
+
+	// all other response codes, 4xx and 5xx, are considered errors
+	default:
+		defer res.Body.Close()
+		return "", nil, fmt.Errorf(
+			"response code %d (%v)", res.StatusCode, res.Status,
+		)
+	}
+}
+
func (c *client) get(
ctx context.Context, url URL, redirectDepth int,
) (
@@ -127,6 +250,8 @@ func (c *client) get(
switch scheme {
case "gemini":
return c.getGemini(ctx, url, redirectDepth)
+ case "http", "https":
+ return c.getHTTP(ctx, url, redirectDepth)
default:
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
}
diff --git a/parser.go b/parser.go
index 8c2a9c3..6293060 100644
--- a/parser.go
+++ b/parser.go
@@ -7,6 +7,8 @@ import (
"io"
"strings"
"sync"
+
+ "golang.org/x/net/html"
)
// Parser is a thread-safe type which can parse URLs out of the body of a file,
@@ -32,7 +34,7 @@ type parser struct {
//
// Supported MIME types:
// - text/gemtext
-// - text/html (TODO)
+// - text/html
// - application/rss+xml (TODO)
// - application/atom+xml (TODO)
func NewParser() Parser {
@@ -77,8 +79,59 @@ func parseGemtext(body *bufio.Reader) ([]URL, error) {
return urls, errors.Join(errs...)
}
+// parses a URL string out of a token, if there is one. Assumes that Token is a
+// StartTagToken.
+func parseHTMLToken(tok html.Token) string {
+ for _, attr := range tok.Attr {
+ switch attr.Key {
+ case "href":
+ switch tok.Data {
+ case "a", "link":
+ return attr.Val
+ }
+
+ case "src":
+ switch tok.Data {
+ case "img", "script":
+ return attr.Val
+ }
+ }
+ }
+ return ""
+}
+
+// parseHTML tokenizes body as HTML and collects the URL referenced by every
+// start or self-closing tag which parseHTMLToken recognizes.
+//
+// URLs which fail to parse are skipped. Their errors, together with any
+// tokenizer error other than io.EOF, are joined into the returned error
+// alongside the URLs which were parsed successfully.
+func parseHTML(body *bufio.Reader) ([]URL, error) {
+	var (
+		t    = html.NewTokenizer(body)
+		urls []URL
+		errs []error
+	)
+
+	for {
+		switch t.Next() {
+		case html.ErrorToken:
+			// io.EOF only marks the end of the document. Per-URL errors
+			// gathered so far must still be reported in that case.
+			if err := t.Err(); !errors.Is(err, io.EOF) {
+				errs = append(errs, err)
+			}
+			return urls, errors.Join(errs...)
+
+		case html.StartTagToken, html.SelfClosingTagToken:
+			tok := t.Token()
+			urlStr := parseHTMLToken(tok)
+			if urlStr == "" {
+				continue
+			}
+
+			url, err := ParseURL(urlStr)
+			if err != nil {
+				errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
+				continue
+			}
+			urls = append(urls, url)
+		}
+	}
+}
+
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
"text/gemini": parseGemtext,
+ "text/html": parseHTML,
}
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
diff --git a/parser_test.go b/parser_test.go
index eec6b5d..f8e0377 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -83,6 +83,17 @@ ok here's some text
`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
},
},
+ {
+ "text/html",
+ `
+ OHAI
+
+
+
+ `,
+ []URL{"foo.com", "/bar", "/bar/baz", "style.css"},
+ nil,
+ },
}
for i := range tests {