Added HTTP(s)/HTML support

This commit is contained in:
Brian Picciano 2023-12-30 11:22:09 +01:00
parent 4d2c62a472
commit 2124fd4639
3 changed files with 192 additions and 3 deletions

129
client.go
View File

@ -1,10 +1,12 @@
package deadlinks
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"net/http"
"git.sr.ht/~adnano/go-gemini"
)
@ -27,6 +29,13 @@ type ClientOpts struct {
Do(context.Context, *gemini.Request) (*gemini.Response, error)
}
// HTTPClient will be used for retrieving resources via the http protocol.
//
// Defaults to `new(http.Client)`.
HTTPClient interface {
Do(*http.Request) (*http.Response, error)
}
// MaxRedirects indicates the maximum number of redirects which will be
// allowed when resolving a resource. A negative value indicates no
// redirects are allowed.
@ -44,6 +53,10 @@ func (o *ClientOpts) withDefaults() *ClientOpts {
o.GeminiClient = new(gemini.Client)
}
if o.HTTPClient == nil {
o.HTTPClient = new(http.Client)
}
if o.MaxRedirects == 0 {
o.MaxRedirects = 10
}
@ -61,11 +74,15 @@ type client struct {
//
// Supported URL schemes:
// - gemini
// - http/https (TODO)
// - http/https
func NewClient(opts *ClientOpts) Client {
	// Fill in defaults first, then copy the resulting options by value into
	// the client.
	resolved := opts.withDefaults()
	return &client{*resolved}
}
// emptyReadCloser returns an io.ReadCloser which holds no data: the first Read
// reports EOF, and Close does nothing.
func emptyReadCloser() io.ReadCloser {
	var empty bytes.Buffer
	return io.NopCloser(&empty)
}
func (c *client) getGemini(
ctx context.Context, url URL, redirectDepth int,
) (
@ -90,7 +107,8 @@ func (c *client) getGemini(
case 1: // input required
// Assume that input required is fine, even though we don't know the
// MIME type.
return "", res.Body, nil
defer res.Body.Close()
return "", emptyReadCloser(), nil
case 2: // success
return res.Meta, res.Body, nil
@ -117,6 +135,111 @@ func (c *client) getGemini(
}
}
// concatHTTPBody is an io.ReadCloser whose reads and closes are served by two
// different values: reads come from multi, while closing is forwarded to orig.
type concatHTTPBody struct {
	// orig is only ever closed, never read from directly by this type.
	orig io.ReadCloser

	// multi is the source of all data returned from Read.
	multi io.Reader
}

// Read implements io.Reader by delegating to the multi field.
func (body concatHTTPBody) Read(p []byte) (int, error) {
	n, err := body.multi.Read(p)
	return n, err
}

// Close implements io.Closer by delegating to the orig field.
func (body concatHTTPBody) Close() error {
	err := body.orig.Close()
	return err
}
// httpResponseMIMEType reports the MIME type of the given response.
//
// If the Content-Type header is set then its value is returned verbatim.
// Otherwise the type is sniffed from the beginning of the body using
// http.DetectContentType, and res.Body is replaced with a wrapper which still
// yields the complete body, so callers can consume it from the start.
func httpResponseMIMEType(res *http.Response) (string, error) {
	contentType := res.Header.Get("Content-Type")
	if contentType != "" {
		return contentType, nil
	}

	// No Content-Type header was provided, so fall back to sniffing.
	// http.DetectContentType documents that it considers at most the first 512
	// bytes of data, so no more than that is pulled off of the body.
	const sniffLen = 512

	var sniffed bytes.Buffer
	if _, err := io.CopyN(&sniffed, res.Body, sniffLen); err != nil {
		// io.EOF only means that the body is shorter than sniffLen.
		if !errors.Is(err, io.EOF) {
			return "", fmt.Errorf("reading head of response body: %w", err)
		}
	}

	detected := http.DetectContentType(sniffed.Bytes())

	// The sniffed bytes have been consumed from the original body, so they are
	// re-concatenated in front of whatever remains of it. Closing is still
	// forwarded to the original body.
	remaining := res.Body
	res.Body = concatHTTPBody{
		orig:  remaining,
		multi: io.MultiReader(&sniffed, remaining),
	}

	return detected, nil
}
// getHTTP performs a GET request for url using the configured HTTPClient and
// returns the MIME type and body of the response.
//
// The outcome depends on the response's status code:
//   - 2xx: the MIME type (as determined by httpResponseMIMEType) and the
//     still-open response body are returned. The caller must close the body.
//   - 301, 302, 307, 308: the Location header is parsed, combined with url via
//     URL.ResolveReference, and fetched through c.get with redirectDepth+1.
//   - 1xx and all other 3xx: an empty MIME type and an empty body are
//     returned, with no error.
//   - everything else: an error describing the status is returned.
//
// The response body is closed by this method in every case except 2xx.
func (c *client) getHTTP(
	ctx context.Context, url URL, redirectDepth int,
) (
	string, io.ReadCloser, error,
) {
	req, err := http.NewRequestWithContext(
		ctx, http.MethodGet, string(url), nil,
	)
	if err != nil {
		return "", nil, fmt.Errorf("building request: %w", err)
	}

	res, err := c.opts.HTTPClient.Do(req)
	if err != nil {
		return "", nil, fmt.Errorf("performing request: %w", err)
	}

	statusCodeCategory := res.StatusCode / 100

	if statusCodeCategory == 2 { // success
		// The MIME type is only needed for successful responses, so the body
		// is only ever sniffed here. This keeps a failure to read the body of
		// e.g. a 404 from masking the actual status error.
		mimeType, err := httpResponseMIMEType(res)
		if err != nil {
			res.Body.Close()
			return "", nil, fmt.Errorf("determining response MIME type: %w", err)
		}

		return mimeType, res.Body, nil
	}

	// No other outcome uses the body, so it is released right away rather than
	// being held open while a redirect is followed. Headers remain readable
	// after the body has been closed.
	res.Body.Close()

	switch {
	case statusCodeCategory == 1: // informational
		return "", emptyReadCloser(), nil

	// redirects
	case res.StatusCode == 301,
		res.StatusCode == 302,
		res.StatusCode == 307,
		res.StatusCode == 308:

		loc, err := res.Location()
		if err != nil {
			return "", nil, fmt.Errorf(
				"getting Location header of response with code %v: %w",
				res.StatusCode, err,
			)
		}

		locURL, err := ParseURL(loc.String())
		if err != nil {
			return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err)
		}

		newURL := url.ResolveReference(locURL)
		return c.get(ctx, newURL, redirectDepth+1)

	case statusCodeCategory == 3: // unsupported redirections
		return "", emptyReadCloser(), nil

	// all other response codes, notably 4xx and 5xx, are considered errors
	default:
		return "", nil, fmt.Errorf(
			"response code %d (%v)", res.StatusCode, res.Status,
		)
	}
}
func (c *client) get(
ctx context.Context, url URL, redirectDepth int,
) (
@ -127,6 +250,8 @@ func (c *client) get(
switch scheme {
case "gemini":
return c.getGemini(ctx, url, redirectDepth)
case "http", "https":
return c.getHTTP(ctx, url, redirectDepth)
default:
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
}

View File

@ -7,6 +7,8 @@ import (
"io"
"strings"
"sync"
"golang.org/x/net/html"
)
// Parser is a thread-safe type which can parse URLs out of the body of a file,
@ -32,7 +34,7 @@ type parser struct {
//
// Supported MIME types:
// - text/gemini
// - text/html (TODO)
// - text/html
// - application/rss+xml (TODO)
// - application/atom+xml (TODO)
func NewParser() Parser {
@ -77,8 +79,59 @@ func parseGemtext(body *bufio.Reader) ([]URL, error) {
return urls, errors.Join(errs...)
}
// parseHTMLToken returns the URL string carried by the given token, or an
// empty string if there is none. The token is assumed to be a StartTagToken.
//
// Only the following tag/attribute pairs are considered:
//   - href on "a" and "link" tags
//   - src on "img" and "script" tags
func parseHTMLToken(tok html.Token) string {
	// Each recognized tag has exactly one attribute which holds its URL, so
	// that attribute's key is determined up-front.
	var urlAttrKey string
	switch tok.Data {
	case "a", "link":
		urlAttrKey = "href"
	case "img", "script":
		urlAttrKey = "src"
	default:
		return ""
	}

	for _, attr := range tok.Attr {
		if attr.Key == urlAttrKey {
			return attr.Val
		}
	}

	return ""
}
// parseHTML tokenizes body as HTML and returns every URL which parseHTMLToken
// finds on a start tag or self-closing tag.
//
// A URL string which fails ParseURL does not stop parsing; its error is
// collected and parsing continues with the next token. All collected errors,
// plus any tokenizer error other than io.EOF, are joined into the returned
// error. The URLs found up to that point are returned alongside it.
func parseHTML(body *bufio.Reader) ([]URL, error) {
	var (
		t    = html.NewTokenizer(body)
		urls []URL
		errs []error
	)

	for {
		switch t.Next() {
		case html.ErrorToken:
			// io.EOF marks the normal end of the document. Anything else is a
			// real failure and gets reported with the rest of the errors.
			if err := t.Err(); !errors.Is(err, io.EOF) {
				errs = append(errs, err)
			}

			// errors.Join returns nil when errs is empty. The per-URL errors
			// must be returned in the io.EOF case too, otherwise they are
			// silently dropped.
			return urls, errors.Join(errs...)

		case html.StartTagToken, html.SelfClosingTagToken:
			tok := t.Token()

			urlStr := parseHTMLToken(tok)
			if urlStr == "" {
				continue
			}

			u, err := ParseURL(urlStr)
			if err != nil {
				errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
				continue
			}

			urls = append(urls, u)
		}
	}
}
// parsersByMimeType maps a MIME type to the function which extracts URLs from
// a body of that type.
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
"text/gemini": parseGemtext,
"text/html": parseHTML,
}
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {

View File

@ -83,6 +83,17 @@ ok here's some text
`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
},
},
{
"text/html",
`
<a href="foo.com">OHAI</a>
<img src="/bar">
<img src="/bar/baz" />
<link rel="stylesheet" href="style.css" />
`,
[]URL{"foo.com", "/bar", "/bar/baz", "style.css"},
nil,
},
}
for i := range tests {