Added HTTP(s)/HTML support
This commit is contained in:
parent
4d2c62a472
commit
2124fd4639
129
client.go
129
client.go
@ -1,10 +1,12 @@
|
||||
package deadlinks
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
|
||||
"git.sr.ht/~adnano/go-gemini"
|
||||
)
|
||||
@ -27,6 +29,13 @@ type ClientOpts struct {
|
||||
Do(context.Context, *gemini.Request) (*gemini.Response, error)
|
||||
}
|
||||
|
||||
// HTTPClient will be used for retrieving resources via the http protocol.
|
||||
//
|
||||
// Defaults to `new(http.Client)`.
|
||||
HTTPClient interface {
|
||||
Do(*http.Request) (*http.Response, error)
|
||||
}
|
||||
|
||||
// MaxRedirects indicates the maximum number of redirects which will be
|
||||
// allowed when resolving a resource. A negative value indicates no
|
||||
// redirects are allowed.
|
||||
@ -44,6 +53,10 @@ func (o *ClientOpts) withDefaults() *ClientOpts {
|
||||
o.GeminiClient = new(gemini.Client)
|
||||
}
|
||||
|
||||
if o.HTTPClient == nil {
|
||||
o.HTTPClient = new(http.Client)
|
||||
}
|
||||
|
||||
if o.MaxRedirects == 0 {
|
||||
o.MaxRedirects = 10
|
||||
}
|
||||
@ -61,11 +74,15 @@ type client struct {
|
||||
//
|
||||
// Supported URL schemas:
|
||||
// - gemini
|
||||
// - http/https (TODO)
|
||||
// - http/https
|
||||
func NewClient(opts *ClientOpts) Client {
|
||||
return &client{*opts.withDefaults()}
|
||||
}
|
||||
|
||||
// emptyReadCloser returns an io.ReadCloser which contains no data and whose
// Close method does nothing. It is used as a stand-in body for responses
// whose real body is not handed back to the caller.
func emptyReadCloser() io.ReadCloser {
	var empty bytes.Buffer
	return io.NopCloser(&empty)
}
|
||||
|
||||
func (c *client) getGemini(
|
||||
ctx context.Context, url URL, redirectDepth int,
|
||||
) (
|
||||
@ -90,7 +107,8 @@ func (c *client) getGemini(
|
||||
case 1: // input required
|
||||
// Assume that input required is fine, even though we don't know the
|
||||
// MIME type.
|
||||
return "", res.Body, nil
|
||||
defer res.Body.Close()
|
||||
return "", emptyReadCloser(), nil
|
||||
|
||||
case 2: // success
|
||||
return res.Meta, res.Body, nil
|
||||
@ -117,6 +135,111 @@ func (c *client) getGemini(
|
||||
}
|
||||
}
|
||||
|
||||
// concatHTTPBody is an io.ReadCloser which serves reads from one reader
// (multi) while forwarding Close to a separate, original body (orig).
type concatHTTPBody struct {
	// orig is the body which gets closed when Close is called.
	orig io.ReadCloser

	// multi is the reader which all Read calls are served from.
	multi io.Reader
}

// Compile-time check that concatHTTPBody can be used as a response body.
var _ io.ReadCloser = concatHTTPBody{}

// Read implements io.Reader by delegating to the multi reader.
func (body concatHTTPBody) Read(p []byte) (int, error) {
	n, err := body.multi.Read(p)
	return n, err
}

// Close implements io.Closer by closing the original body.
func (body concatHTTPBody) Close() error {
	err := body.orig.Close()
	return err
}
|
||||
|
||||
func httpResponseMIMEType(res *http.Response) (string, error) {
|
||||
if t := res.Header.Get("Content-Type"); t != "" {
|
||||
return t, nil
|
||||
}
|
||||
|
||||
// content type header not provided, do mime type sniffing.
|
||||
// http.DetectContentType only requires up to the first 512 bytes of the
|
||||
// body, according to its documentation, so we pull that off.
|
||||
head := new(bytes.Buffer)
|
||||
_, err := io.CopyN(head, res.Body, 512)
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return "", fmt.Errorf("reading head of response body: %w", err)
|
||||
}
|
||||
|
||||
mimeType := http.DetectContentType(head.Bytes())
|
||||
|
||||
// since some of the body has been read off the original reader, we have to
|
||||
// re-concattenate that portion to the beginning of the stream.
|
||||
res.Body = concatHTTPBody{
|
||||
orig: res.Body,
|
||||
multi: io.MultiReader(head, res.Body),
|
||||
}
|
||||
|
||||
return mimeType, nil
|
||||
}
|
||||
|
||||
func (c *client) getHTTP(
|
||||
ctx context.Context, url URL, redirectDepth int,
|
||||
) (
|
||||
string, io.ReadCloser, error,
|
||||
) {
|
||||
req, err := http.NewRequestWithContext(ctx, "GET", string(url), nil)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("building request: %w", err)
|
||||
}
|
||||
|
||||
res, err := c.opts.HTTPClient.Do(req)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("performing request: %w", err)
|
||||
}
|
||||
|
||||
mimeType, err := httpResponseMIMEType(res)
|
||||
if err != nil {
|
||||
res.Body.Close()
|
||||
return "", nil, fmt.Errorf("determining response MIME type: %w", err)
|
||||
}
|
||||
|
||||
statusCodeCategory := res.StatusCode / 100
|
||||
|
||||
switch {
|
||||
case statusCodeCategory == 1: // informational
|
||||
defer res.Body.Close()
|
||||
return "", emptyReadCloser(), nil
|
||||
|
||||
case statusCodeCategory == 2: // success
|
||||
return mimeType, res.Body, nil
|
||||
|
||||
// redirects
|
||||
case res.StatusCode == 301,
|
||||
res.StatusCode == 302,
|
||||
res.StatusCode == 307,
|
||||
res.StatusCode == 308:
|
||||
defer res.Body.Close()
|
||||
|
||||
loc, err := res.Location()
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf(
|
||||
"getting Location header of response with code %v", res.StatusCode,
|
||||
)
|
||||
}
|
||||
|
||||
locURL, err := ParseURL(loc.String())
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err)
|
||||
}
|
||||
|
||||
newURL := url.ResolveReference(locURL)
|
||||
return c.get(ctx, newURL, redirectDepth+1)
|
||||
|
||||
case statusCodeCategory == 3: // unsupported redirections
|
||||
defer res.Body.Close()
|
||||
return "", emptyReadCloser(), nil
|
||||
|
||||
// all other response codes, 4xx and 5xx, are considered errors
|
||||
default:
|
||||
defer res.Body.Close()
|
||||
return "", nil, fmt.Errorf(
|
||||
"response code %d (%v)", res.StatusCode, res.Status,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *client) get(
|
||||
ctx context.Context, url URL, redirectDepth int,
|
||||
) (
|
||||
@ -127,6 +250,8 @@ func (c *client) get(
|
||||
switch scheme {
|
||||
case "gemini":
|
||||
return c.getGemini(ctx, url, redirectDepth)
|
||||
case "http", "https":
|
||||
return c.getHTTP(ctx, url, redirectDepth)
|
||||
default:
|
||||
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
|
||||
}
|
||||
|
55
parser.go
55
parser.go
@ -7,6 +7,8 @@ import (
|
||||
"io"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
// Parser is a thread-safe type which can parse URLs out of the body of a file,
|
||||
@ -32,7 +34,7 @@ type parser struct {
|
||||
//
|
||||
// Supported MIME types:
|
||||
// - text/gemtext
|
||||
// - text/html (TODO)
|
||||
// - text/html
|
||||
// - application/rss+xml (TODO)
|
||||
// - application/atom+xml (TODO)
|
||||
func NewParser() Parser {
|
||||
@ -77,8 +79,59 @@ func parseGemtext(body *bufio.Reader) ([]URL, error) {
|
||||
return urls, errors.Join(errs...)
|
||||
}
|
||||
|
||||
// parses a URL string out of a token, if there is one. Assumes that Token is a
|
||||
// StartTagToken.
|
||||
func parseHTMLToken(tok html.Token) string {
|
||||
for _, attr := range tok.Attr {
|
||||
switch attr.Key {
|
||||
case "href":
|
||||
switch tok.Data {
|
||||
case "a", "link":
|
||||
return attr.Val
|
||||
}
|
||||
|
||||
case "src":
|
||||
switch tok.Data {
|
||||
case "img", "script":
|
||||
return attr.Val
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func parseHTML(body *bufio.Reader) ([]URL, error) {
|
||||
var (
|
||||
t = html.NewTokenizer(body)
|
||||
urls []URL
|
||||
errs []error
|
||||
)
|
||||
|
||||
for {
|
||||
switch t.Next() {
|
||||
case html.ErrorToken:
|
||||
if errors.Is(t.Err(), io.EOF) {
|
||||
return urls, nil
|
||||
}
|
||||
return urls, errors.Join(append(errs, t.Err())...)
|
||||
|
||||
case html.StartTagToken, html.SelfClosingTagToken:
|
||||
tok := t.Token()
|
||||
if urlStr := parseHTMLToken(tok); urlStr != "" {
|
||||
url, err := ParseURL(urlStr)
|
||||
if err != nil {
|
||||
errs = append(errs, fmt.Errorf("parsing url from token %v: %w", tok, err))
|
||||
continue
|
||||
}
|
||||
urls = append(urls, url)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var parsersByMimeType = map[string]func(*bufio.Reader) ([]URL, error){
|
||||
"text/gemini": parseGemtext,
|
||||
"text/html": parseHTML,
|
||||
}
|
||||
|
||||
func (p *parser) Parse(mimeType string, body io.Reader) ([]URL, error) {
|
||||
|
@ -83,6 +83,17 @@ ok here's some text
|
||||
`parsing URL from line "=> : NO FISH ALLOWED\n": parse ":": missing protocol scheme`,
|
||||
},
|
||||
},
|
||||
{
|
||||
"text/html",
|
||||
`
|
||||
<a href="foo.com">OHAI</a>
|
||||
<img src="/bar">
|
||||
<img src="/bar/baz" />
|
||||
<link rel="stylesheet" href="style.css" />
|
||||
`,
|
||||
[]URL{"foo.com", "/bar", "/bar/baz", "style.css"},
|
||||
nil,
|
||||
},
|
||||
}
|
||||
|
||||
for i := range tests {
|
||||
|
Loading…
Reference in New Issue
Block a user