A tool for crawling and finding links to URLs which no longer exist
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
deadlinks/client.go

289 lines
6.7 KiB

package deadlinks
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"net/http"
"git.sr.ht/~adnano/go-gemini"
)
// Client is a thread-safe type which fetches a resource at the given URL,
// returning its MIME type and body. If the MIME type is not known then empty
// string should be returned.
type Client interface {
Get(context.Context, URL) (string, io.ReadCloser, error)
}
// ClientOpts are optional fields which can be provided to NewClient. A nil
// ClientOpts is equivalent to an empty one.
type ClientOpts struct {
// GeminiClient will be used for retrieving resources via the gemini
// protocol.
//
// Defaults to `new(gemini.Client)`.
GeminiClient interface {
Do(context.Context, *gemini.Request) (*gemini.Response, error)
}
// HTTPClient will be used for retrieving resources via the http protocol.
//
// Defaults to `new(http.Client)`.
HTTPClient interface {
Do(*http.Request) (*http.Response, error)
}
// HTTPUserAgent overwrites the user agent used by the HTTPClient.
//
// Defaults to whatever http.Client uses by default.
HTTPUserAgent string
// MaxRedirects indicates the maximum number of redirects which will be
// allowed when resolving a resource. A negative value indicates no
// redirects are allowed.
//
// Default: 10.
MaxRedirects int
}
func (o *ClientOpts) withDefaults() *ClientOpts {
if o == nil {
o = new(ClientOpts)
}
if o.GeminiClient == nil {
o.GeminiClient = new(gemini.Client)
}
if o.HTTPClient == nil {
o.HTTPClient = &http.Client{
Transport: http.DefaultTransport.(*http.Transport).Clone(),
}
}
if o.MaxRedirects == 0 {
o.MaxRedirects = 10
}
return o
}
type client struct {
opts ClientOpts
}
// NewClient initializes and returns a Client which supports commonly used
// transport protocols. The returned Client will error when it encounters an
// unfamiliar protocol.
//
// Supported URL schemas:
// - gemini
// - http/https
//
// Some schemas automatically return success:
// - mailto
// - data
func NewClient(opts *ClientOpts) Client {
return &client{*opts.withDefaults()}
}
func emptyReadCloser() io.ReadCloser {
return io.NopCloser(new(bytes.Buffer))
}
func (c *client) getGemini(
ctx context.Context, url URL, redirectDepth int,
) (
string, io.ReadCloser, error,
) {
req, err := gemini.NewRequest(string(url))
if err != nil {
return "", nil, fmt.Errorf("building request: %w", err)
}
// TODO allow specifying client cert
res, err := c.opts.GeminiClient.Do(ctx, req)
if err != nil {
return "", nil, fmt.Errorf("performing request: %w", err)
}
// all status numbers are grouped by their first digit, and actions taken
// can be entirely based on that.
switch res.Status / 10 {
case 1: // input required
// Assume that input required is fine, even though we don't know the
// MIME type.
defer res.Body.Close()
return "", emptyReadCloser(), nil
case 2: // success
return res.Meta, res.Body, nil
case 3: // redirect
defer res.Body.Close()
if redirectDepth >= c.opts.MaxRedirects {
return "", nil, errors.New("too many redirects")
}
metaURL, err := ParseURL(res.Meta)
if err != nil {
return "", nil, fmt.Errorf("parsing redirect URL %q: %w", res.Meta, err)
}
newURL := url.ResolveReference(metaURL)
return c.get(ctx, newURL, redirectDepth+1)
default:
defer res.Body.Close()
return "", nil, fmt.Errorf(
"response code %d (%v): %q", res.Status, res.Status, res.Meta,
)
}
}
type concatHTTPBody struct {
orig io.ReadCloser
multi io.Reader
}
func (b concatHTTPBody) Read(bb []byte) (int, error) {
return b.multi.Read(bb)
}
func (b concatHTTPBody) Close() error {
return b.orig.Close()
}
func httpResponseMIMEType(res *http.Response) (string, error) {
if t := res.Header.Get("Content-Type"); t != "" {
return t, nil
}
// content type header not provided, do mime type sniffing.
// http.DetectContentType only requires up to the first 512 bytes of the
// body, according to its documentation, so we pull that off.
head := new(bytes.Buffer)
_, err := io.CopyN(head, res.Body, 512)
if err != nil && !errors.Is(err, io.EOF) {
return "", fmt.Errorf("reading head of response body: %w", err)
}
mimeType := http.DetectContentType(head.Bytes())
// since some of the body has been read off the original reader, we have to
// re-concattenate that portion to the beginning of the stream.
res.Body = concatHTTPBody{
orig: res.Body,
multi: io.MultiReader(head, res.Body),
}
return mimeType, nil
}
func (c *client) getHTTP(
ctx context.Context, url URL, redirectDepth int,
) (
string, io.ReadCloser, error,
) {
req, err := http.NewRequestWithContext(ctx, "GET", string(url), nil)
if err != nil {
return "", nil, fmt.Errorf("building request: %w", err)
}
if c.opts.HTTPUserAgent != "" {
req.Header.Set("User-Agent", c.opts.HTTPUserAgent)
}
res, err := c.opts.HTTPClient.Do(req)
if err != nil {
return "", nil, fmt.Errorf("performing request: %w", err)
}
mimeType, err := httpResponseMIMEType(res)
if err != nil {
res.Body.Close()
return "", nil, fmt.Errorf("determining response MIME type: %w", err)
}
statusCodeCategory := res.StatusCode / 100
switch {
case statusCodeCategory == 1: // informational
defer res.Body.Close()
return "", emptyReadCloser(), nil
case statusCodeCategory == 2: // success
return mimeType, res.Body, nil
// redirects
case res.StatusCode == 301,
res.StatusCode == 302,
res.StatusCode == 307,
res.StatusCode == 308:
defer res.Body.Close()
loc, err := res.Location()
if err != nil {
return "", nil, fmt.Errorf(
"getting Location header of response with code %v", res.StatusCode,
)
}
locURL, err := ParseURL(loc.String())
if err != nil {
return "", nil, fmt.Errorf("parsing redirect URL %v: %w", loc, err)
}
newURL := url.ResolveReference(locURL)
return c.get(ctx, newURL, redirectDepth+1)
case statusCodeCategory == 3: // unsupported redirections
defer res.Body.Close()
return "", emptyReadCloser(), nil
// all other response codes, 4xx and 5xx, are considered errors
default:
defer res.Body.Close()
return "", nil, fmt.Errorf(
"response code %d (%v)", res.StatusCode, res.Status,
)
}
}
func (c *client) noOpGet() (
string, io.ReadCloser, error,
) {
return "", emptyReadCloser(), nil
}
func (c *client) get(
ctx context.Context, url URL, redirectDepth int,
) (
string, io.ReadCloser, error,
) {
scheme := url.toStd().Scheme
switch scheme {
case "gemini":
return c.getGemini(ctx, url, redirectDepth)
case "http", "https":
return c.getHTTP(ctx, url, redirectDepth)
case "mailto", "data":
return c.noOpGet()
default:
return "", nil, fmt.Errorf("unsupported scheme %q", scheme)
}
}
func (c *client) Get(
ctx context.Context, url URL,
) (
string, io.ReadCloser, error,
) {
return c.get(ctx, url, 0)
}