181 lines
4.4 KiB
Go
181 lines
4.4 KiB
Go
package crawl
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"sync"
|
|
|
|
"github.com/pkg/errors"
|
|
"github.com/temoto/robotstxt"
|
|
|
|
"github.com/glidea/zenfeed/pkg/util/text_convert"
|
|
)
|
|
|
|
// Crawler turns the web page behind a URL into Markdown.
type Crawler interface {
	// Markdown fetches the page at u and returns its content as Markdown
	// bytes. The fetch is bound to ctx.
	Markdown(ctx context.Context, u string) ([]byte, error)
}
|
|
|
|
// local is a Crawler that downloads pages itself and converts the HTML to
// Markdown in-process.
type local struct {
	// hc issues every outbound request: the page itself and robots.txt.
	hc *http.Client

	// robotsDataCache maps a host name to its parsed *robotstxt.RobotsData so
	// that a host whose rules have been resolved is not fetched again.
	robotsDataCache sync.Map
}
|
|
|
|
func NewLocal() Crawler {
|
|
return &local{
|
|
hc: &http.Client{},
|
|
}
|
|
}
|
|
|
|
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
|
|
// Check if the page is allowed.
|
|
if err := c.checkAllowed(ctx, u); err != nil {
|
|
return nil, errors.Wrapf(err, "check robots.txt for %s", u)
|
|
}
|
|
|
|
// Prepare the request.
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "create request for %s", u)
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
// Send the request.
|
|
resp, err := c.hc.Do(req)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "fetch %s", u)
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
|
|
// Parse the response.
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, u)
|
|
}
|
|
bodyBytes, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "read body from %s", u)
|
|
}
|
|
|
|
// Convert the body to markdown.
|
|
mdBytes, err := textconvert.HTMLToMarkdown(bodyBytes)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "convert html to markdown")
|
|
}
|
|
|
|
return mdBytes, nil
|
|
}
|
|
|
|
// userAgent is the name this crawler identifies itself with. It is sent as
// the User-Agent header on direct requests, used as the agent name when
// evaluating robots.txt rules, and passed to Jina in the X-Robots-Txt header.
const userAgent = "ZenFeed"
|
|
|
|
func (c *local) checkAllowed(ctx context.Context, u string) error {
|
|
parsedURL, err := url.Parse(u)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "parse url %s", u)
|
|
}
|
|
|
|
d, err := c.getRobotsData(ctx, parsedURL.Host)
|
|
if err != nil {
|
|
return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
|
|
}
|
|
if !d.TestAgent(parsedURL.Path, userAgent) {
|
|
return errors.Errorf("disallowed by robots.txt for %s", u)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// getRobotsData fetches and parses robots.txt for a given host.
|
|
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
|
|
// Check the cache.
|
|
if data, found := c.robotsDataCache.Load(host); found {
|
|
return data.(*robotstxt.RobotsData), nil
|
|
}
|
|
|
|
// Prepare the request.
|
|
robotsURL := fmt.Sprintf("https://%s/robots.txt", host)
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "create request for %s", robotsURL)
|
|
}
|
|
req.Header.Set("User-Agent", userAgent)
|
|
|
|
// Send the request.
|
|
resp, err := c.hc.Do(req)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "fetch %s", robotsURL)
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
|
|
// Parse the response.
|
|
switch resp.StatusCode {
|
|
case http.StatusOK:
|
|
data, err := robotstxt.FromResponse(resp)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
|
|
}
|
|
c.robotsDataCache.Store(host, data)
|
|
|
|
return data, nil
|
|
|
|
case http.StatusNotFound:
|
|
data := &robotstxt.RobotsData{}
|
|
c.robotsDataCache.Store(host, data)
|
|
|
|
return data, nil
|
|
|
|
case http.StatusUnauthorized, http.StatusForbidden:
|
|
return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)
|
|
default:
|
|
return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
|
|
}
|
|
}
|
|
|
|
// jina is a Crawler that delegates fetching and Markdown conversion to the
// Jina reader endpoint at r.jina.ai.
type jina struct {
	// hc issues the requests to the reader endpoint.
	hc *http.Client

	// token is an optional API token; when non-empty it is sent as a Bearer
	// Authorization header.
	token string
}
|
|
|
|
func NewJina(token string) Crawler {
|
|
return &jina{
|
|
hc: &http.Client{},
|
|
|
|
// If token is empty, will not affect to use, but rate limit will be lower.
|
|
// See https://jina.ai/api-dashboard/rate-limit.
|
|
token: token,
|
|
}
|
|
}
|
|
|
|
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
|
|
proxyURL := "https://r.jina.ai/" + u
|
|
req, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "create request for %s", u)
|
|
}
|
|
|
|
req.Header.Set("X-Engine", "browser")
|
|
req.Header.Set("X-Robots-Txt", userAgent)
|
|
if c.token != "" {
|
|
req.Header.Set("Authorization", "Bearer "+c.token)
|
|
}
|
|
|
|
resp, err := c.hc.Do(req)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "fetch %s", proxyURL)
|
|
}
|
|
defer func() { _ = resp.Body.Close() }()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, proxyURL)
|
|
}
|
|
|
|
mdBytes, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "read body from %s", proxyURL)
|
|
}
|
|
|
|
return mdBytes, nil
|
|
}
|