zenfeed/pkg/util/crawl/crawl.go

package crawl

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"sync"

	"github.com/pkg/errors"
	"github.com/temoto/robotstxt"

	"github.com/glidea/zenfeed/pkg/util/text_convert"
)
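
// Crawler fetches a web page and returns its content rendered as Markdown.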
type Crawler interface {
	Markdown(ctx context.Context, u string) ([]byte, error)
}
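
// local is a Crawler that downloads pages itself with the standard
// net/http client and converts the HTML locally. It honors each host's
// robots.txt, caching the parsed rules per host.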
type local struct {
	hc              *http.Client
	robotsDataCache sync.Map
}
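
// NewLocal returns a Crawler that fetches and converts pages locally.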
func NewLocal() Crawler {
	return &local{
		hc: &http.Client{},
	}
}
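
// Markdown fetches u and converts the HTML response body to Markdown.
// The fetch is refused if the host's robots.txt disallows the page for
// the ZenFeed user agent.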
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
	// Check if the page is allowed.
	if err := c.checkAllowed(ctx, u); err != nil {
		return nil, errors.Wrapf(err, "check robots.txt for %s", u)
	}

	// Prepare the request.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}
	req.Header.Set("User-Agent", userAgent)

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", u)
	}
	defer func() { _ = resp.Body.Close() }()

	// Parse the response.
	if resp.StatusCode != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, u)
	}
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", u)
	}

	// Convert the body to markdown.
	mdBytes, err := textconvert.HTMLToMarkdown(bodyBytes)
	if err != nil {
		return nil, errors.Wrap(err, "convert html to markdown")
	}

	return mdBytes, nil
}
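
// userAgent identifies the crawler in outgoing requests and is the agent
// name tested against robots.txt rules.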
const userAgent = "ZenFeed"
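
// checkAllowed returns an error if fetching u is disallowed by the host's
// robots.txt.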
func (c *local) checkAllowed(ctx context.Context, u string) error {
	parsedURL, err := url.Parse(u)
	if err != nil {
		return errors.Wrapf(err, "parse url %s", u)
	}

	d, err := c.getRobotsData(ctx, parsedURL.Host)
	if err != nil {
		return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
	}

	if !d.TestAgent(parsedURL.Path, userAgent) {
		return errors.Errorf("disallowed by robots.txt for %s", u)
	}

	return nil
}

// getRobotsData fetches and parses robots.txt for a given host, caching the
// result per host. A missing robots.txt (404) is treated as imposing no
// restrictions.
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
	// Check the cache.
	if data, found := c.robotsDataCache.Load(host); found {
		return data.(*robotstxt.RobotsData), nil
	}

	// Prepare the request.
	robotsURL := fmt.Sprintf("https://%s/robots.txt", host)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", robotsURL)
	}
	req.Header.Set("User-Agent", userAgent)

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", robotsURL)
	}
	defer func() { _ = resp.Body.Close() }()

	// Parse the response.
	switch resp.StatusCode {
	case http.StatusOK:
		data, err := robotstxt.FromResponse(resp)
		if err != nil {
			return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
		}
		c.robotsDataCache.Store(host, data)

		return data, nil

	case http.StatusNotFound:
		data := &robotstxt.RobotsData{}
		c.robotsDataCache.Store(host, data)

		return data, nil

	case http.StatusUnauthorized, http.StatusForbidden:
		return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)

	default:
		return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
	}
}
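
// jina is a Crawler that delegates fetching and Markdown conversion to the
// Jina Reader service (https://r.jina.ai/).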
type jina struct {
	hc    *http.Client
	token string
}
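
// NewJina returns a Crawler backed by the Jina Reader API. The token is
// optional; see the rate-limit note below.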
func NewJina(token string) Crawler {
	return &jina{
		hc: &http.Client{},
		// An empty token still works, but the rate limit is lower.
		// See https://jina.ai/api-dashboard/rate-limit.
		token: token,
	}
}
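
// Markdown fetches u through the Jina Reader proxy, which returns the page
// already rendered as Markdown.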
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
	// Prepare the request to the Jina Reader proxy.
	proxyURL := "https://r.jina.ai/" + u
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}
	req.Header.Set("X-Engine", "browser")
	req.Header.Set("X-Robots-Txt", userAgent)
	if c.token != "" {
		req.Header.Set("Authorization", "Bearer "+c.token)
	}

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", proxyURL)
	}
	defer func() { _ = resp.Body.Close() }()

	// Parse the response.
	if resp.StatusCode != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, proxyURL)
	}
	mdBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", proxyURL)
	}

	return mdBytes, nil
}
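
// Usage sketch (assumes this package is imported via its repository path,
// github.com/glidea/zenfeed/pkg/util/crawl, inferred from the file location;
// error handling is elided):
//
//	c := crawl.NewLocal() // or crawl.NewJina(token) to route through the Jina Reader API
//	md, err := c.Markdown(context.Background(), "https://example.com/article")
//	if err != nil {
//		// handle robots.txt denial, fetch failure, or conversion error
//	}
//	fmt.Println(string(md)) // Markdown rendering of the page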