add rss & crawl & webhook
This commit is contained in:
176
pkg/util/crawl/crawl.go
Normal file
176
pkg/util/crawl/crawl.go
Normal file
@@ -0,0 +1,176 @@
|
||||
package crawl
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"sync"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/temoto/robotstxt"
|
||||
|
||||
"github.com/glidea/zenfeed/pkg/util/text_convert"
|
||||
)
|
||||
|
||||
// Crawler fetches a web page and returns its content as markdown.
type Crawler interface {
	// Markdown retrieves the page located at u and returns it rendered as
	// markdown bytes. ctx bounds the whole operation.
	Markdown(ctx context.Context, u string) ([]byte, error)
}
|
||||
|
||||
// local is a Crawler that downloads pages directly with its own HTTP client
// and consults robots.txt before fetching.
type local struct {
	// hc performs both the page requests and the robots.txt requests.
	hc *http.Client

	// robotsDataCache maps a host to its parsed *robotstxt.RobotsData so that
	// robots.txt is not re-fetched for every page of the same host.
	robotsDataCache sync.Map
}
|
||||
|
||||
func NewLocal() Crawler {
|
||||
return &local{
|
||||
hc: &http.Client{},
|
||||
}
|
||||
}
|
||||
|
||||
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
|
||||
// Check if the page is allowed.
|
||||
if err := c.checkAllowed(ctx, u); err != nil {
|
||||
return nil, errors.Wrapf(err, "check robots.txt for %s", u)
|
||||
}
|
||||
|
||||
// Prepare the request.
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "create request for %s", u)
|
||||
}
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
|
||||
// Send the request.
|
||||
resp, err := c.hc.Do(req)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "fetch %s", u)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Parse the response.
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, u)
|
||||
}
|
||||
bodyBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "read body from %s", u)
|
||||
}
|
||||
|
||||
// Convert the body to markdown.
|
||||
mdBytes, err := textconvert.HTMLToMarkdown(bodyBytes)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "convert html to markdown")
|
||||
}
|
||||
|
||||
return mdBytes, nil
|
||||
}
|
||||
|
||||
// userAgent is the name the crawler announces in the User-Agent header and
// the agent it is matched as when evaluating robots.txt rules.
const userAgent = "ZenFeed"
|
||||
|
||||
func (c *local) checkAllowed(ctx context.Context, u string) error {
|
||||
parsedURL, err := url.Parse(u)
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "parse url %s", u)
|
||||
}
|
||||
|
||||
d, err := c.getRobotsData(ctx, parsedURL.Host)
|
||||
if err != nil {
|
||||
return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
|
||||
}
|
||||
if !d.TestAgent(parsedURL.Path, userAgent) {
|
||||
return errors.Errorf("disallowed by robots.txt for %s", u)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// getRobotsData fetches and parses robots.txt for a given host.
|
||||
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
|
||||
// Check the cache.
|
||||
if data, found := c.robotsDataCache.Load(host); found {
|
||||
return data.(*robotstxt.RobotsData), nil
|
||||
}
|
||||
|
||||
// Prepare the request.
|
||||
robotsURL := fmt.Sprintf("https://%s/robots.txt", host)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "create request for %s", robotsURL)
|
||||
}
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
|
||||
// Send the request.
|
||||
resp, err := c.hc.Do(req)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "fetch %s", robotsURL)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Parse the response.
|
||||
switch resp.StatusCode {
|
||||
case http.StatusOK:
|
||||
data, err := robotstxt.FromResponse(resp)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
|
||||
}
|
||||
c.robotsDataCache.Store(host, data)
|
||||
return data, nil
|
||||
case http.StatusNotFound:
|
||||
data := &robotstxt.RobotsData{}
|
||||
c.robotsDataCache.Store(host, data)
|
||||
return data, nil
|
||||
case http.StatusUnauthorized, http.StatusForbidden:
|
||||
return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)
|
||||
default:
|
||||
return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
|
||||
}
|
||||
}
|
||||
|
||||
// jina is a Crawler that delegates fetching and markdown conversion to the
// Jina reader endpoint (https://r.jina.ai).
type jina struct {
	// hc sends the requests to the reader endpoint.
	hc *http.Client
	// token is the optional bearer token; when empty no Authorization header
	// is sent.
	token string
}
|
||||
|
||||
func NewJina(token string) Crawler {
|
||||
return &jina{
|
||||
hc: &http.Client{},
|
||||
|
||||
// If token is empty, will not affect to use, but rate limit will be lower.
|
||||
// See https://jina.ai/api-dashboard/rate-limit.
|
||||
token: token,
|
||||
}
|
||||
}
|
||||
|
||||
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
|
||||
proxyURL := fmt.Sprintf("https://r.jina.ai/%s", u)
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "create request for %s", u)
|
||||
}
|
||||
|
||||
req.Header.Set("X-Engine", "browser")
|
||||
req.Header.Set("X-Robots-Txt", userAgent)
|
||||
if c.token != "" {
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", c.token))
|
||||
}
|
||||
|
||||
resp, err := c.hc.Do(req)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "fetch %s", proxyURL)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, proxyURL)
|
||||
}
|
||||
|
||||
mdBytes, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "read body from %s", proxyURL)
|
||||
}
|
||||
|
||||
return mdBytes, nil
|
||||
}
|
||||
@@ -13,39 +13,19 @@
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package rpc
|
||||
package jsonrpc
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"net/http"
|
||||
|
||||
"github.com/glidea/zenfeed/pkg/api"
|
||||
)
|
||||
|
||||
// Handler is the typed business-logic function behind an endpoint: it
// receives a pointer to the request value and returns a pointer to the
// response value or an error.
type Handler[Request any, Response any] func(ctx context.Context, req *Request) (*Response, error)
|
||||
|
||||
var (
|
||||
ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
|
||||
ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
|
||||
ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
|
||||
)
|
||||
|
||||
// Error is an error that carries a numeric code alongside its message and
// serializes to JSON as {"code": ..., "message": ...}.
type Error struct {
	// Code is the numeric status associated with the error.
	Code int `json:"code"`
	// Message is the human-readable description; it is also what Error()
	// returns.
	Message string `json:"message"`
}

// Error implements the error interface by returning the message.
func (e Error) Error() string {
	return e.Message
}

// newError builds an Error with the given code and the message of err.
//
// A nil err does not panic: the message falls back to the standard HTTP
// status text for code (which is empty for unknown codes).
func newError(code int, err error) Error {
	msg := http.StatusText(code)
	if err != nil {
		msg = err.Error()
	}

	return Error{
		Code:    code,
		Message: msg,
	}
}
|
||||
|
||||
func API[Request any, Response any](handler Handler[Request, Response]) http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
allowCORS(w)
|
||||
@@ -65,11 +45,11 @@ func API[Request any, Response any](handler Handler[Request, Response]) http.Han
|
||||
|
||||
resp, err := handler(r.Context(), &req)
|
||||
if err != nil {
|
||||
var rpcErr Error
|
||||
if errors.As(err, &rpcErr) {
|
||||
var apiErr api.Error
|
||||
if errors.As(err, &apiErr) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
w.WriteHeader(rpcErr.Code)
|
||||
_ = json.NewEncoder(w).Encode(rpcErr)
|
||||
w.WriteHeader(apiErr.Code)
|
||||
_ = json.NewEncoder(w).Encode(apiErr)
|
||||
|
||||
return
|
||||
}
|
||||
@@ -13,7 +13,7 @@
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package rpc
|
||||
package jsonrpc
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
@@ -27,6 +27,7 @@ import (
|
||||
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/glidea/zenfeed/pkg/api"
|
||||
"github.com/glidea/zenfeed/pkg/test"
|
||||
)
|
||||
|
||||
@@ -58,15 +59,15 @@ func TestAPI(t *testing.T) {
|
||||
}
|
||||
|
||||
badRequestHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
|
||||
return nil, ErrBadRequest(errors.New("invalid request"))
|
||||
return nil, api.ErrBadRequest(errors.New("invalid request"))
|
||||
}
|
||||
|
||||
notFoundHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
|
||||
return nil, ErrNotFound(errors.New("resource not found"))
|
||||
return nil, api.ErrNotFound(errors.New("resource not found"))
|
||||
}
|
||||
|
||||
internalErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
|
||||
return nil, ErrInternal(errors.New("server error"))
|
||||
return nil, api.ErrInternal(errors.New("server error"))
|
||||
}
|
||||
|
||||
genericErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
|
||||
Reference in New Issue
Block a user