add rss & crawl & webhook

This commit is contained in:
glidea
2025-06-05 23:29:37 +08:00
parent ead8286a48
commit d520444e9f
43 changed files with 1757 additions and 703 deletions

176
pkg/util/crawl/crawl.go Normal file
View File

@@ -0,0 +1,176 @@
package crawl
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"sync"
"github.com/pkg/errors"
"github.com/temoto/robotstxt"
"github.com/glidea/zenfeed/pkg/util/text_convert"
)
// Crawler fetches the page behind a URL and returns its content as Markdown.
type Crawler interface {
	// Markdown retrieves the page at u and returns it as Markdown bytes.
	// ctx is attached to the outgoing HTTP request(s), so cancelling it
	// aborts the crawl.
	Markdown(ctx context.Context, u string) ([]byte, error)
}
// local is a Crawler that downloads pages itself: it consults the site's
// robots.txt, fetches the HTML and converts it to Markdown in-process.
type local struct {
	// hc is the HTTP client used for both page and robots.txt requests.
	hc *http.Client
	// robotsDataCache memoizes parsed robots.txt results
	// (*robotstxt.RobotsData values keyed by a string identifying the site),
	// so robots.txt is not re-fetched on every crawl.
	robotsDataCache sync.Map
}
// NewLocal returns a Crawler that fetches pages directly from their origin.
//
// The underlying HTTP client carries no client-level timeout; the lifetime of
// a request is bounded only by the context passed to Markdown.
func NewLocal() Crawler {
	crawler := new(local)
	crawler.hc = &http.Client{}

	return crawler
}
// Markdown downloads the page at u and returns it converted to Markdown.
//
// The crawl is refused with an error when checkAllowed rejects u. Any response
// status other than 200 OK is reported as an error, as are failures to build
// the request, perform it, read the body, or convert the HTML.
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
	// Refuse early if robots.txt does not let us fetch this page.
	allowErr := c.checkAllowed(ctx, u)
	if allowErr != nil {
		return nil, errors.Wrapf(allowErr, "check robots.txt for %s", u)
	}

	// Build the GET request, identifying ourselves with the crawler's agent.
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}
	request.Header.Set("User-Agent", userAgent)

	// Perform the request.
	response, err := c.hc.Do(request)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", u)
	}
	defer response.Body.Close()

	// Only a plain 200 is accepted as a crawlable page.
	if status := response.StatusCode; status != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", status, u)
	}

	page, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", u)
	}

	// Turn the raw HTML into Markdown.
	markdown, err := textconvert.HTMLToMarkdown(page)
	if err != nil {
		return nil, errors.Wrap(err, "convert html to markdown")
	}

	return markdown, nil
}
// userAgent is the name this crawler identifies itself with: it is sent as
// the User-Agent request header, used as the agent tested against robots.txt
// rules, and passed to Jina in the X-Robots-Txt header.
const userAgent = "ZenFeed"
// checkAllowed verifies that the robots.txt of the site hosting u permits
// userAgent to fetch u.
//
// It returns nil when crawling is allowed. It returns an error when u cannot
// be parsed, is not an absolute http(s) URL, robots.txt cannot be obtained,
// or the rules disallow the path.
func (c *local) checkAllowed(ctx context.Context, u string) error {
	parsedURL, err := url.Parse(u)
	if err != nil {
		return errors.Wrapf(err, "parse url %s", u)
	}
	// url.Parse lower-cases the scheme, so a plain comparison is sufficient.
	if parsedURL.Scheme != "http" && parsedURL.Scheme != "https" {
		return errors.Errorf("unsupported scheme %q in url %s", parsedURL.Scheme, u)
	}
	if parsedURL.Host == "" {
		return errors.Errorf("missing host in url %s", u)
	}

	// robots.txt is scoped to scheme + authority, so it must be looked up on
	// the same origin as the page rather than always over HTTPS.
	d, err := c.robotsDataFor(ctx, parsedURL.Scheme, parsedURL.Host)
	if err != nil {
		return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
	}

	// An empty path is requested as "/" on the wire, so test it as such.
	path := parsedURL.Path
	if path == "" {
		path = "/"
	}
	if !d.TestAgent(path, userAgent) {
		return errors.Errorf("disallowed by robots.txt for %s", u)
	}

	return nil
}

// getRobotsData fetches and parses robots.txt for a given host over HTTPS.
//
// It is retained for callers that only know the host; checkAllowed uses
// robotsDataFor so that the scheme of the crawled URL is respected.
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
	return c.robotsDataFor(ctx, "https", host)
}

// robotsDataFor fetches and parses <scheme>://<host>/robots.txt, caching the
// result per origin.
//
// Status handling:
//   - 200: the body is parsed and the result cached.
//   - 404: an empty rule set (nothing to disallow) is cached.
//   - 401/403 and any other status: an error is returned and nothing is
//     cached, so the next call retries.
func (c *local) robotsDataFor(ctx context.Context, scheme, host string) (*robotstxt.RobotsData, error) {
	// Key the cache by origin: http and https variants of a host may serve
	// different robots.txt files.
	origin := scheme + "://" + host
	if data, found := c.robotsDataCache.Load(origin); found {
		return data.(*robotstxt.RobotsData), nil
	}

	// Prepare the request.
	robotsURL := fmt.Sprintf("%s/robots.txt", origin)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", robotsURL)
	}
	req.Header.Set("User-Agent", userAgent)

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", robotsURL)
	}
	defer resp.Body.Close()

	// Parse the response.
	switch resp.StatusCode {
	case http.StatusOK:
		data, err := robotstxt.FromResponse(resp)
		if err != nil {
			return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
		}
		c.robotsDataCache.Store(origin, data)

		return data, nil

	case http.StatusNotFound:
		// No robots.txt published: remember an empty rule set.
		data := &robotstxt.RobotsData{}
		c.robotsDataCache.Store(origin, data)

		return data, nil

	case http.StatusUnauthorized, http.StatusForbidden:
		return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)

	default:
		return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
	}
}
// jina is a Crawler that delegates fetching and Markdown conversion to the
// remote reader endpoint at https://r.jina.ai/.
type jina struct {
	// hc is the HTTP client used to call the reader endpoint.
	hc *http.Client
	// token is sent as a Bearer credential when non-empty; an empty token
	// means requests are made without an Authorization header.
	token string
}
// NewJina returns a Crawler backed by the Jina reader endpoint.
//
// token is optional. If it is empty the crawler still works, but the rate
// limit will be lower. See https://jina.ai/api-dashboard/rate-limit.
func NewJina(token string) Crawler {
	crawler := new(jina)
	crawler.hc = &http.Client{}
	crawler.token = token

	return crawler
}
// Markdown asks the Jina reader endpoint to fetch u and returns the response
// body, which the endpoint serves as Markdown, unchanged.
//
// Any response status other than 200 OK is reported as an error, as are
// failures to build the request, perform it, or read the body.
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
	// The reader takes the target URL appended to its base address.
	endpoint := fmt.Sprintf("https://r.jina.ai/%s", u)

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}

	headers := request.Header
	headers.Set("X-Engine", "browser")
	// NOTE(review): presumably tells the reader which agent name to check
	// against the target's robots.txt — confirm against the Jina docs.
	headers.Set("X-Robots-Txt", userAgent)
	if c.token != "" {
		// Authenticate only when a token was configured.
		headers.Set("Authorization", fmt.Sprintf("Bearer %s", c.token))
	}

	response, err := c.hc.Do(request)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", endpoint)
	}
	defer response.Body.Close()

	if status := response.StatusCode; status != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", status, endpoint)
	}

	markdown, err := io.ReadAll(response.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", endpoint)
	}

	return markdown, nil
}

View File

@@ -13,39 +13,19 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"context"
"encoding/json"
"errors"
"net/http"
"github.com/glidea/zenfeed/pkg/api"
)
// Handler is a typed request handler: it receives a context and a pointer to
// a Request and returns a pointer to a Response or an error. API adapts a
// Handler into an http.Handler.
type Handler[Request any, Response any] func(ctx context.Context, req *Request) (*Response, error)
// Constructors that wrap an underlying error into an Error carrying the
// corresponding HTTP status code. err must be non-nil, since its message is
// read via err.Error().
var (
	// ErrBadRequest builds an Error with http.StatusBadRequest.
	ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
	// ErrNotFound builds an Error with http.StatusNotFound.
	ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
	// ErrInternal builds an Error with http.StatusInternalServerError.
	ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
)
// Error is an error value that pairs a numeric code with a message and
// serialises to JSON as {"code": ..., "message": ...}.
type Error struct {
	// Code is the numeric status associated with the failure.
	Code int `json:"code"`
	// Message is the human-readable description of the failure.
	Message string `json:"message"`
}

// Error implements the error interface by returning the message verbatim.
func (e Error) Error() string { return e.Message }

// newError builds an Error from code and the message of err.
// err must be non-nil: its Error method is called unconditionally.
func newError(code int, err error) Error {
	var out Error
	out.Code, out.Message = code, err.Error()

	return out
}
func API[Request any, Response any](handler Handler[Request, Response]) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
allowCORS(w)
@@ -65,11 +45,11 @@ func API[Request any, Response any](handler Handler[Request, Response]) http.Han
resp, err := handler(r.Context(), &req)
if err != nil {
var rpcErr Error
if errors.As(err, &rpcErr) {
var apiErr api.Error
if errors.As(err, &apiErr) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(rpcErr.Code)
_ = json.NewEncoder(w).Encode(rpcErr)
w.WriteHeader(apiErr.Code)
_ = json.NewEncoder(w).Encode(apiErr)
return
}

View File

@@ -13,7 +13,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"bytes"
@@ -27,6 +27,7 @@ import (
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/api"
"github.com/glidea/zenfeed/pkg/test"
)
@@ -58,15 +59,15 @@ func TestAPI(t *testing.T) {
}
badRequestHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrBadRequest(errors.New("invalid request"))
return nil, api.ErrBadRequest(errors.New("invalid request"))
}
notFoundHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrNotFound(errors.New("resource not found"))
return nil, api.ErrNotFound(errors.New("resource not found"))
}
internalErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrInternal(errors.New("server error"))
return nil, api.ErrInternal(errors.New("server error"))
}
genericErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {